diff --git a/Documentation/filesystems/bcachefs/CodingStyle.rst b/Documentation/filesystems/bcachefs/CodingStyle.rst index 0c45829a4899..01de555e21d8 100644 --- a/Documentation/filesystems/bcachefs/CodingStyle.rst +++ b/Documentation/filesystems/bcachefs/CodingStyle.rst @@ -175,7 +175,7 @@ errors in our thinking by running our code and seeing what happens. If your time is being wasted because your tools are bad or too slow - don't accept it, fix it. -Put effort into your documentation, commmit messages, and code comments - but +Put effort into your documentation, commit messages, and code comments - but don't go overboard. A good commit message is wonderful - but if the information was important enough to go in a commit message, ask yourself if it would be even better as a code comment. diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 5cdfef3b551a..5bac803ea367 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -87,6 +87,13 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN is held by another thread, spin for a short while, as long as the thread owning the lock is running. +config BCACHEFS_PATH_TRACEPOINTS + bool "Extra btree_path tracepoints" + depends on BCACHEFS_FS + help + Enable extra tracepoints for debugging btree_path operations; we don't + normally want these enabled because they happen at very high rates. + config MEAN_AND_VARIANCE_UNIT_TEST tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 0ab533a2b03b..56d20e219f59 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -69,6 +69,7 @@ bcachefs-y := \ printbuf.o \ quota.o \ rebalance.o \ + rcu_pending.o \ recovery.o \ recovery_passes.o \ reflink.o \ diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 331a17f3f113..87f1be9d4db4 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -361,7 +361,7 @@ int bch2_set_acl(struct mnt_idmap *idmap, bch2_trans_begin(trans); acl = _acl; - ret = bch2_subvol_is_ro_trans(trans, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro_trans(trans, inode->ei_inum.subvol) ?: bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); if (ret) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index dc3a4024aab6..645b5ed4babb 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -30,6 +30,7 @@ #include #include #include +#include static void bch2_discard_one_bucket_fast(struct bch_dev *, u64); @@ -2183,7 +2184,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, * freespace/need_discard/need_gc_gens btrees as needed: */ while (1) { - if (last_updated + HZ * 10 < jiffies) { + if (time_after(jiffies, last_updated + HZ * 10)) { bch_info(ca, "%s: currently at %llu/%llu", __func__, iter.pos.offset, ca->mi.nbuckets); last_updated = jiffies; @@ -2297,6 +2298,36 @@ int bch2_fs_freespace_init(struct bch_fs *c) return 0; } +/* device removal */ + +int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) +{ + struct bpos start = POS(ca->dev_idx, 0); + struct bpos end = POS(ca->dev_idx, U64_MAX); + int ret; + + /* + * We clear the LRU and need_discard btrees first so that we don't race + * with bch2_do_invalidates() and bch2_do_discards() + */ + ret = bch2_dev_remove_stripes(c, ca->dev_idx) ?: + bch2_btree_delete_range(c, BTREE_ID_lru, start, end, + BTREE_TRIGGER_norun, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, + BTREE_TRIGGER_norun, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, + BTREE_TRIGGER_norun, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, + BTREE_TRIGGER_norun, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, + BTREE_TRIGGER_norun, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, + BTREE_TRIGGER_norun, NULL) ?: + bch2_dev_usage_remove(c, ca->dev_idx); + bch_err_msg(ca, ret, "removing dev alloc info"); + return ret; +} + /* Bucket IO clocks: */ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, @@ -2432,13 +2463,15 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) /* device goes ro: */ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) { - unsigned i; + lockdep_assert_held(&c->state_lock); /* First, remove device from allocation groups: */ - for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) + for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) clear_bit(ca->dev_idx, c->rw_devs[i].d); + c->rw_devs_change_count++; + /* * Capacity is calculated based off of devices in allocation groups: */ @@ -2467,11 +2500,13 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) /* device goes rw: */ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) { - unsigned i; + lockdep_assert_held(&c->state_lock); - for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) + for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) if (ca->mi.data_allowed & (1 << i)) set_bit(ca->dev_idx, c->rw_devs[i].d); + + c->rw_devs_change_count++; } void bch2_dev_allocator_background_exit(struct bch_dev *ca) diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index fd790b03fbe1..f8e87c6721b1 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -16,7 +16,7 @@ enum bch_validate_flags; static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) { rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, pos.inode); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); bool ret = ca && bucket_valid(ca, pos.offset); rcu_read_unlock(); return ret; @@ -338,6 +338,7 @@ static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64); int bch2_fs_freespace_init(struct bch_fs *); +int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *); void bch2_recalc_capacity(struct bch_fs *); u64 bch2_min_rw_member_capacity(struct bch_fs *); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 8563c2d26847..d0e0b56892e3 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -600,6 +600,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, enum bch_watermark watermark, enum bch_data_type data_type, struct closure *cl, + bool nowait, struct bch_dev_usage *usage) { struct bch_fs *c = trans->c; @@ -609,7 +610,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct bucket_alloc_state s = { .btree_bitmap = data_type == BCH_DATA_btree, }; - bool waiting = false; + bool waiting = nowait; again: bch2_dev_usage_read_fast(ca, usage); avail = dev_buckets_free(ca, *usage, watermark); @@ -685,7 +686,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, bch2_trans_do(c, NULL, NULL, 0, PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, - data_type, cl, &usage))); + data_type, cl, false, &usage))); return ob; } @@ -748,7 +749,6 @@ static int add_new_bucket(struct bch_fs *c, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, - unsigned flags, struct open_bucket *ob) { unsigned durability = ob_dev(c, ob)->mi.durability; @@ -775,7 +775,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, - unsigned flags, + enum bch_write_flags flags, enum bch_data_type data_type, enum bch_watermark watermark, struct closure *cl) @@ -801,7 +801,8 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, cl, &usage); + ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, + cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); bch2_dev_put(ca); @@ -815,7 +816,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, if (add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, - have_cache, flags, ob)) { + have_cache, ob)) { ret = 0; break; } @@ -841,7 +842,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, unsigned *nr_effective, bool *have_cache, enum bch_watermark watermark, - unsigned flags, + enum bch_write_flags flags, struct closure *cl) { struct bch_fs *c = trans->c; @@ -883,7 +884,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, ret = add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, - have_cache, flags, ob); + have_cache, ob); out_put_head: bch2_ec_stripe_head_put(c, h); return ret; @@ -922,7 +923,7 @@ static int bucket_alloc_set_writepoint(struct bch_fs *c, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, - bool ec, unsigned flags) + bool ec) { struct open_buckets ptrs_skip = { .nr = 0 }; struct open_bucket *ob; @@ -934,7 +935,7 @@ static int bucket_alloc_set_writepoint(struct bch_fs *c, have_cache, ec, ob)) ret = add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, - have_cache, flags, ob); + have_cache, ob); else ob_push(c, &ptrs_skip, ob); } @@ -950,8 +951,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, bool ec, - enum bch_watermark watermark, - unsigned flags) + enum bch_watermark watermark) { int i, ret = 0; @@ -983,7 +983,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c, ret = add_new_bucket(c, ptrs, devs_may_alloc, nr_replicas, nr_effective, - have_cache, flags, ob); + have_cache, ob); if (ret) break; } @@ -1003,7 +1003,7 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, unsigned *nr_effective, bool *have_cache, enum bch_watermark watermark, - unsigned flags, + enum bch_write_flags flags, struct closure *_cl) { struct bch_fs *c = trans->c; @@ -1022,18 +1022,15 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, open_bucket_for_each(c, ptrs, ob, i) __clear_bit(ob->dev, devs.d); - if (erasure_code && ec_open_bucket(c, ptrs)) - return 0; - ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs, nr_replicas, nr_effective, - have_cache, erasure_code, flags); + have_cache, erasure_code); if (ret) return ret; ret = bucket_alloc_set_partial(c, ptrs, wp, &devs, nr_replicas, nr_effective, - have_cache, erasure_code, watermark, flags); + have_cache, erasure_code, watermark); if (ret) return ret; @@ -1074,12 +1071,12 @@ static int open_bucket_add_buckets(struct btree_trans *trans, unsigned *nr_effective, bool *have_cache, enum bch_watermark watermark, - unsigned flags, + enum bch_write_flags flags, struct closure *cl) { int ret; - if (erasure_code) { + if (erasure_code && !ec_open_bucket(trans->c, ptrs)) { ret = __open_bucket_add_buckets(trans, ptrs, wp, devs_have, target, erasure_code, nr_replicas, nr_effective, have_cache, @@ -1376,7 +1373,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, unsigned nr_replicas, unsigned nr_replicas_required, enum bch_watermark watermark, - unsigned flags, + enum bch_write_flags flags, struct closure *cl, struct write_point **wp_ret) { @@ -1392,8 +1389,6 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) erasure_code = false; - BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); - BUG_ON(!nr_replicas || !nr_replicas_required); retry: ptrs.nr = 0; @@ -1498,11 +1493,12 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, try_decrease_writepoints(trans, write_points_nr)) goto retry; - if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || + if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + ret = -BCH_ERR_bucket_alloc_blocked; + + if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) && bch2_err_matches(ret, BCH_ERR_freelist_empty)) - return cl - ? -BCH_ERR_bucket_alloc_blocked - : -BCH_ERR_ENOSPC_bucket_alloc; + ret = -BCH_ERR_bucket_alloc_blocked; return ret; } @@ -1733,13 +1729,6 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) nr[c->open_buckets[i].data_type]++; - printbuf_tabstops_reset(out); - printbuf_tabstop_push(out, 12); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - printbuf_tabstop_push(out, 16); - bch2_dev_usage_to_text(out, ca, &stats); prt_newline(out); diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 386d231ceca3..1a16fd5bd4f8 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -155,9 +155,10 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 return ret; } +enum bch_write_flags; int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *, struct dev_stripe_state *, struct bch_devs_mask *, - unsigned, unsigned *, bool *, unsigned, + unsigned, unsigned *, bool *, enum bch_write_flags, enum bch_data_type, enum bch_watermark, struct closure *); @@ -167,7 +168,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *, struct bch_devs_list *, unsigned, unsigned, enum bch_watermark, - unsigned, + enum bch_write_flags, struct closure *, struct write_point **); diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index d4da6343efa9..e11989a57ca0 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -9,6 +9,7 @@ #include "btree_update_interior.h" #include "btree_write_buffer.h" #include "checksum.h" +#include "disk_accounting.h" #include "error.h" #include @@ -53,7 +54,7 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, bp.k->p.inode); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); if (!ca) { /* these will be caught by fsck */ rcu_read_unlock(); @@ -87,7 +88,7 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, k.k->p.inode); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); if (ca) { struct bpos bucket = bp_pos_to_bucket(ca, k.k->p); rcu_read_unlock(); @@ -671,7 +672,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, continue; rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); if (ca) bch2_extent_ptr_to_bp(c, ca, btree, level, k, p, entry, &bucket_pos, &bp); rcu_read_unlock(); @@ -750,10 +751,12 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, s64 mem_may_pin = mem_may_pin_bytes(c); int ret = 0; + bch2_btree_cache_unpin(c); + btree_interior_mask |= btree_leaf_mask; - c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask; - c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask; + c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask; + c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask; c->btree_cache.pinned_nodes_start = start; c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX; @@ -775,6 +778,7 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, BBPOS(btree, b->key.k.p); break; } + bch2_node_pin(c, b); 0; })); } @@ -782,12 +786,80 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, return ret; } +struct progress_indicator_state { + unsigned long next_print; + u64 nodes_seen; + u64 nodes_total; + struct btree *last_node; +}; + +static inline void progress_init(struct progress_indicator_state *s, + struct bch_fs *c, + u64 btree_id_mask) +{ + memset(s, 0, sizeof(*s)); + + s->next_print = jiffies + HZ * 10; + + for (unsigned i = 0; i < BTREE_ID_NR; i++) { + if (!(btree_id_mask & BIT_ULL(i))) + continue; + + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_btree, + .btree.id = i, + }; + + u64 v; + bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); + s->nodes_total += div64_ul(v, btree_sectors(c)); + } +} + +static inline bool progress_update_p(struct progress_indicator_state *s) +{ + bool ret = time_after_eq(jiffies, s->next_print); + + if (ret) + s->next_print = jiffies + HZ * 10; + return ret; +} + +static void progress_update_iter(struct btree_trans *trans, + struct progress_indicator_state *s, + struct btree_iter *iter, + const char *msg) +{ + struct bch_fs *c = trans->c; + struct btree *b = path_l(btree_iter_path(trans, iter))->b; + + s->nodes_seen += b != s->last_node; + s->last_node = b; + + if (progress_update_p(s)) { + struct printbuf buf = PRINTBUF; + unsigned percent = s->nodes_total + ? div64_u64(s->nodes_seen * 100, s->nodes_total) + : 0; + + prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", + msg, percent, s->nodes_seen, s->nodes_total); + bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); + + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + } +} + static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, struct extents_to_bp_state *s) { struct bch_fs *c = trans->c; + struct progress_indicator_state progress; int ret = 0; + progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); + for (enum btree_id btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { @@ -805,6 +877,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, BTREE_ITER_prefetch); ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); check_extent_to_backpointers(trans, s, btree_id, level, k) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); })); @@ -865,8 +938,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); - c->btree_cache.pinned_nodes_leaf_mask = 0; - c->btree_cache.pinned_nodes_interior_mask = 0; + bch2_btree_cache_unpin(c); bch_err_fn(c, ret); return ret; @@ -920,19 +992,24 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct bbpos start, struct bbpos end) { + struct bch_fs *c = trans->c; struct bkey_buf last_flushed; + struct progress_indicator_state progress; bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); + progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - check_one_backpointer(trans, start, end, - bkey_s_c_to_backpointer(k), - &last_flushed)); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); + check_one_backpointer(trans, start, end, + bkey_s_c_to_backpointer(k), + &last_flushed); + })); - bch2_bkey_buf_exit(&last_flushed, trans->c); + bch2_bkey_buf_exit(&last_flushed, c); return ret; } @@ -977,8 +1054,7 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) } bch2_trans_put(trans); - c->btree_cache.pinned_nodes_leaf_mask = 0; - c->btree_cache.pinned_nodes_interior_mask = 0; + bch2_btree_cache_unpin(c); bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 7daecadb764e..3b29fdf519dd 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -134,26 +134,35 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, } } +static inline void __bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p, + const union bch_extent_entry *entry, + struct bpos *bucket_pos, struct bch_backpointer *bp, + u64 sectors) +{ + u32 bucket_offset; + *bucket_pos = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset); + *bp = (struct bch_backpointer) { + .btree_id = btree_id, + .level = level, + .data_type = bch2_bkey_ptr_data_type(k, p, entry), + .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + + p.crc.offset, + .bucket_len = sectors, + .pos = k.k->p, + }; +} + static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, struct bpos *bucket_pos, struct bch_backpointer *bp) { - enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); - s64 sectors = level ? btree_sectors(c) : k.k->size; - u32 bucket_offset; + u64 sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p); - *bucket_pos = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset); - *bp = (struct bch_backpointer) { - .btree_id = btree_id, - .level = level, - .data_type = data_type, - .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + - p.crc.offset, - .bucket_len = ptr_disk_sectors(sectors, p), - .pos = k.k->p, - }; + __bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, bucket_pos, bp, sectors); } int bch2_get_next_backpointer(struct btree_trans *, struct bch_dev *ca, struct bpos, int, diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 0c7086e00d18..c711d4c27a03 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -542,7 +542,7 @@ struct bch_dev { * gc_gens_lock, for device resize - holding any is sufficient for * access: Or rcu_read_lock(), but only for dev_ptr_stale(): */ - struct bucket_array __rcu *buckets_gc; + GENRADIX(struct bucket) buckets_gc; struct bucket_gens __rcu *bucket_gens; u8 *oldest_gen; unsigned long *buckets_nouse; @@ -871,6 +871,7 @@ struct bch_fs { /* ALLOCATION */ struct bch_devs_mask rw_devs[BCH_DATA_NR]; + unsigned long rw_devs_change_count; u64 capacity; /* sectors */ u64 reserved; /* sectors */ @@ -1023,6 +1024,7 @@ struct bch_fs { /* fs.c */ struct list_head vfs_inodes_list; struct mutex vfs_inodes_lock; + struct rhashtable vfs_inodes_table; /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; @@ -1044,8 +1046,6 @@ struct bch_fs { * for signaling to the toplevel code which pass we want to run now. */ enum bch_recovery_pass curr_recovery_pass; - /* bitmap of explicitly enabled recovery passes: */ - u64 recovery_passes_explicit; /* bitmask of recovery passes that we actually ran */ u64 recovery_passes_complete; /* never rewinds version of curr_recovery_pass */ @@ -1085,7 +1085,6 @@ struct bch_fs { u64 __percpu *counters; unsigned copy_gc_enabled:1; - bool promote_whole_extents; struct bch2_time_stats times[BCH_TIME_STAT_NR]; @@ -1195,12 +1194,15 @@ static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) { struct timespec64 t; + s64 sec; s32 rem; time += c->sb.time_base_lo; - t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem); - t.tv_nsec = rem * c->sb.nsec_per_time_unit; + sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem); + + set_normalized_timespec64(&t, sec, rem * (s64)c->sb.nsec_per_time_unit); + return t; } diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 14ce726bf5a3..8c4addddd07e 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -795,6 +795,8 @@ LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62); LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); +LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS, + struct bch_sb, flags[0], 63, 64); LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8); diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 575e1d0b6eeb..d1f6092624d8 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -304,11 +304,6 @@ struct bkey_float { }; #define BKEY_MANTISSA_BITS 16 -static unsigned bkey_float_byte_offset(unsigned idx) -{ - return idx * sizeof(struct bkey_float); -} - struct ro_aux_tree { u8 nothing[0]; struct bkey_float f[]; @@ -328,8 +323,7 @@ static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) return t->aux_data_offset; case BSET_RO_AUX_TREE: return t->aux_data_offset + - DIV_ROUND_UP(t->size * sizeof(struct bkey_float) + - t->size * sizeof(u8), 8); + DIV_ROUND_UP(t->size * sizeof(struct bkey_float), 8); case BSET_RW_AUX_TREE: return t->aux_data_offset + DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); @@ -360,14 +354,6 @@ static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, return __aux_tree_base(b, t); } -static u8 *ro_aux_tree_prev(const struct btree *b, - const struct bset_tree *t) -{ - EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); - - return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); -} - static struct bkey_float *bkey_float(const struct btree *b, const struct bset_tree *t, unsigned idx) @@ -479,15 +465,6 @@ static inline struct bkey_packed *tree_to_bkey(const struct btree *b, bkey_float(b, t, j)->key_offset); } -static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, - const struct bset_tree *t, - unsigned j) -{ - unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; - - return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s); -} - static struct rw_aux_tree *rw_aux_tree(const struct btree *b, const struct bset_tree *t) { @@ -585,8 +562,7 @@ static unsigned rw_aux_tree_bsearch(struct btree *b, } static inline unsigned bkey_mantissa(const struct bkey_packed *k, - const struct bkey_float *f, - unsigned idx) + const struct bkey_float *f) { u64 v; @@ -617,7 +593,7 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, struct bkey_packed *m = tree_to_bkey(b, t, j); struct bkey_packed *l = is_power_of_2(j) ? min_key - : tree_to_prev_bkey(b, t, j >> ffs(j)); + : tree_to_bkey(b, t, j >> ffs(j)); struct bkey_packed *r = is_power_of_2(j + 1) ? max_key : tree_to_bkey(b, t, j >> (ffz(j) + 1)); @@ -668,7 +644,7 @@ static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); f->exponent = shift; - mantissa = bkey_mantissa(m, f, j); + mantissa = bkey_mantissa(m, f); /* * If we've got garbage bits, set them to all 1s - it's legal for the @@ -690,8 +666,7 @@ static unsigned __bset_tree_capacity(struct btree *b, const struct bset_tree *t) static unsigned bset_ro_tree_capacity(struct btree *b, const struct bset_tree *t) { - return __bset_tree_capacity(b, t) / - (sizeof(struct bkey_float) + sizeof(u8)); + return __bset_tree_capacity(b, t) / sizeof(struct bkey_float); } static unsigned bset_rw_tree_capacity(struct btree *b, const struct bset_tree *t) @@ -720,7 +695,7 @@ static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) { - struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); + struct bkey_packed *k = btree_bkey_first(b, t); struct bkey_i min_key, max_key; unsigned cacheline = 1; @@ -733,12 +708,12 @@ static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) return; } - t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; + t->extra = eytzinger1_extra(t->size - 1); /* First we figure out where the first key in each cacheline is */ eytzinger1_for_each(j, t->size - 1) { while (bkey_to_cacheline(b, t, k) < cacheline) - prev = k, k = bkey_p_next(k); + k = bkey_p_next(k); if (k >= btree_bkey_last(b, t)) { /* XXX: this path sucks */ @@ -746,17 +721,12 @@ static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) goto retry; } - ro_aux_tree_prev(b, t)[j] = prev->u64s; bkey_float(b, t, j)->key_offset = bkey_to_cacheline_offset(b, t, cacheline++, k); - EBUG_ON(tree_to_prev_bkey(b, t, j) != prev); EBUG_ON(tree_to_bkey(b, t, j) != k); } - while (k != btree_bkey_last(b, t)) - prev = k, k = bkey_p_next(k); - if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { bkey_init(&min_key.k); min_key.k.p = b->data->min_key; @@ -915,66 +885,18 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, /* Insert */ -static void bch2_bset_fix_lookup_table(struct btree *b, - struct bset_tree *t, - struct bkey_packed *_where, - unsigned clobber_u64s, - unsigned new_u64s) +static void rw_aux_tree_insert_entry(struct btree *b, + struct bset_tree *t, + unsigned idx) { - int shift = new_u64s - clobber_u64s; - unsigned l, j, where = __btree_node_key_to_offset(b, _where); - - EBUG_ON(bset_has_ro_aux_tree(t)); - - if (!bset_has_rw_aux_tree(t)) - return; - - /* returns first entry >= where */ - l = rw_aux_tree_bsearch(b, t, where); - - if (!l) /* never delete first entry */ - l++; - else if (l < t->size && - where < t->end_offset && - rw_aux_tree(b, t)[l].offset == where) - rw_aux_tree_set(b, t, l++, _where); - - /* l now > where */ - - for (j = l; - j < t->size && - rw_aux_tree(b, t)[j].offset < where + clobber_u64s; - j++) - ; - - if (j < t->size && - rw_aux_tree(b, t)[j].offset + shift == - rw_aux_tree(b, t)[l - 1].offset) - j++; - - memmove(&rw_aux_tree(b, t)[l], - &rw_aux_tree(b, t)[j], - (void *) &rw_aux_tree(b, t)[t->size] - - (void *) &rw_aux_tree(b, t)[j]); - t->size -= j - l; - - for (j = l; j < t->size; j++) - rw_aux_tree(b, t)[j].offset += shift; - - EBUG_ON(l < t->size && - rw_aux_tree(b, t)[l].offset == - rw_aux_tree(b, t)[l - 1].offset); + EBUG_ON(!idx || idx > t->size); + struct bkey_packed *start = rw_aux_to_bkey(b, t, idx - 1); + struct bkey_packed *end = idx < t->size + ? rw_aux_to_bkey(b, t, idx) + : btree_bkey_last(b, t); if (t->size < bset_rw_tree_capacity(b, t) && - (l < t->size - ? rw_aux_tree(b, t)[l].offset - : t->end_offset) - - rw_aux_tree(b, t)[l - 1].offset > - L1_CACHE_BYTES / sizeof(u64)) { - struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); - struct bkey_packed *end = l < t->size - ? rw_aux_to_bkey(b, t, l) - : btree_bkey_last(b, t); + (void *) end - (void *) start > L1_CACHE_BYTES) { struct bkey_packed *k = start; while (1) { @@ -983,23 +905,78 @@ static void bch2_bset_fix_lookup_table(struct btree *b, break; if ((void *) k - (void *) start >= L1_CACHE_BYTES) { - memmove(&rw_aux_tree(b, t)[l + 1], - &rw_aux_tree(b, t)[l], + memmove(&rw_aux_tree(b, t)[idx + 1], + &rw_aux_tree(b, t)[idx], (void *) &rw_aux_tree(b, t)[t->size] - - (void *) &rw_aux_tree(b, t)[l]); + (void *) &rw_aux_tree(b, t)[idx]); t->size++; - rw_aux_tree_set(b, t, l, k); + rw_aux_tree_set(b, t, idx, k); break; } } } +} +static void bch2_bset_fix_lookup_table(struct btree *b, + struct bset_tree *t, + struct bkey_packed *_where, + unsigned clobber_u64s, + unsigned new_u64s) +{ + int shift = new_u64s - clobber_u64s; + unsigned idx, j, where = __btree_node_key_to_offset(b, _where); + + EBUG_ON(bset_has_ro_aux_tree(t)); + + if (!bset_has_rw_aux_tree(t)) + return; + + if (where > rw_aux_tree(b, t)[t->size - 1].offset) { + rw_aux_tree_insert_entry(b, t, t->size); + goto verify; + } + + /* returns first entry >= where */ + idx = rw_aux_tree_bsearch(b, t, where); + + if (rw_aux_tree(b, t)[idx].offset == where) { + if (!idx) { /* never delete first entry */ + idx++; + } else if (where < t->end_offset) { + rw_aux_tree_set(b, t, idx++, _where); + } else { + EBUG_ON(where != t->end_offset); + rw_aux_tree_insert_entry(b, t, --t->size); + goto verify; + } + } + + EBUG_ON(idx < t->size && rw_aux_tree(b, t)[idx].offset <= where); + if (idx < t->size && + rw_aux_tree(b, t)[idx].offset + shift == + rw_aux_tree(b, t)[idx - 1].offset) { + memmove(&rw_aux_tree(b, t)[idx], + &rw_aux_tree(b, t)[idx + 1], + (void *) &rw_aux_tree(b, t)[t->size] - + (void *) &rw_aux_tree(b, t)[idx + 1]); + t->size -= 1; + } + + for (j = idx; j < t->size; j++) + rw_aux_tree(b, t)[j].offset += shift; + + EBUG_ON(idx < t->size && + rw_aux_tree(b, t)[idx].offset == + rw_aux_tree(b, t)[idx - 1].offset); + + rw_aux_tree_insert_entry(b, t, idx); + +verify: bch2_bset_verify_rw_aux_tree(b, t); bset_aux_tree_verify(b); } void bch2_bset_insert(struct btree *b, - struct btree_node_iter *iter, struct bkey_packed *where, struct bkey_i *insert, unsigned clobber_u64s) @@ -1098,8 +1075,7 @@ static inline void prefetch_four_cachelines(void *p) } static inline bool bkey_mantissa_bits_dropped(const struct btree *b, - const struct bkey_float *f, - unsigned idx) + const struct bkey_float *f) { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; @@ -1133,9 +1109,9 @@ static struct bkey_packed *bset_search_tree(const struct btree *b, goto slowpath; l = f->mantissa; - r = bkey_mantissa(packed_search, f, n); + r = bkey_mantissa(packed_search, f); - if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) + if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f)) goto slowpath; n = n * 2 + (l < r); diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 5c6c7a14fa0f..6953d55b72cc 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -270,8 +270,8 @@ void bch2_bset_init_first(struct btree *, struct bset *); void bch2_bset_init_next(struct btree *, struct btree_node_entry *); void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); -void bch2_bset_insert(struct btree *, struct btree_node_iter *, - struct bkey_packed *, struct bkey_i *, unsigned); +void bch2_bset_insert(struct btree *, struct bkey_packed *, struct bkey_i *, + unsigned); void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); /* Bkey utility code */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index e52a06d3418c..6e4afb2b5441 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -15,11 +15,12 @@ #include #include +#include #define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ do { \ if (shrinker_counter) \ - bc->not_freed_##counter++; \ + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \ } while (0) const char * const bch2_btree_node_flags[] = { @@ -31,24 +32,29 @@ const char * const bch2_btree_node_flags[] = { void bch2_recalc_btree_reserve(struct bch_fs *c) { - unsigned i, reserve = 16; + unsigned reserve = 16; if (!c->btree_roots_known[0].b) reserve += 8; - for (i = 0; i < btree_id_nr_alive(c); i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); if (r->b) reserve += min_t(unsigned, 1, r->b->c.level) * 8; } - c->btree_cache.reserve = reserve; + c->btree_cache.nr_reserve = reserve; } -static inline unsigned btree_cache_can_free(struct btree_cache *bc) +static inline size_t btree_cache_can_free(struct btree_cache_list *list) { - return max_t(int, 0, bc->used - bc->reserve); + struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); + + size_t can_free = list->nr; + if (!list->idx) + can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve); + return can_free; } static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) @@ -63,6 +69,18 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) { struct btree_cache *bc = &c->btree_cache; + BUG_ON(btree_node_hashed(b)); + + /* + * This should really be done in slub/vmalloc, but we're using the + * kmalloc_large() path, so we're working around a slub bug by doing + * this here: + */ + if (b->data) + mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE); + if (b->aux_data) + mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE); + EBUG_ON(btree_node_write_in_flight(b)); clear_btree_node_just_written(b); @@ -76,7 +94,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) #endif b->aux_data = NULL; - bc->used--; + bc->nr_freeable--; btree_node_to_freedlist(bc, b); } @@ -102,6 +120,8 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { BUG_ON(b->data || b->aux_data); + gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; + b->data = kvmalloc(btree_buf_bytes(b), gfp); if (!b->data) return -BCH_ERR_ENOMEM_btree_node_mem_alloc; @@ -154,7 +174,7 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) bch2_btree_lock_init(&b->c, 0); - bc->used++; + bc->nr_freeable++; list_add(&b->list, &bc->freeable); return b; } @@ -169,10 +189,56 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) six_unlock_intent(&b->c.lock); } +static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b) +{ + struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); + + u64 mask = bc->pinned_nodes_mask[!!b->c.level]; + + return ((mask & BIT_ULL(b->c.btree_id)) && + bbpos_cmp(bc->pinned_nodes_start, pos) < 0 && + bbpos_cmp(bc->pinned_nodes_end, pos) >= 0); +} + +void bch2_node_pin(struct bch_fs *c, struct btree *b) +{ + struct btree_cache *bc = &c->btree_cache; + + mutex_lock(&bc->lock); + BUG_ON(!__btree_node_pinned(bc, b)); + if (b != btree_node_root(c, b) && !btree_node_pinned(b)) { + set_btree_node_pinned(b); + list_move(&b->list, &bc->live[1].list); + bc->live[0].nr--; + bc->live[1].nr++; + } + mutex_unlock(&bc->lock); +} + +void bch2_btree_cache_unpin(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b, *n; + + mutex_lock(&bc->lock); + c->btree_cache.pinned_nodes_mask[0] = 0; + c->btree_cache.pinned_nodes_mask[1] = 0; + + list_for_each_entry_safe(b, n, &bc->live[1].list, list) { + clear_btree_node_pinned(b); + list_move(&b->list, &bc->live[0].list); + bc->live[0].nr++; + bc->live[1].nr--; + } + + mutex_unlock(&bc->lock); +} + /* Btree in memory cache - hash table */ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) { + lockdep_assert_held(&bc->lock); int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); BUG_ON(ret); @@ -181,7 +247,11 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) b->hash_val = 0; if (b->c.btree_id < BTREE_ID_NR) - --bc->used_by_btree[b->c.btree_id]; + --bc->nr_by_btree[b->c.btree_id]; + + bc->live[btree_node_pinned(b)].nr--; + bc->nr_freeable++; + list_move(&b->list, &bc->freeable); } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) @@ -191,23 +261,30 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash, bch_btree_cache_params); - if (!ret && b->c.btree_id < BTREE_ID_NR) - bc->used_by_btree[b->c.btree_id]++; - return ret; + if (ret) + return ret; + + if (b->c.btree_id < BTREE_ID_NR) + bc->nr_by_btree[b->c.btree_id]++; + + bool p = __btree_node_pinned(bc, b); + mod_bit(BTREE_NODE_pinned, &b->flags, p); + + list_move_tail(&b->list, &bc->live[p].list); + bc->live[p].nr++; + + bc->nr_freeable--; + return 0; } int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, unsigned level, enum btree_id id) { - int ret; - b->c.level = level; b->c.btree_id = id; mutex_lock(&bc->lock); - ret = __bch2_btree_node_hash_insert(bc, b); - if (!ret) - list_add_tail(&b->list, &bc->live); + int ret = __bch2_btree_node_hash_insert(bc, b); mutex_unlock(&bc->lock); return ret; @@ -261,18 +338,6 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b int ret = 0; lockdep_assert_held(&bc->lock); - - struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); - - u64 mask = b->c.level - ? bc->pinned_nodes_interior_mask - : bc->pinned_nodes_leaf_mask; - - if ((mask & BIT_ULL(b->c.btree_id)) && - bbpos_cmp(bc->pinned_nodes_start, pos) < 0 && - bbpos_cmp(bc->pinned_nodes_end, pos) >= 0) - return -BCH_ERR_ENOMEM_btree_node_reclaim; - wait_on_io: if (b->flags & ((1U << BTREE_NODE_dirty)| (1U << BTREE_NODE_read_in_flight)| @@ -377,8 +442,9 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct bch_fs *c = shrink->private_data; - struct btree_cache *bc = &c->btree_cache; + struct btree_cache_list *list = shrink->private_data; + struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); + struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache); struct btree *b, *t; unsigned long nr = sc->nr_to_scan; unsigned long can_free = 0; @@ -386,8 +452,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, unsigned long touched = 0; unsigned i, flags; unsigned long ret = SHRINK_STOP; - bool trigger_writes = atomic_read(&bc->dirty) + nr >= - bc->used * 3 / 4; + bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4; if (bch2_btree_shrinker_disabled) return SHRINK_STOP; @@ -402,7 +467,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, * succeed, so that inserting keys into the btree can always succeed and * IO can always make forward progress: */ - can_free = btree_cache_can_free(bc); + can_free = btree_cache_can_free(list); nr = min_t(unsigned long, nr, can_free); i = 0; @@ -424,22 +489,24 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); freed++; - bc->freed++; + bc->nr_freed++; } } restart: - list_for_each_entry_safe(b, t, &bc->live, list) { + list_for_each_entry_safe(b, t, &list->list, list) { touched++; if (btree_node_accessed(b)) { clear_btree_node_accessed(b); - bc->not_freed_access_bit++; + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++; + --touched;; } else if (!btree_node_reclaim(c, b, true)) { + bch2_btree_node_hash_remove(bc, b); + freed++; btree_node_data_free(c, b); - bc->freed++; + bc->nr_freed++; - bch2_btree_node_hash_remove(bc, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -450,7 +517,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, !btree_node_will_make_reachable(b) && !btree_node_write_blocked(b) && six_trylock_read(&b->c.lock)) { - list_move(&bc->live, &b->list); + list_move(&list->list, &b->list); mutex_unlock(&bc->lock); __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); six_unlock_read(&b->c.lock); @@ -464,8 +531,8 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, break; } out_rotate: - if (&t->list != &bc->live) - list_move_tail(&bc->live, &t->list); + if (&t->list != &list->list) + list_move_tail(&list->list, &t->list); out: mutex_unlock(&bc->lock); out_nounlock: @@ -478,44 +545,45 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, static unsigned long bch2_btree_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - struct bch_fs *c = shrink->private_data; - struct btree_cache *bc = &c->btree_cache; + struct btree_cache_list *list = shrink->private_data; if (bch2_btree_shrinker_disabled) return 0; - return btree_cache_can_free(bc); + return btree_cache_can_free(list); } void bch2_fs_btree_cache_exit(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; - struct btree *b; - unsigned i, flags; + struct btree *b, *t; + unsigned long flags; - shrinker_free(bc->shrink); + shrinker_free(bc->live[1].shrink); + shrinker_free(bc->live[0].shrink); /* vfree() can allocate memory: */ flags = memalloc_nofs_save(); mutex_lock(&bc->lock); if (c->verify_data) - list_move(&c->verify_data->list, &bc->live); + list_move(&c->verify_data->list, &bc->live[0].list); kvfree(c->verify_ondisk); - for (i = 0; i < btree_id_nr_alive(c); i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); if (r->b) - list_add(&r->b->list, &bc->live); + list_add(&r->b->list, &bc->live[0].list); } - list_splice(&bc->freeable, &bc->live); - - while (!list_empty(&bc->live)) { - b = list_first_entry(&bc->live, struct btree, list); + list_for_each_entry_safe(b, t, &bc->live[1].list, list) + bch2_btree_node_hash_remove(bc, b); + list_for_each_entry_safe(b, t, &bc->live[0].list, list) + bch2_btree_node_hash_remove(bc, b); + list_for_each_entry_safe(b, t, &bc->freeable, list) { BUG_ON(btree_node_read_in_flight(b) || btree_node_write_in_flight(b)); @@ -523,12 +591,11 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) } BUG_ON(!bch2_journal_error(&c->journal) && - atomic_read(&c->btree_cache.dirty)); + atomic_long_read(&c->btree_cache.nr_dirty)); list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); - while (!list_empty(&bc->freed_nonpcpu)) { - b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); + list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) { list_del(&b->list); six_lock_exit(&b->c.lock); kfree(b); @@ -537,6 +604,12 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) mutex_unlock(&bc->lock); memalloc_nofs_restore(flags); + for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) + BUG_ON(bc->nr_by_btree[i]); + BUG_ON(bc->live[0].nr); + BUG_ON(bc->live[1].nr); + BUG_ON(bc->nr_freeable); + if (bc->table_init_done) rhashtable_destroy(&bc->table); } @@ -556,22 +629,32 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bch2_recalc_btree_reserve(c); - for (i = 0; i < bc->reserve; i++) + for (i = 0; i < bc->nr_reserve; i++) if (!__bch2_btree_node_mem_alloc(c)) goto err; - list_splice_init(&bc->live, &bc->freeable); + list_splice_init(&bc->live[0].list, &bc->freeable); mutex_init(&c->verify_lock); shrink = shrinker_alloc(0, "%s-btree_cache", c->name); if (!shrink) goto err; - bc->shrink = shrink; + bc->live[0].shrink = shrink; shrink->count_objects = bch2_btree_cache_count; shrink->scan_objects = bch2_btree_cache_scan; - shrink->seeks = 4; - shrink->private_data = c; + shrink->seeks = 2; + shrink->private_data = &bc->live[0]; + shrinker_register(shrink); + + shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name); + if (!shrink) + goto err; + bc->live[1].shrink = shrink; + shrink->count_objects = bch2_btree_cache_count; + shrink->scan_objects = bch2_btree_cache_scan; + shrink->seeks = 8; + shrink->private_data = &bc->live[1]; shrinker_register(shrink); return 0; @@ -582,7 +665,10 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) void bch2_fs_btree_cache_init_early(struct btree_cache *bc) { mutex_init(&bc->lock); - INIT_LIST_HEAD(&bc->live); + for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) { + bc->live[i].idx = i; + INIT_LIST_HEAD(&bc->live[i].list); + } INIT_LIST_HEAD(&bc->freeable); INIT_LIST_HEAD(&bc->freed_pcpu); INIT_LIST_HEAD(&bc->freed_nonpcpu); @@ -644,14 +730,16 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) struct btree_cache *bc = &c->btree_cache; struct btree *b; - list_for_each_entry_reverse(b, &bc->live, list) - if (!btree_node_reclaim(c, b, false)) - return b; + for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) + list_for_each_entry_reverse(b, &bc->live[i].list, list) + if (!btree_node_reclaim(c, b, false)) + return b; while (1) { - list_for_each_entry_reverse(b, &bc->live, list) - if (!btree_node_write_and_reclaim(c, b)) - return b; + for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) + list_for_each_entry_reverse(b, &bc->live[i].list, list) + if (!btree_node_write_and_reclaim(c, b)) + return b; /* * Rare case: all nodes were intent-locked. @@ -671,9 +759,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea : &bc->freed_nonpcpu; struct btree *b, *b2; u64 start_time = local_clock(); - unsigned flags; - flags = memalloc_nofs_save(); mutex_lock(&bc->lock); /* @@ -725,7 +811,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea } mutex_lock(&bc->lock); - bc->used++; + bc->nr_freeable++; got_mem: mutex_unlock(&bc->lock); @@ -745,8 +831,6 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); - memalloc_nofs_restore(flags); - int ret = bch2_trans_relock(trans); if (unlikely(ret)) { bch2_btree_node_to_freelist(c, b); @@ -781,7 +865,6 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea } mutex_unlock(&bc->lock); - memalloc_nofs_restore(flags); return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc); } @@ -1269,8 +1352,8 @@ void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) BUG_ON(btree_node_dirty(b)); mutex_lock(&bc->lock); - btree_node_data_free(c, b); bch2_btree_node_hash_remove(bc, b); + btree_node_data_free(c, b); mutex_unlock(&bc->lock); out: six_unlock_write(&b->c.lock); @@ -1342,13 +1425,20 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc } static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c, - const char *label, unsigned nr) + const char *label, size_t nr) { prt_printf(out, "%s\t", label); prt_human_readable_u64(out, nr * c->opts.btree_node_size); - prt_printf(out, " (%u)\n", nr); + prt_printf(out, " (%zu)\n", nr); } +static const char * const bch2_btree_cache_not_freed_reasons_strs[] = { +#define x(n) #n, + BCH_BTREE_CACHE_NOT_FREED_REASONS() +#undef x + NULL +}; + void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache); @@ -1356,24 +1446,21 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc if (!out->nr_tabstops) printbuf_tabstop_push(out, 32); - prt_btree_cache_line(out, c, "total:", bc->used); - prt_btree_cache_line(out, c, "nr dirty:", atomic_read(&bc->dirty)); + prt_btree_cache_line(out, c, "live:", bc->live[0].nr); + prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr); + prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable); + prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty)); prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); prt_newline(out); - for (unsigned i = 0; i < ARRAY_SIZE(bc->used_by_btree); i++) - prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->used_by_btree[i]); + for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) + prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]); prt_newline(out); - prt_printf(out, "freed:\t%u\n", bc->freed); + prt_printf(out, "freed:\t%zu\n", bc->nr_freed); prt_printf(out, "not freed:\n"); - prt_printf(out, " dirty\t%u\n", bc->not_freed_dirty); - prt_printf(out, " write in flight\t%u\n", bc->not_freed_write_in_flight); - prt_printf(out, " read in flight\t%u\n", bc->not_freed_read_in_flight); - prt_printf(out, " lock intent failed\t%u\n", bc->not_freed_lock_intent); - prt_printf(out, " lock write failed\t%u\n", bc->not_freed_lock_write); - prt_printf(out, " access bit\t%u\n", bc->not_freed_access_bit); - prt_printf(out, " no evict failed\t%u\n", bc->not_freed_noevict); - prt_printf(out, " write blocked\t%u\n", bc->not_freed_write_blocked); - prt_printf(out, " will make reachable\t%u\n", bc->not_freed_will_make_reachable); + + for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++) + prt_printf(out, " %s\t%llu\n", + bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]); } diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index f82064007127..367acd217c6a 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -19,6 +19,9 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, unsigned, enum btree_id); +void bch2_node_pin(struct bch_fs *, struct btree *); +void bch2_btree_cache_unpin(struct bch_fs *); + void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index eb3002c4eae7..b5e0692f03c6 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -549,9 +549,8 @@ int bch2_check_topology(struct bch_fs *c) six_unlock_read(&b->c.lock); if (ret == DROP_THIS_NODE) { - bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_lock(&c->btree_cache.lock); - list_move(&b->list, &c->btree_cache.freeable); + bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_unlock(&c->btree_cache.lock); r->b = NULL; @@ -753,10 +752,8 @@ static void bch2_gc_free(struct bch_fs *c) genradix_free(&c->reflink_gc_table); genradix_free(&c->gc_stripes); - for_each_member_device(c, ca) { - kvfree(rcu_dereference_protected(ca->buckets_gc, 1)); - ca->buckets_gc = NULL; - } + for_each_member_device(c, ca) + genradix_free(&ca->buckets_gc); } static int bch2_gc_start(struct bch_fs *c) @@ -910,20 +907,12 @@ static int bch2_gc_alloc_start(struct bch_fs *c) int ret = 0; for_each_member_device(c, ca) { - struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket), - GFP_KERNEL|__GFP_ZERO); - if (!buckets) { + ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL); + if (ret) { bch2_dev_put(ca); ret = -BCH_ERR_ENOMEM_gc_alloc_start; break; } - - buckets->first_bucket = ca->mi.first_bucket; - buckets->nbuckets = ca->mi.nbuckets; - buckets->nbuckets_minus_first = - buckets->nbuckets - buckets->first_bucket; - rcu_assign_pointer(ca->buckets_gc, buckets); } bch_err_fn(c, ret); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 56ea9a77cd4a..cb48a9477514 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1666,7 +1666,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, bch2_btree_pos_to_text(&buf, c, b); bch_err_ratelimited(c, "%s", buf.buf); - if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && + if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) bch2_fatal_error(c); @@ -1749,10 +1749,8 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, bch2_btree_node_read(trans, b, true); if (btree_node_read_error(b)) { - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_lock(&c->btree_cache.lock); - list_move(&b->list, &c->btree_cache.freeable); + bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_unlock(&c->btree_cache.lock); ret = -BCH_ERR_btree_node_read_error; @@ -2031,7 +2029,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) do_write: BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0)); - atomic_dec(&c->btree_cache.dirty); + atomic_long_dec(&c->btree_cache.nr_dirty); BUG_ON(btree_node_fake(b)); BUG_ON((b->will_make_reachable != 0) != !b->written); diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 63d76f5c6403..9b01ca3de907 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -18,13 +18,13 @@ struct btree_node_read_all; static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) { if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) - atomic_inc(&c->btree_cache.dirty); + atomic_long_inc(&c->btree_cache.nr_dirty); } static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) { if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) - atomic_dec(&c->btree_cache.dirty); + atomic_long_dec(&c->btree_cache.nr_dirty); } static inline unsigned btree_ptr_sectors_written(struct bkey_s_c k) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 2e84d22e17bd..bfe9f0c1e1be 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1010,9 +1010,9 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) * the same position: */ if (trans->paths[idx].uptodate) { - __btree_path_get(&trans->paths[idx], false); + __btree_path_get(trans, &trans->paths[idx], false); ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_); - __btree_path_put(&trans->paths[idx], false); + __btree_path_put(trans, &trans->paths[idx], false); if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || bch2_err_matches(ret, ENOMEM)) @@ -1131,6 +1131,8 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, if (unlikely(!trans->srcu_held)) bch2_trans_srcu_lock(trans); + trace_btree_path_traverse_start(trans, path); + /* * Ensure we obey path->should_be_locked: if it's set, we can't unlock * and re-traverse the path without a transaction restart: @@ -1194,6 +1196,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, out_uptodate: path->uptodate = BTREE_ITER_UPTODATE; + trace_btree_path_traverse_end(trans, path); out: if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted) panic("ret %s (%i) trans->restarted %s (%i)\n", @@ -1225,7 +1228,7 @@ static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_i { btree_path_idx_t new = btree_path_alloc(trans, src); btree_path_copy(trans, trans->paths + new, trans->paths + src); - __btree_path_get(trans->paths + new, intent); + __btree_path_get(trans, trans->paths + new, intent); #ifdef TRACK_PATH_ALLOCATED trans->paths[new].ip_allocated = ip; #endif @@ -1236,8 +1239,10 @@ __flatten btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans, btree_path_idx_t path, bool intent, unsigned long ip) { - __btree_path_put(trans->paths + path, intent); + struct btree_path *old = trans->paths + path; + __btree_path_put(trans, trans->paths + path, intent); path = btree_path_clone(trans, path, intent, ip); + trace_btree_path_clone(trans, old, trans->paths + path); trans->paths[path].preserve = false; return path; } @@ -1252,6 +1257,8 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, bch2_trans_verify_not_in_restart(trans); EBUG_ON(!trans->paths[path_idx].ref); + trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos); + path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip); struct btree_path *path = trans->paths + path_idx; @@ -1361,13 +1368,15 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in { struct btree_path *path = trans->paths + path_idx, *dup; - if (!__btree_path_put(path, intent)) + if (!__btree_path_put(trans, path, intent)) return; dup = path->preserve ? have_path_at_pos(trans, path) : have_node_at_pos(trans, path); + trace_btree_path_free(trans, path_idx, dup); + if (!dup && !(!path->preserve && !is_btree_node(path, path->level))) return; @@ -1392,7 +1401,7 @@ void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool in static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path, bool intent) { - if (!__btree_path_put(trans->paths + path, intent)) + if (!__btree_path_put(trans, trans->paths + path, intent)) return; __bch2_path_free(trans, path); @@ -1421,8 +1430,8 @@ void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans) noinline __cold void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { - prt_printf(buf, "transaction updates for %s journal seq %llu\n", - trans->fn, trans->journal_res.seq); + prt_printf(buf, "%u transaction updates for %s journal seq %llu\n", + trans->nr_updates, trans->fn, trans->journal_res.seq); printbuf_indent_add(buf, 2); trans_for_each_update(trans, i) { @@ -1464,7 +1473,7 @@ static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_tra { struct btree_path *path = trans->paths + path_idx; - prt_printf(out, "path: idx %2u ref %u:%u %c %c %c btree=%s l=%u pos ", + prt_printf(out, "path: idx %3u ref %u:%u %c %c %c btree=%s l=%u pos ", path_idx, path->ref, path->intent_ref, path->preserve ? 'P' : ' ', path->should_be_locked ? 'S' : ' ', @@ -1716,14 +1725,16 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, trans->paths[path_pos].cached == cached && trans->paths[path_pos].btree_id == btree_id && trans->paths[path_pos].level == level) { - __btree_path_get(trans->paths + path_pos, intent); + trace_btree_path_get(trans, trans->paths + path_pos, &pos); + + __btree_path_get(trans, trans->paths + path_pos, intent); path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); path = trans->paths + path_idx; } else { path_idx = btree_path_alloc(trans, path_pos); path = trans->paths + path_idx; - __btree_path_get(path, intent); + __btree_path_get(trans, path, intent); path->pos = pos; path->btree_id = btree_id; path->cached = cached; @@ -1738,6 +1749,8 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, path->ip_allocated = ip; #endif trans->paths_sorted = false; + + trace_btree_path_alloc(trans, path); } if (!(flags & BTREE_ITER_nopreserve)) @@ -1857,7 +1870,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) struct btree_path *path = btree_iter_path(trans, iter); if (btree_path_node(path, path->level)) - btree_path_set_should_be_locked(path); + btree_path_set_should_be_locked(trans, path); return 0; } @@ -1889,7 +1902,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(btree_iter_path(trans, iter)); + btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -1983,7 +1996,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(btree_iter_path(trans, iter)); + btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); EBUG_ON(btree_iter_path(trans, iter)->uptodate); out: bch2_btree_iter_verify_entry_exit(iter); @@ -2155,7 +2168,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos if (unlikely(ret)) return bkey_s_c_err(ret); - btree_path_set_should_be_locked(trans->paths + iter->key_cache_path); + btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); if (k.k && !bkey_err(k)) { @@ -2199,7 +2212,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp goto out; } - btree_path_set_should_be_locked(path); + btree_path_set_should_be_locked(trans, path); k = btree_path_level_peek_all(trans->c, l, &iter->k); @@ -2326,7 +2339,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * advance, same as on exit for iter->path, but only up * to snapshot */ - __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_intent); + __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent); iter->update_path = iter->path; iter->update_path = bch2_btree_path_set_pos(trans, @@ -2382,14 +2395,14 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(btree_iter_path(trans, iter)); + btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); out_no_locked: if (iter->update_path) { ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_); if (unlikely(ret)) k = bkey_s_c_err(ret); else - btree_path_set_should_be_locked(trans->paths + iter->update_path); + btree_path_set_should_be_locked(trans, trans->paths + iter->update_path); } if (!(iter->flags & BTREE_ITER_all_snapshots)) @@ -2511,6 +2524,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) iter->flags & BTREE_ITER_intent, _THIS_IP_); path = btree_iter_path(trans, iter); + trace_btree_path_save_pos(trans, path, trans->paths + saved_path); saved_k = *k.k; saved_v = k.v; } @@ -2527,7 +2541,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) continue; } - btree_path_set_should_be_locked(path); + btree_path_set_should_be_locked(trans, path); break; } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { /* Advance to previous leaf node: */ @@ -2685,7 +2699,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } } out: - btree_path_set_should_be_locked(btree_iter_path(trans, iter)); + btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); out_no_locked: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -2712,6 +2726,7 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) return bch2_btree_iter_peek_slot(iter); } +/* Obsolete, but still used by rust wrapper in -tools */ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter) { struct bkey_s_c k; @@ -2911,9 +2926,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) dst->ip_allocated = _RET_IP_; #endif if (src->path) - __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_intent); + __btree_path_get(trans, trans->paths + src->path, src->flags & BTREE_ITER_intent); if (src->update_path) - __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_intent); + __btree_path_get(trans, trans->paths + src->update_path, src->flags & BTREE_ITER_intent); dst->key_cache_path = 0; } @@ -3237,7 +3252,7 @@ void bch2_trans_put(struct btree_trans *trans) bch2_trans_unlock(trans); trans_for_each_update(trans, i) - __btree_path_put(trans->paths + i->path, true); + __btree_path_put(trans, trans->paths + i->path, true); trans->nr_updates = 0; check_btree_paths_leaked(trans); diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 222b7ce8a901..78e63ad7d380 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -6,6 +6,12 @@ #include "btree_types.h" #include "trace.h" +void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); +void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t); +void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); +void bch2_dump_trans_updates(struct btree_trans *); +void bch2_dump_trans_paths_updates(struct btree_trans *); + static inline int __bkey_err(const struct bkey *k) { return PTR_ERR_OR_ZERO(k); @@ -13,16 +19,28 @@ static inline int __bkey_err(const struct bkey *k) #define bkey_err(_k) __bkey_err((_k).k) -static inline void __btree_path_get(struct btree_path *path, bool intent) +static inline void __btree_path_get(struct btree_trans *trans, struct btree_path *path, bool intent) { + unsigned idx = path - trans->paths; + + EBUG_ON(!test_bit(idx, trans->paths_allocated)); + if (unlikely(path->ref == U8_MAX)) { + bch2_dump_trans_paths_updates(trans); + panic("path %u refcount overflow\n", idx); + } + path->ref++; path->intent_ref += intent; + trace_btree_path_get_ll(trans, path); } -static inline bool __btree_path_put(struct btree_path *path, bool intent) +static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) { + EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated)); EBUG_ON(!path->ref); EBUG_ON(!path->intent_ref && intent); + + trace_btree_path_put_ll(trans, path); path->intent_ref -= intent; return --path->ref == 0; } @@ -511,6 +529,12 @@ void bch2_set_btree_iter_dontneed(struct btree_iter *); void *__bch2_trans_kmalloc(struct btree_trans *, size_t); +/** + * bch2_trans_kmalloc - allocate memory for use by the current transaction + * + * Must be called after bch2_trans_begin, which on second and further calls + * frees all memory allocated in this transaction + */ static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { size = roundup(size, 8); @@ -814,20 +838,6 @@ transaction_restart: \ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); -static inline struct bkey_s_c -__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, - struct btree_iter *iter, unsigned flags) -{ - struct bkey_s_c k; - - while (btree_trans_too_many_iters(trans) || - (k = bch2_btree_iter_peek_type(iter, flags), - bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) - bch2_trans_begin(trans); - - return k; -} - #define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ _start, _end, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ @@ -868,7 +878,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, \ if (bch2_err_matches(_ret, ENOMEM)) { \ _gfp = GFP_KERNEL; \ - _ret = drop_locks_do(trans, _do); \ + _ret = drop_locks_do(_trans, _do); \ } \ _ret; \ }) @@ -881,7 +891,7 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, _ret = 0; \ if (unlikely(!_p)) { \ _gfp = GFP_KERNEL; \ - _ret = drop_locks_do(trans, ((_p = _do), 0)); \ + _ret = drop_locks_do(_trans, ((_p = _do), 0)); \ } \ _p; \ }) @@ -894,12 +904,6 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, _ret; \ }) -void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); -void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t); -void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); -void bch2_dump_trans_updates(struct btree_trans *); -void bch2_dump_trans_paths_updates(struct btree_trans *); - struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); void bch2_trans_put(struct btree_trans *); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index fda7998734cb..244610b1d0b5 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -79,134 +79,47 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) return true; } -static void bkey_cached_evict(struct btree_key_cache *c, +static bool bkey_cached_evict(struct btree_key_cache *c, struct bkey_cached *ck) { - BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, - bch2_btree_key_cache_params)); - memset(&ck->key, ~0, sizeof(ck->key)); + bool ret = !rhashtable_remove_fast(&c->table, &ck->hash, + bch2_btree_key_cache_params); + if (ret) { + memset(&ck->key, ~0, sizeof(ck->key)); + atomic_long_dec(&c->nr_keys); + } - atomic_long_dec(&c->nr_keys); + return ret; +} + +static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu) +{ + struct bch_fs *c = container_of(pending->srcu, struct bch_fs, btree_trans_barrier); + struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu); + + this_cpu_dec(*c->btree_key_cache.nr_pending); + kmem_cache_free(bch2_key_cache, ck); } static void bkey_cached_free(struct btree_key_cache *bc, struct bkey_cached *ck) { - struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - - BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); - - ck->btree_trans_barrier_seq = - start_poll_synchronize_srcu(&c->btree_trans_barrier); - - if (ck->c.lock.readers) { - list_move_tail(&ck->list, &bc->freed_pcpu); - bc->nr_freed_pcpu++; - } else { - list_move_tail(&ck->list, &bc->freed_nonpcpu); - bc->nr_freed_nonpcpu++; - } - atomic_long_inc(&bc->nr_freed); - kfree(ck->k); ck->k = NULL; ck->u64s = 0; six_unlock_write(&ck->c.lock); six_unlock_intent(&ck->c.lock); -} -#ifdef __KERNEL__ -static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc, - struct bkey_cached *ck) -{ - struct bkey_cached *pos; - - bc->nr_freed_nonpcpu++; - - list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { - if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, - pos->btree_trans_barrier_seq)) { - list_move(&ck->list, &pos->list); - return; - } - } - - list_move(&ck->list, &bc->freed_nonpcpu); -} -#endif - -static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, - struct bkey_cached *ck) -{ - BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); - - if (!ck->c.lock.readers) { -#ifdef __KERNEL__ - struct btree_key_cache_freelist *f; - bool freed = false; - - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - - if (f->nr < ARRAY_SIZE(f->objs)) { - f->objs[f->nr++] = ck; - freed = true; - } - preempt_enable(); - - if (!freed) { - mutex_lock(&bc->lock); - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - - while (f->nr > ARRAY_SIZE(f->objs) / 2) { - struct bkey_cached *ck2 = f->objs[--f->nr]; - - __bkey_cached_move_to_freelist_ordered(bc, ck2); - } - preempt_enable(); - - __bkey_cached_move_to_freelist_ordered(bc, ck); - mutex_unlock(&bc->lock); - } -#else - mutex_lock(&bc->lock); - list_move_tail(&ck->list, &bc->freed_nonpcpu); - bc->nr_freed_nonpcpu++; - mutex_unlock(&bc->lock); -#endif - } else { - mutex_lock(&bc->lock); - list_move_tail(&ck->list, &bc->freed_pcpu); - bc->nr_freed_pcpu++; - mutex_unlock(&bc->lock); - } -} - -static void bkey_cached_free_fast(struct btree_key_cache *bc, - struct bkey_cached *ck) -{ - struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - - ck->btree_trans_barrier_seq = - start_poll_synchronize_srcu(&c->btree_trans_barrier); - - list_del_init(&ck->list); - atomic_long_inc(&bc->nr_freed); - - kfree(ck->k); - ck->k = NULL; - ck->u64s = 0; - - bkey_cached_move_to_freelist(bc, ck); - - six_unlock_write(&ck->c.lock); - six_unlock_intent(&ck->c.lock); + bool pcpu_readers = ck->c.lock.readers != NULL; + rcu_pending_enqueue(&bc->pending[pcpu_readers], &ck->rcu); + this_cpu_inc(*bc->nr_pending); } static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) { + gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; + struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp); if (unlikely(!ck)) return NULL; @@ -224,74 +137,14 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k { struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; - struct bkey_cached *ck = NULL; bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); int ret; - if (!pcpu_readers) { -#ifdef __KERNEL__ - struct btree_key_cache_freelist *f; - - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - if (f->nr) - ck = f->objs[--f->nr]; - preempt_enable(); - - if (!ck) { - mutex_lock(&bc->lock); - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - - while (!list_empty(&bc->freed_nonpcpu) && - f->nr < ARRAY_SIZE(f->objs) / 2) { - ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); - list_del_init(&ck->list); - bc->nr_freed_nonpcpu--; - f->objs[f->nr++] = ck; - } - - ck = f->nr ? f->objs[--f->nr] : NULL; - preempt_enable(); - mutex_unlock(&bc->lock); - } -#else - mutex_lock(&bc->lock); - if (!list_empty(&bc->freed_nonpcpu)) { - ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); - list_del_init(&ck->list); - bc->nr_freed_nonpcpu--; - } - mutex_unlock(&bc->lock); -#endif - } else { - mutex_lock(&bc->lock); - if (!list_empty(&bc->freed_pcpu)) { - ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list); - list_del_init(&ck->list); - bc->nr_freed_pcpu--; - } - mutex_unlock(&bc->lock); - } - - if (ck) { - ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_); - if (unlikely(ret)) { - bkey_cached_move_to_freelist(bc, ck); - return ERR_PTR(ret); - } - - btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED); - - ret = bch2_btree_node_lock_write(trans, path, &ck->c); - if (unlikely(ret)) { - btree_node_unlock(trans, path, 0); - bkey_cached_move_to_freelist(bc, ck); - return ERR_PTR(ret); - } - - return ck; - } + struct bkey_cached *ck = container_of_or_null( + rcu_pending_dequeue(&bc->pending[pcpu_readers]), + struct bkey_cached, rcu); + if (ck) + goto lock; ck = allocate_dropping_locks(trans, ret, __bkey_cached_alloc(key_u64s, _gfp)); @@ -302,15 +155,19 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k return ERR_PTR(ret); } - if (!ck) - return NULL; + if (ck) { + bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); + ck->c.cached = true; + goto lock; + } - INIT_LIST_HEAD(&ck->list); - bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); - - ck->c.cached = true; - BUG_ON(!six_trylock_intent(&ck->c.lock)); - BUG_ON(!six_trylock_write(&ck->c.lock)); + ck = container_of_or_null(rcu_pending_dequeue_from_all(&bc->pending[pcpu_readers]), + struct bkey_cached, rcu); + if (ck) + goto lock; +lock: + six_lock_intent(&ck->c.lock, NULL, NULL); + six_lock_write(&ck->c.lock, NULL, NULL); return ck; } @@ -322,21 +179,21 @@ bkey_cached_reuse(struct btree_key_cache *c) struct bkey_cached *ck; unsigned i; - mutex_lock(&c->lock); rcu_read_lock(); tbl = rht_dereference_rcu(c->table.tbl, &c->table); for (i = 0; i < tbl->size; i++) rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bkey_cached_lock_for_evict(ck)) { - bkey_cached_evict(c, ck); - goto out; + if (bkey_cached_evict(c, ck)) + goto out; + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); } } ck = NULL; out: rcu_read_unlock(); - mutex_unlock(&c->lock); return ck; } @@ -415,7 +272,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * path->uptodate = BTREE_ITER_UPTODATE; return 0; err: - bkey_cached_free_fast(bc, ck); + bkey_cached_free(bc, ck); mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); return ret; @@ -611,8 +468,12 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, } mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); - bkey_cached_evict(&c->btree_key_cache, ck); - bkey_cached_free_fast(&c->btree_key_cache, ck); + if (bkey_cached_evict(&c->btree_key_cache, ck)) { + bkey_cached_free(&c->btree_key_cache, ck); + } else { + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); + } } out: bch2_trans_iter_exit(trans, &b_iter); @@ -722,7 +583,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, } bkey_cached_evict(bc, ck); - bkey_cached_free_fast(bc, ck); + bkey_cached_free(bc, ck); mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); @@ -735,48 +596,14 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, struct bch_fs *c = shrink->private_data; struct btree_key_cache *bc = &c->btree_key_cache; struct bucket_table *tbl; - struct bkey_cached *ck, *t; + struct bkey_cached *ck; size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; - unsigned start, flags; + unsigned iter, start; int srcu_idx; - mutex_lock(&bc->lock); - bc->requested_to_free += sc->nr_to_scan; - srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - flags = memalloc_nofs_save(); - - /* - * Newest freed entries are at the end of the list - once we hit one - * that's too new to be freed, we can bail out: - */ - list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { - if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, - ck->btree_trans_barrier_seq)) - break; - - list_del(&ck->list); - six_lock_exit(&ck->c.lock); - kmem_cache_free(bch2_key_cache, ck); - atomic_long_dec(&bc->nr_freed); - bc->nr_freed_nonpcpu--; - bc->freed++; - } - - list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { - if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, - ck->btree_trans_barrier_seq)) - break; - - list_del(&ck->list); - six_lock_exit(&ck->c.lock); - kmem_cache_free(bch2_key_cache, ck); - atomic_long_dec(&bc->nr_freed); - bc->nr_freed_pcpu--; - bc->freed++; - } - rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); /* @@ -792,17 +619,18 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, return SHRINK_STOP; } - if (bc->shrink_iter >= tbl->size) - bc->shrink_iter = 0; - start = bc->shrink_iter; + iter = bc->shrink_iter; + if (iter >= tbl->size) + iter = 0; + start = iter; do { struct rhash_head *pos, *next; - pos = rht_ptr_rcu(&tbl->buckets[bc->shrink_iter]); + pos = rht_ptr_rcu(&tbl->buckets[iter]); while (!rht_is_a_nulls(pos)) { - next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); + next = rht_dereference_bucket_rcu(pos->next, tbl, iter); ck = container_of(pos, struct bkey_cached, hash); if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { @@ -812,29 +640,31 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, bc->skipped_accessed++; } else if (!bkey_cached_lock_for_evict(ck)) { bc->skipped_lock_fail++; - } else { - bkey_cached_evict(bc, ck); + } else if (bkey_cached_evict(bc, ck)) { bkey_cached_free(bc, ck); - bc->moved_to_freelist++; + bc->freed++; freed++; + } else { + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); } scanned++; if (scanned >= nr) - break; + goto out; pos = next; } - bc->shrink_iter++; - if (bc->shrink_iter >= tbl->size) - bc->shrink_iter = 0; - } while (scanned < nr && bc->shrink_iter != start); + iter++; + if (iter >= tbl->size) + iter = 0; + } while (scanned < nr && iter != start); +out: + bc->shrink_iter = iter; rcu_read_unlock(); - memalloc_nofs_restore(flags); srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); - mutex_unlock(&bc->lock); return freed; } @@ -862,18 +692,13 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); struct bucket_table *tbl; - struct bkey_cached *ck, *n; + struct bkey_cached *ck; struct rhash_head *pos; LIST_HEAD(items); unsigned i; -#ifdef __KERNEL__ - int cpu; -#endif shrinker_free(bc->shrink); - mutex_lock(&bc->lock); - /* * The loop is needed to guard against racing with rehash: */ @@ -892,44 +717,14 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) for (i = 0; i < tbl->size; i++) while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) { ck = container_of(pos, struct bkey_cached, hash); - bkey_cached_evict(bc, ck); - list_add(&ck->list, &items); + BUG_ON(!bkey_cached_evict(bc, ck)); + kfree(ck->k); + kmem_cache_free(bch2_key_cache, ck); } } rcu_read_unlock(); } -#ifdef __KERNEL__ - if (bc->pcpu_freed) { - for_each_possible_cpu(cpu) { - struct btree_key_cache_freelist *f = - per_cpu_ptr(bc->pcpu_freed, cpu); - - for (i = 0; i < f->nr; i++) { - ck = f->objs[i]; - list_add(&ck->list, &items); - } - } - } -#endif - - BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu); - BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu); - - list_splice(&bc->freed_pcpu, &items); - list_splice(&bc->freed_nonpcpu, &items); - - mutex_unlock(&bc->lock); - - list_for_each_entry_safe(ck, n, &items, list) { - cond_resched(); - - list_del(&ck->list); - kfree(ck->k); - six_lock_exit(&ck->c.lock); - kmem_cache_free(bch2_key_cache, ck); - } - if (atomic_long_read(&bc->nr_dirty) && !bch2_journal_error(&c->journal) && test_bit(BCH_FS_was_rw, &c->flags)) @@ -943,14 +738,14 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) if (bc->table_init_done) rhashtable_destroy(&bc->table); - free_percpu(bc->pcpu_freed); + rcu_pending_exit(&bc->pending[0]); + rcu_pending_exit(&bc->pending[1]); + + free_percpu(bc->nr_pending); } void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) { - mutex_init(&c->lock); - INIT_LIST_HEAD(&c->freed_pcpu); - INIT_LIST_HEAD(&c->freed_nonpcpu); } int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) @@ -958,11 +753,13 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); struct shrinker *shrink; -#ifdef __KERNEL__ - bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); - if (!bc->pcpu_freed) + bc->nr_pending = alloc_percpu(size_t); + if (!bc->nr_pending) + return -BCH_ERR_ENOMEM_fs_btree_cache_init; + + if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) || + rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free)) return -BCH_ERR_ENOMEM_fs_btree_cache_init; -#endif if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) return -BCH_ERR_ENOMEM_fs_btree_cache_init; @@ -984,45 +781,21 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc) { - struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - printbuf_tabstop_push(out, 24); printbuf_tabstop_push(out, 12); - unsigned flags = memalloc_nofs_save(); - mutex_lock(&bc->lock); prt_printf(out, "keys:\t%lu\r\n", atomic_long_read(&bc->nr_keys)); prt_printf(out, "dirty:\t%lu\r\n", atomic_long_read(&bc->nr_dirty)); - prt_printf(out, "freelist:\t%lu\r\n", atomic_long_read(&bc->nr_freed)); - prt_printf(out, "nonpcpu freelist:\t%zu\r\n", bc->nr_freed_nonpcpu); - prt_printf(out, "pcpu freelist:\t%zu\r\n", bc->nr_freed_pcpu); - - prt_printf(out, "\nshrinker:\n"); + prt_printf(out, "table size:\t%u\r\n", bc->table.tbl->size); + prt_newline(out); + prt_printf(out, "shrinker:\n"); prt_printf(out, "requested_to_free:\t%lu\r\n", bc->requested_to_free); prt_printf(out, "freed:\t%lu\r\n", bc->freed); - prt_printf(out, "moved_to_freelist:\t%lu\r\n", bc->moved_to_freelist); prt_printf(out, "skipped_dirty:\t%lu\r\n", bc->skipped_dirty); prt_printf(out, "skipped_accessed:\t%lu\r\n", bc->skipped_accessed); prt_printf(out, "skipped_lock_fail:\t%lu\r\n", bc->skipped_lock_fail); - - prt_printf(out, "srcu seq:\t%lu\r\n", get_state_synchronize_srcu(&c->btree_trans_barrier)); - - struct bkey_cached *ck; - unsigned iter = 0; - list_for_each_entry(ck, &bc->freed_nonpcpu, list) { - prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq); - if (++iter > 10) - break; - } - - iter = 0; - list_for_each_entry(ck, &bc->freed_pcpu, list) { - prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq); - if (++iter > 10) - break; - } - mutex_unlock(&bc->lock); - memalloc_flags_restore(flags); + prt_newline(out); + prt_printf(out, "pending:\t%zu\r\n", per_cpu_sum(bc->nr_pending)); } void bch2_btree_key_cache_exit(void) diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h index 237e8bb3ac40..722f1ed10551 100644 --- a/fs/bcachefs/btree_key_cache_types.h +++ b/fs/bcachefs/btree_key_cache_types.h @@ -2,33 +2,25 @@ #ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H #define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H -struct btree_key_cache_freelist { - struct bkey_cached *objs[16]; - unsigned nr; -}; +#include "rcu_pending.h" struct btree_key_cache { - struct mutex lock; struct rhashtable table; bool table_init_done; - struct list_head freed_pcpu; - size_t nr_freed_pcpu; - struct list_head freed_nonpcpu; - size_t nr_freed_nonpcpu; - struct shrinker *shrink; unsigned shrink_iter; - struct btree_key_cache_freelist __percpu *pcpu_freed; - atomic_long_t nr_freed; + /* 0: non pcpu reader locks, 1: pcpu reader locks */ + struct rcu_pending pending[2]; + size_t __percpu *nr_pending; + atomic_long_t nr_keys; atomic_long_t nr_dirty; /* shrinker stats */ unsigned long requested_to_free; unsigned long freed; - unsigned long moved_to_freelist; unsigned long skipped_dirty; unsigned long skipped_accessed; unsigned long skipped_lock_fail; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 11a64ead8685..7c07f9fa9add 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -218,16 +218,17 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans, bool lock_may_not_fail, unsigned long ip) { - int ret; - trans->lock_may_not_fail = lock_may_not_fail; trans->lock_must_abort = false; trans->locking = b; - ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait, - bch2_six_check_for_deadlock, trans, ip); + int ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait, + bch2_six_check_for_deadlock, trans, ip); WRITE_ONCE(trans->locking, NULL); WRITE_ONCE(trans->locking_wait.start_time, 0); + + if (!ret) + trace_btree_path_lock(trans, _THIS_IP_, b); return ret; } @@ -281,6 +282,7 @@ static inline int btree_node_lock(struct btree_trans *trans, int ret = 0; EBUG_ON(level >= BTREE_MAX_DEPTH); + bch2_trans_verify_not_unlocked(trans); if (likely(six_trylock_type(&b->lock, type)) || btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || @@ -400,12 +402,13 @@ static inline int bch2_btree_path_upgrade(struct btree_trans *trans, /* misc: */ -static inline void btree_path_set_should_be_locked(struct btree_path *path) +static inline void btree_path_set_should_be_locked(struct btree_trans *trans, struct btree_path *path) { EBUG_ON(!btree_node_locked(path, path->level)); EBUG_ON(path->uptodate); path->should_be_locked = true; + trace_btree_path_should_be_locked(trans, path); } static inline void __btree_path_set_level_up(struct btree_trans *trans, diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index a0101d9c5d83..91884da4e30a 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -214,7 +214,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); overwrite: - bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); + bch2_bset_insert(b, k, insert, clobber_u64s); new_u64s = k->u64s; fix_iter: if (clobber_u64s != new_u64s) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index b256b2a20a4f..4568a41fefaf 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -138,6 +138,31 @@ struct btree { struct list_head list; }; +#define BCH_BTREE_CACHE_NOT_FREED_REASONS() \ + x(lock_intent) \ + x(lock_write) \ + x(dirty) \ + x(read_in_flight) \ + x(write_in_flight) \ + x(noevict) \ + x(write_blocked) \ + x(will_make_reachable) \ + x(access_bit) + +enum bch_btree_cache_not_freed_reasons { +#define x(n) BCH_BTREE_CACHE_NOT_FREED_##n, + BCH_BTREE_CACHE_NOT_FREED_REASONS() +#undef x + BCH_BTREE_CACHE_NOT_FREED_REASONS_NR, +}; + +struct btree_cache_list { + unsigned idx; + struct shrinker *shrink; + struct list_head list; + size_t nr; +}; + struct btree_cache { struct rhashtable table; bool table_init_done; @@ -155,28 +180,19 @@ struct btree_cache { * should never grow past ~2-3 nodes in practice. */ struct mutex lock; - struct list_head live; struct list_head freeable; struct list_head freed_pcpu; struct list_head freed_nonpcpu; + struct btree_cache_list live[2]; - /* Number of elements in live + freeable lists */ - unsigned used; - unsigned reserve; - unsigned freed; - unsigned not_freed_lock_intent; - unsigned not_freed_lock_write; - unsigned not_freed_dirty; - unsigned not_freed_read_in_flight; - unsigned not_freed_write_in_flight; - unsigned not_freed_noevict; - unsigned not_freed_write_blocked; - unsigned not_freed_will_make_reachable; - unsigned not_freed_access_bit; - atomic_t dirty; - struct shrinker *shrink; + size_t nr_freeable; + size_t nr_reserve; + size_t nr_by_btree[BTREE_ID_NR]; + atomic_long_t nr_dirty; - unsigned used_by_btree[BTREE_ID_NR]; + /* shrinker stats */ + size_t nr_freed; + u64 not_freed[BCH_BTREE_CACHE_NOT_FREED_REASONS_NR]; /* * If we need to allocate memory for a new btree node and that @@ -189,8 +205,8 @@ struct btree_cache { struct bbpos pinned_nodes_start; struct bbpos pinned_nodes_end; - u64 pinned_nodes_leaf_mask; - u64 pinned_nodes_interior_mask; + /* btree id mask: 0 for leaves, 1 for interior */ + u64 pinned_nodes_mask[2]; }; struct btree_node_iter { @@ -386,17 +402,16 @@ struct bkey_cached { struct btree_bkey_cached_common c; unsigned long flags; - unsigned long btree_trans_barrier_seq; u16 u64s; struct bkey_cached_key key; struct rhash_head hash; - struct list_head list; struct journal_entry_pin journal; u64 seq; struct bkey_i *k; + struct rcu_head rcu; }; static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b) @@ -583,7 +598,8 @@ enum btree_write_type { x(dying) \ x(fake) \ x(need_rewrite) \ - x(never_write) + x(never_write) \ + x(pinned) enum btree_flags { /* First bits for btree node write type */ diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index d6f6df10dcc3..514df618548e 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -374,7 +374,7 @@ static noinline int flush_new_cached_update(struct btree_trans *trans, i->key_cache_already_flushed = true; i->flags |= BTREE_TRIGGER_norun; - btree_path_set_should_be_locked(btree_path); + btree_path_set_should_be_locked(trans, btree_path); ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip); out: bch2_path_put(trans, path_idx, true); @@ -422,7 +422,9 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, break; } - if (!cmp && i < trans->updates + trans->nr_updates) { + bool overwrite = !cmp && i < trans->updates + trans->nr_updates; + + if (overwrite) { EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); bch2_path_put(trans, i->path, true); @@ -449,7 +451,9 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, } } - __btree_path_get(trans->paths + i->path, true); + __btree_path_get(trans, trans->paths + i->path, true); + + trace_update_by_path(trans, path, i, overwrite); /* * If a key is present in the key cache, it must also exist in the @@ -498,7 +502,7 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); } - btree_path_set_should_be_locked(trans->paths + iter->key_cache_path); + btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); } return 0; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 8fd112026e7a..190bc1e81756 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -16,6 +16,7 @@ #include "clock.h" #include "error.h" #include "extents.h" +#include "io_write.h" #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" @@ -145,7 +146,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) printbuf_exit(&buf); return ret; topology_repair: - if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) && + if ((c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) { bch2_inconsistent_error(c); ret = -BCH_ERR_btree_need_topology_repair; @@ -250,8 +251,13 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, unsigned i, level = b->c.level; bch2_btree_node_lock_write_nofail(trans, path, &b->c); + + mutex_lock(&c->btree_cache.lock); bch2_btree_node_hash_remove(&c->btree_cache, b); + mutex_unlock(&c->btree_cache.lock); + __btree_node_free(trans, b); + six_unlock_write(&b->c.lock); mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); @@ -283,7 +289,6 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, clear_btree_node_need_write(b); mutex_lock(&c->btree_cache.lock); - list_del_init(&b->list); bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_unlock(&c->btree_cache.lock); @@ -731,6 +736,18 @@ static void btree_update_nodes_written(struct btree_update *as) bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, "%s", bch2_err_str(ret)); err: + /* + * Ensure transaction is unlocked before using btree_node_lock_nopath() + * (the use of which is always suspect, we need to work on removing this + * in the future) + * + * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get() + * calls bch2_path_upgrade(), before we call path_make_mut(), so we may + * rarely end up with a locked path besides the one we have here: + */ + bch2_trans_unlock(trans); + bch2_trans_begin(trans); + /* * We have to be careful because another thread might be getting ready * to free as->b and calling btree_update_reparent() on us - we'll @@ -750,18 +767,6 @@ static void btree_update_nodes_written(struct btree_update *as) * we're in journal error state: */ - /* - * Ensure transaction is unlocked before using - * btree_node_lock_nopath() (the use of which is always suspect, - * we need to work on removing this in the future) - * - * It should be, but bch2_path_get_unlocked_mut() -> bch2_path_get() - * calls bch2_path_upgrade(), before we call path_make_mut(), so - * we may rarely end up with a locked path besides the one we - * have here: - */ - bch2_trans_unlock(trans); - bch2_trans_begin(trans); btree_path_idx_t path_idx = bch2_path_get_unlocked_mut(trans, as->btree_id, b->c.level, b->key.k.p); struct btree_path *path = trans->paths + path_idx; @@ -1899,7 +1904,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans * six_unlock_intent(&n->c.lock); mutex_lock(&c->btree_cache.lock); - list_add_tail(&b->list, &c->btree_cache.live); + list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list); mutex_unlock(&c->btree_cache.lock); bch2_trans_verify_locks(trans); @@ -1981,7 +1986,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, if (ret) goto err; - btree_path_set_should_be_locked(trans->paths + sib_path); + btree_path_set_should_be_locked(trans, trans->paths + sib_path); m = trans->paths[sib_path].l[level].b; diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 02c6ecada97c..10f400957f21 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -159,6 +159,8 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, unsigned level, unsigned flags) { + bch2_trans_verify_not_unlocked(trans); + return bch2_foreground_maybe_merge_sibling(trans, path, level, flags, btree_prev_sib) ?: bch2_foreground_maybe_merge_sibling(trans, path, level, flags, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 721bbe1dffc1..546cd01a72e3 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -75,6 +75,15 @@ void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev *ca, struct bch_dev_usage *usage) { + if (out->nr_tabstops < 5) { + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 12); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + } + prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n"); for (unsigned i = 0; i < BCH_DATA_NR; i++) { @@ -272,7 +281,7 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, goto err; rcu_read_lock(); - bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_rcu(c, ptr->dev)); + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev)); rcu_read_unlock(); if (level) { @@ -477,7 +486,7 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, return ret; err: bch2_dump_trans_updates(trans); - ret = -EIO; + ret = -BCH_ERR_bucket_ref_update; goto out; } @@ -556,22 +565,24 @@ static int bch2_trigger_pointer(struct btree_trans *trans, s64 *sectors, enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; bool insert = !(flags & BTREE_TRIGGER_overwrite); struct printbuf buf = PRINTBUF; int ret = 0; - struct bch_fs *c = trans->c; + u64 abs_sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p); + *sectors = insert ? abs_sectors : -abs_sectors; + struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); if (unlikely(!ca)) { if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID) - ret = -EIO; + ret = -BCH_ERR_trigger_pointer; goto err; } struct bpos bucket; struct bch_backpointer bp; - bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp); - *sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len); + __bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp, abs_sectors); if (flags & BTREE_TRIGGER_transactional) { struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); @@ -593,7 +604,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", p.ptr.dev, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_trigger_pointer; goto err_unlock; } @@ -638,7 +649,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, bch2_trans_inconsistent(trans, "stripe pointer doesn't match stripe %llu", (u64) p.ec.idx); - ret = -EIO; + ret = -BCH_ERR_trigger_stripe_pointer; goto err; } @@ -677,7 +688,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, (u64) p.ec.idx, buf.buf); printbuf_exit(&buf); bch2_inconsistent_error(c); - return -EIO; + return -BCH_ERR_trigger_stripe_pointer; } m->block_sectors[p.ec.block] += sectors; @@ -741,7 +752,7 @@ static int __trigger_extent(struct btree_trans *trans, return ret; } else if (!p.has_ec) { *replicas_sectors += disk_sectors; - acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev; + replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev); } else { ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); if (ret) @@ -957,7 +968,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, bch2_data_type_str(a->v.data_type), bch2_data_type_str(type), bch2_data_type_str(type)); - ret = -EIO; + ret = -BCH_ERR_metadata_bucket_inconsistency; goto err; } @@ -1013,7 +1024,7 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * bucket_unlock(g); err_unlock: percpu_up_read(&c->mark_lock); - return -EIO; + return -BCH_ERR_metadata_bucket_inconsistency; } int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index edbdffd508fc..e2cb7b24b220 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -80,22 +80,9 @@ static inline void bucket_lock(struct bucket *b) TASK_UNINTERRUPTIBLE); } -static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) -{ - return rcu_dereference_check(ca->buckets_gc, - !ca->fs || - percpu_rwsem_is_held(&ca->fs->mark_lock) || - lockdep_is_held(&ca->fs->state_lock) || - lockdep_is_held(&ca->bucket_lock)); -} - static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) { - struct bucket_array *buckets = gc_bucket_array(ca); - - if (b - buckets->first_bucket >= buckets->nbuckets_minus_first) - return NULL; - return buckets->b + b; + return genradix_ptr(&ca->buckets_gc, b); } static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index c9698cdf866f..28bd09a253c8 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -19,14 +19,6 @@ struct bucket { u32 stripe_sectors; } __aligned(sizeof(long)); -struct bucket_array { - struct rcu_head rcu; - u16 first_bucket; - size_t nbuckets; - size_t nbuckets_minus_first; - struct bucket b[]; -}; - struct bucket_gens { struct rcu_head rcu; u16 first_bucket; diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index e7208bf1974e..ce8fc677bef9 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -100,13 +100,12 @@ static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm, struct scatterlist *sg, size_t len) { SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - int ret; skcipher_request_set_sync_tfm(req, tfm); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, sg, sg, len, nonce.d); - ret = crypto_skcipher_encrypt(req); + int ret = crypto_skcipher_encrypt(req); if (ret) pr_err("got error %i from crypto_skcipher_encrypt()", ret); @@ -118,38 +117,47 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm, void *buf, size_t len) { if (!is_vmalloc_addr(buf)) { - struct scatterlist sg; + struct scatterlist sg = {}; - sg_init_table(&sg, 1); - sg_set_page(&sg, - is_vmalloc_addr(buf) - ? vmalloc_to_page(buf) - : virt_to_page(buf), - len, offset_in_page(buf)); + sg_mark_end(&sg); + sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf)); return do_encrypt_sg(tfm, nonce, &sg, len); } else { - unsigned pages = buf_pages(buf, len); - struct scatterlist *sg; - size_t orig_len = len; - int ret, i; + DARRAY_PREALLOCATED(struct scatterlist, 4) sgl; + size_t sgl_len = 0; + int ret; - sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL); - if (!sg) - return -BCH_ERR_ENOMEM_do_encrypt; + darray_init(&sgl); - sg_init_table(sg, pages); - - for (i = 0; i < pages; i++) { + while (len) { unsigned offset = offset_in_page(buf); - unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset); + struct scatterlist sg = { + .page_link = (unsigned long) vmalloc_to_page(buf), + .offset = offset, + .length = min(len, PAGE_SIZE - offset), + }; - sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset); - buf += pg_len; - len -= pg_len; + if (darray_push(&sgl, sg)) { + sg_mark_end(&darray_last(sgl)); + ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len); + if (ret) + goto err; + + nonce = nonce_add(nonce, sgl_len); + sgl_len = 0; + sgl.nr = 0; + BUG_ON(darray_push(&sgl, sg)); + } + + buf += sg.length; + len -= sg.length; + sgl_len += sg.length; } - ret = do_encrypt_sg(tfm, nonce, sg, orig_len); - kfree(sg); + sg_mark_end(&darray_last(sgl)); + ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len); +err: + darray_exit(&sgl); return ret; } } @@ -325,39 +333,42 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, { struct bio_vec bv; struct bvec_iter iter; - struct scatterlist sgl[16], *sg = sgl; - size_t bytes = 0; + DARRAY_PREALLOCATED(struct scatterlist, 4) sgl; + size_t sgl_len = 0; int ret = 0; if (!bch2_csum_type_is_encryption(type)) return 0; - sg_init_table(sgl, ARRAY_SIZE(sgl)); + darray_init(&sgl); bio_for_each_segment(bv, bio, iter) { - if (sg == sgl + ARRAY_SIZE(sgl)) { - sg_mark_end(sg - 1); + struct scatterlist sg = { + .page_link = (unsigned long) bv.bv_page, + .offset = bv.bv_offset, + .length = bv.bv_len, + }; - ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + if (darray_push(&sgl, sg)) { + sg_mark_end(&darray_last(sgl)); + ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len); if (ret) - return ret; + goto err; - nonce = nonce_add(nonce, bytes); - bytes = 0; + nonce = nonce_add(nonce, sgl_len); + sgl_len = 0; + sgl.nr = 0; - sg_init_table(sgl, ARRAY_SIZE(sgl)); - sg = sgl; + BUG_ON(darray_push(&sgl, sg)); } - sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); - bytes += bv.bv_len; - } - - if (sg != sgl) { - sg_mark_end(sg - 1); - return do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + sgl_len += sg.length; } + sg_mark_end(&darray_last(sgl)); + ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len); +err: + darray_exit(&sgl); return ret; } diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h index 85c975dfbcfe..82c79c8baf92 100644 --- a/fs/bcachefs/clock.h +++ b/fs/bcachefs/clock.h @@ -20,15 +20,6 @@ static inline void bch2_increment_clock(struct bch_fs *c, u64 sectors, void bch2_io_clock_schedule_timeout(struct io_clock *, u64); -#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ -({ \ - long __ret = timeout; \ - might_sleep(); \ - if (!___wait_cond_timeout(condition)) \ - __ret = __wait_event_timeout(wq, condition, timeout); \ - __ret; \ -}) - void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); void bch2_io_clock_exit(struct io_clock *); diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c index b7d223f85873..4f06cd8bbbe1 100644 --- a/fs/bcachefs/darray.c +++ b/fs/bcachefs/darray.c @@ -4,12 +4,12 @@ #include #include "darray.h" -int __bch2_darray_resize(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) +int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp) { if (new_size > d->size) { new_size = roundup_pow_of_two(new_size); - void *data = kvmalloc_array(new_size, element_size, gfp); + void *data = kvmalloc_array_noprof(new_size, element_size, gfp); if (!data) return -ENOMEM; diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index 4b340d13caac..8f4c3f0665c4 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -22,29 +22,23 @@ struct { \ typedef DARRAY(char) darray_char; typedef DARRAY(char *) darray_str; -int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t); +int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); -static inline int __darray_resize(darray_char *d, size_t element_size, - size_t new_size, gfp_t gfp) -{ - return unlikely(new_size > d->size) - ? __bch2_darray_resize(d, element_size, new_size, gfp) - : 0; -} +#define __bch2_darray_resize(...) alloc_hooks(__bch2_darray_resize_noprof(__VA_ARGS__)) + +#define __darray_resize(_d, _element_size, _new_size, _gfp) \ + (unlikely((_new_size) > (_d)->size) \ + ? __bch2_darray_resize((_d), (_element_size), (_new_size), (_gfp))\ + : 0) #define darray_resize_gfp(_d, _new_size, _gfp) \ - unlikely(__darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)) + __darray_resize((darray_char *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp) #define darray_resize(_d, _new_size) \ darray_resize_gfp(_d, _new_size, GFP_KERNEL) -static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more, gfp_t gfp) -{ - return __darray_resize(d, t_size, d->nr + more, gfp); -} - #define darray_make_room_gfp(_d, _more, _gfp) \ - __darray_make_room((darray_char *) (_d), sizeof((_d)->data[0]), (_more), _gfp) + darray_resize_gfp((_d), (_d)->nr + (_more), _gfp) #define darray_make_room(_d, _more) \ darray_make_room_gfp(_d, _more, GFP_KERNEL) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 004894ad4147..757b9884ef55 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -571,7 +571,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, while (data_opts.kill_ptrs) { unsigned i = 0, drop = __fls(data_opts.kill_ptrs); - bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop); + bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop); data_opts.kill_ptrs ^= 1U << drop; } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 32bfdf19289a..84dd4a879d98 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -552,62 +552,30 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - subvol_inum target; - u32 snapshot; struct bkey_buf sk; - int ret; - bch2_bkey_buf_init(&sk); -retry: - bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_dirents, + POS(inum.inum, ctx->pos), + POS(inum.inum, U64_MAX), + inum.subvol, 0, k, ({ + if (k.k->type != KEY_TYPE_dirent) + continue; - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, - SPOS(inum.inum, ctx->pos, snapshot), - POS(inum.inum, U64_MAX), 0, k, ret) { - if (k.k->type != KEY_TYPE_dirent) - continue; + /* dir_emit() can fault and block: */ + bch2_bkey_buf_reassemble(&sk, c, k); + struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k); - /* dir_emit() can fault and block: */ - bch2_bkey_buf_reassemble(&sk, c, k); - struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k); + subvol_inum target; + int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target); + if (ret2 > 0) + continue; - ret = bch2_dirent_read_target(trans, inum, dirent, &target); - if (ret < 0) - break; - if (ret) - continue; + ret2 ?: drop_locks_do(trans, bch2_dir_emit(ctx, dirent, target)); + }))); - /* - * read_target looks up subvolumes, we can overflow paths if the - * directory has many subvolumes in it - * - * XXX: btree_trans_too_many_iters() is something we'd like to - * get rid of, and there's no good reason to be using it here - * except that we don't yet have a for_each_btree_key() helper - * that does subvolume_get_snapshot(). - */ - ret = drop_locks_do(trans, - bch2_dir_emit(ctx, dirent, target)) ?: - btree_trans_too_many_iters(trans); - if (ret) { - ret = ret < 0 ? ret : 0; - break; - } - } - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); bch2_bkey_buf_exit(&sk, c); - return ret; + return ret < 0 ? ret : 0; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 141a4c63142f..1587c6e1866a 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -18,6 +18,7 @@ #include "ec.h" #include "error.h" #include "io_read.h" +#include "io_write.h" #include "keylist.h" #include "recovery.h" #include "replicas.h" @@ -146,12 +147,18 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, bch2_prt_csum_type(out, s.csum_type); prt_printf(out, " gran %u", 1U << s.csum_granularity_bits); + if (s.disk_label) { + prt_str(out, " label"); + bch2_disk_path_to_text(out, c, s.disk_label - 1); + } + for (unsigned i = 0; i < s.nr_blocks; i++) { const struct bch_extent_ptr *ptr = sp->ptrs + i; if ((void *) ptr >= bkey_val_end(k)) break; + prt_char(out, ' '); bch2_extent_ptr_to_text(out, c, ptr); if (s.csum_type < BCH_CSUM_NR && @@ -192,7 +199,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, a->dirty_sectors, a->stripe, s.k->p.offset, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_mark_stripe; goto err; } @@ -203,7 +210,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, a->dirty_sectors, a->cached_sectors, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_mark_stripe; goto err; } } else { @@ -213,7 +220,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, bucket.inode, bucket.offset, a->gen, a->stripe, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_mark_stripe; goto err; } @@ -223,7 +230,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, bch2_data_type_str(a->data_type), bch2_data_type_str(data_type), (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_mark_stripe; goto err; } @@ -235,7 +242,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, a->dirty_sectors, a->cached_sectors, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_mark_stripe; goto err; } } @@ -273,8 +280,8 @@ static int mark_stripe_bucket(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); if (unlikely(!ca)) { - if (!(flags & BTREE_TRIGGER_overwrite)) - ret = -EIO; + if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite)) + ret = -BCH_ERR_mark_stripe; goto err; } @@ -293,7 +300,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", ptr->dev, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_mark_stripe; goto err_unlock; } @@ -351,6 +358,19 @@ static int mark_stripe_buckets(struct btree_trans *trans, return 0; } +static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s) +{ + m->sectors = le16_to_cpu(s->sectors); + m->algorithm = s->algorithm; + m->nr_blocks = s->nr_blocks; + m->nr_redundant = s->nr_redundant; + m->disk_label = s->disk_label; + m->blocks_nonempty = 0; + + for (unsigned i = 0; i < s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(s, i); +} + int bch2_trigger_stripe(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s _new, @@ -467,14 +487,7 @@ int bch2_trigger_stripe(struct btree_trans *trans, memset(m, 0, sizeof(*m)); } else { - m->sectors = le16_to_cpu(new_s->sectors); - m->algorithm = new_s->algorithm; - m->nr_blocks = new_s->nr_blocks; - m->nr_redundant = new_s->nr_redundant; - m->blocks_nonempty = 0; - - for (unsigned i = 0; i < new_s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); + stripe_to_mem(m, new_s); if (!old_s) bch2_stripes_heap_insert(c, m, idx); @@ -816,13 +829,16 @@ static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, } /* recovery read path: */ -int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) +int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, + struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; - struct ec_stripe_buf *buf; + struct ec_stripe_buf *buf = NULL; struct closure cl; struct bch_stripe *v; unsigned i, offset; + const char *msg = NULL; + struct printbuf msgbuf = PRINTBUF; int ret = 0; closure_init_stack(&cl); @@ -835,32 +851,28 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf)); if (ret) { - bch_err_ratelimited(c, - "error doing reconstruct read: error %i looking up stripe", ret); - kfree(buf); - return -EIO; + msg = "stripe not found"; + goto err; } v = &bkey_i_to_stripe(&buf->key)->v; if (!bch2_ptr_matches_stripe(v, rbio->pick)) { - bch_err_ratelimited(c, - "error doing reconstruct read: pointer doesn't match stripe"); - ret = -EIO; + msg = "pointer doesn't match stripe"; goto err; } offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset; if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) { - bch_err_ratelimited(c, - "error doing reconstruct read: read is bigger than stripe"); - ret = -EIO; + msg = "read is bigger than stripe"; goto err; } ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); - if (ret) + if (ret) { + msg = "-ENOMEM"; goto err; + } for (i = 0; i < v->nr_blocks; i++) ec_block_io(c, buf, REQ_OP_READ, i, &cl); @@ -868,9 +880,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) closure_sync(&cl); if (ec_nr_failed(buf) > v->nr_redundant) { - bch_err_ratelimited(c, - "error doing reconstruct read: unable to read enough blocks"); - ret = -EIO; + msg = "unable to read enough blocks"; goto err; } @@ -882,10 +892,17 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio) memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9)); -err: +out: ec_stripe_buf_exit(buf); kfree(buf); return ret; +err: + bch2_bkey_val_to_text(&msgbuf, c, orig_k); + bch_err_ratelimited(c, + "error doing reconstruct read: %s\n %s", msg, msgbuf.buf); + printbuf_exit(&msgbuf);; + ret = -BCH_ERR_stripe_reconstruct; + goto out; } /* stripe bucket accounting: */ @@ -1305,7 +1322,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, bkey_reassemble(n, k); - bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); + bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, ptr->dev != dev); ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev); BUG_ON(!ec_ptr); @@ -1555,10 +1572,12 @@ void bch2_ec_do_stripe_creates(struct bch_fs *c) bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); } -static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) +static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h) { struct ec_stripe_new *s = h->s; + lockdep_assert_held(&h->lock); + BUG_ON(!s->allocated && !s->err); h->s = NULL; @@ -1571,6 +1590,12 @@ static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) ec_stripe_new_put(c, s, STRIPE_REF_io); } +static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int err) +{ + h->s->err = err; + ec_stripe_new_set_pending(c, h); +} + void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) { struct ec_stripe_new *s = ob->ec; @@ -1641,7 +1666,8 @@ static void ec_stripe_key_init(struct bch_fs *c, struct bkey_i *k, unsigned nr_data, unsigned nr_parity, - unsigned stripe_size) + unsigned stripe_size, + unsigned disk_label) { struct bkey_i_stripe *s = bkey_stripe_init(k); unsigned u64s; @@ -1652,7 +1678,7 @@ static void ec_stripe_key_init(struct bch_fs *c, s->v.nr_redundant = nr_parity; s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); s->v.csum_type = BCH_CSUM_crc32c; - s->v.pad = 0; + s->v.disk_label = disk_label; while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { BUG_ON(1 << s->v.csum_granularity_bits >= @@ -1685,14 +1711,70 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) s->nr_parity = h->redundancy; ec_stripe_key_init(c, &s->new_stripe.key, - s->nr_data, s->nr_parity, h->blocksize); + s->nr_data, s->nr_parity, + h->blocksize, h->disk_label); h->s = s; + h->nr_created++; return 0; } +static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h) +{ + struct bch_devs_mask devs = h->devs; + + rcu_read_lock(); + h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label + ? group_to_target(h->disk_label - 1) + : 0); + unsigned nr_devs = dev_mask_nr(&h->devs); + + for_each_member_device_rcu(c, ca, &h->devs) + if (!ca->mi.durability) + __clear_bit(ca->dev_idx, h->devs.d); + unsigned nr_devs_with_durability = dev_mask_nr(&h->devs); + + h->blocksize = pick_blocksize(c, &h->devs); + + h->nr_active_devs = 0; + for_each_member_device_rcu(c, ca, &h->devs) + if (ca->mi.bucket_size == h->blocksize) + h->nr_active_devs++; + + rcu_read_unlock(); + + /* + * If we only have redundancy + 1 devices, we're better off with just + * replication: + */ + h->insufficient_devs = h->nr_active_devs < h->redundancy + 2; + + if (h->insufficient_devs) { + const char *err; + + if (nr_devs < h->redundancy + 2) + err = NULL; + else if (nr_devs_with_durability < h->redundancy + 2) + err = "cannot use durability=0 devices"; + else + err = "mismatched bucket sizes"; + + if (err) + bch_err(c, "insufficient devices available to create stripe (have %u, need %u): %s", + h->nr_active_devs, h->redundancy + 2, err); + } + + struct bch_devs_mask devs_leaving; + bitmap_andnot(devs_leaving.d, devs.d, h->devs.d, BCH_SB_MEMBERS_MAX); + + if (h->s && !h->s->allocated && dev_mask_nr(&devs_leaving)) + ec_stripe_new_cancel(c, h, -EINTR); + + h->rw_devs_change_count = c->rw_devs_change_count; +} + static struct ec_stripe_head * -ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, +ec_new_stripe_head_alloc(struct bch_fs *c, unsigned disk_label, unsigned algo, unsigned redundancy, enum bch_watermark watermark) { @@ -1705,34 +1787,11 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, mutex_init(&h->lock); BUG_ON(!mutex_trylock(&h->lock)); - h->target = target; + h->disk_label = disk_label; h->algo = algo; h->redundancy = redundancy; h->watermark = watermark; - rcu_read_lock(); - h->devs = target_rw_devs(c, BCH_DATA_user, target); - - for_each_member_device_rcu(c, ca, &h->devs) - if (!ca->mi.durability) - __clear_bit(ca->dev_idx, h->devs.d); - - h->blocksize = pick_blocksize(c, &h->devs); - - for_each_member_device_rcu(c, ca, &h->devs) - if (ca->mi.bucket_size == h->blocksize) - h->nr_active_devs++; - - rcu_read_unlock(); - - /* - * If we only have redundancy + 1 devices, we're better off with just - * replication: - */ - if (h->nr_active_devs < h->redundancy + 2) - bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?", - h->nr_active_devs, h->redundancy + 2); - list_add(&h->list, &c->ec_stripe_head_list); return h; } @@ -1743,14 +1802,14 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) h->s->allocated && bitmap_weight(h->s->blocks_allocated, h->s->nr_data) == h->s->nr_data) - ec_stripe_set_pending(c, h); + ec_stripe_new_set_pending(c, h); mutex_unlock(&h->lock); } static struct ec_stripe_head * __bch2_ec_stripe_head_get(struct btree_trans *trans, - unsigned target, + unsigned disk_label, unsigned algo, unsigned redundancy, enum bch_watermark watermark) @@ -1768,27 +1827,32 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, if (test_bit(BCH_FS_going_ro, &c->flags)) { h = ERR_PTR(-BCH_ERR_erofs_no_writes); - goto found; + goto err; } list_for_each_entry(h, &c->ec_stripe_head_list, list) - if (h->target == target && + if (h->disk_label == disk_label && h->algo == algo && h->redundancy == redundancy && h->watermark == watermark) { ret = bch2_trans_mutex_lock(trans, &h->lock); - if (ret) + if (ret) { h = ERR_PTR(ret); + goto err; + } goto found; } - h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark); + h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark); found: - if (!IS_ERR_OR_NULL(h) && - h->nr_active_devs < h->redundancy + 2) { + if (h->rw_devs_change_count != c->rw_devs_change_count) + ec_stripe_head_devs_update(c, h); + + if (h->insufficient_devs) { mutex_unlock(&h->lock); h = NULL; } +err: mutex_unlock(&c->ec_stripe_head_lock); return h; } @@ -1878,7 +1942,6 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ return 0; } -/* XXX: doesn't obey target: */ static s64 get_existing_stripe(struct bch_fs *c, struct ec_stripe_head *head) { @@ -1901,7 +1964,8 @@ static s64 get_existing_stripe(struct bch_fs *c, m = genradix_ptr(&c->stripes, stripe_idx); - if (m->algorithm == head->algo && + if (m->disk_label == head->disk_label && + m->algorithm == head->algo && m->nr_redundant == head->redundancy && m->sectors == head->blocksize && m->blocks_nonempty < m->nr_blocks - m->nr_redundant && @@ -2046,9 +2110,19 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, struct bch_fs *c = trans->c; struct ec_stripe_head *h; bool waiting = false; + unsigned disk_label = 0; + struct target t = target_decode(target); int ret; - h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark); + if (t.type == TARGET_GROUP) { + if (t.group > U8_MAX) { + bch_err(c, "cannot create a stripe when disk_label > U8_MAX"); + return NULL; + } + disk_label = t.group + 1; /* 0 == no label */ + } + + h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark); if (IS_ERR_OR_NULL(h)) return h; @@ -2126,6 +2200,73 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, return ERR_PTR(ret); } +/* device removal */ + +static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a) +{ + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); + + if (!a->stripe) + return 0; + + if (a->stripe_sectors) { + bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data"); + return -BCH_ERR_invalidate_stripe_to_dev; + } + + struct btree_iter iter; + struct bkey_i_stripe *s = + bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), + BTREE_ITER_slots, stripe); + int ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_replicas, + }; + + s64 sectors = 0; + for (unsigned i = 0; i < s->v.nr_blocks; i++) + sectors -= stripe_blockcount_get(&s->v, i); + + bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); + acc.replicas.data_type = BCH_DATA_user; + ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); + if (ret) + goto err; + + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i)); + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == k_a.k->p.inode) + ptr->dev = BCH_SB_MEMBER_INVALID; + + sectors = -sectors; + + bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); + acc.replicas.data_type = BCH_DATA_user; + ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx) +{ + return bch2_trans_run(c, + for_each_btree_key_upto_commit(trans, iter, + BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), + BTREE_ITER_intent, k, + NULL, NULL, 0, ({ + bch2_invalidate_stripe_to_dev(trans, k); + }))); +} + +/* startup/shutdown */ + static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) { struct ec_stripe_head *h; @@ -2151,8 +2292,7 @@ static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) } goto unlock; found: - h->s->err = -BCH_ERR_erofs_no_writes; - ec_stripe_set_pending(c, h); + ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes); unlock: mutex_unlock(&h->lock); } @@ -2197,17 +2337,9 @@ int bch2_stripes_read(struct bch_fs *c) if (ret) break; - const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset); - m->sectors = le16_to_cpu(s->sectors); - m->algorithm = s->algorithm; - m->nr_blocks = s->nr_blocks; - m->nr_redundant = s->nr_redundant; - m->blocks_nonempty = 0; - for (unsigned i = 0; i < s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(s, i); + stripe_to_mem(m, bkey_s_c_to_stripe(k).v); bch2_stripes_heap_insert(c, m, k.k->p.offset); 0; @@ -2252,6 +2384,8 @@ static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c, for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) prt_printf(out, " %u", s->blocks[i]); prt_newline(out); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&s->new_stripe.key)); + prt_newline(out); } void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) @@ -2261,9 +2395,10 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->ec_stripe_head_lock); list_for_each_entry(h, &c->ec_stripe_head_list, list) { - prt_printf(out, "target %u algo %u redundancy %u %s:\n", - h->target, h->algo, h->redundancy, - bch2_watermarks[h->watermark]); + prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n", + h->disk_label, h->algo, h->redundancy, + bch2_watermarks[h->watermark], + h->nr_created); if (h->s) bch2_new_stripe_to_text(out, c, h->s); diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 9baf3411a8f9..43326370b410 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -188,10 +188,15 @@ struct ec_stripe_head { struct list_head list; struct mutex lock; - unsigned target; + unsigned disk_label; unsigned algo; unsigned redundancy; enum bch_watermark watermark; + bool insufficient_devs; + + unsigned long rw_devs_change_count; + + u64 nr_created; struct bch_devs_mask devs; unsigned nr_active_devs; @@ -204,7 +209,7 @@ struct ec_stripe_head { struct ec_stripe_new *s; }; -int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *); +int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c); void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); @@ -249,6 +254,8 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, } } +int bch2_dev_remove_stripes(struct bch_fs *, unsigned); + void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_fs_ec_stop(struct bch_fs *); void bch2_fs_ec_flush(struct bch_fs *); diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h index 44ce88ba08d7..64ef52e00078 100644 --- a/fs/bcachefs/ec_format.h +++ b/fs/bcachefs/ec_format.h @@ -11,7 +11,14 @@ struct bch_stripe { __u8 csum_granularity_bits; __u8 csum_type; - __u8 pad; + + /* + * XXX: targets should be 16 bits - fix this if we ever do a stripe_v2 + * + * we can manage with this because this only needs to point to a + * disk label, not a target: + */ + __u8 disk_label; struct bch_extent_ptr ptrs[]; } __packed __aligned(8); diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index 1df03dccfc72..8d1e70e830ac 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -16,6 +16,7 @@ struct stripe { u8 nr_blocks; u8 nr_redundant; u8 blocks_nonempty; + u8 disk_label; }; struct gc_stripe { diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 742dcdd3e5d7..60b7875adada 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -119,8 +119,8 @@ x(EEXIST, EEXIST_str_hash_set) \ x(EEXIST, EEXIST_discard_in_flight_add) \ x(EEXIST, EEXIST_subvolume_create) \ - x(0, open_buckets_empty) \ - x(0, freelist_empty) \ + x(ENOSPC, open_buckets_empty) \ + x(ENOSPC, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ x(0, transaction_restart) \ x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \ @@ -244,6 +244,16 @@ x(EIO, btree_node_read_error) \ x(EIO, btree_node_read_validate_error) \ x(EIO, btree_need_topology_repair) \ + x(EIO, bucket_ref_update) \ + x(EIO, trigger_pointer) \ + x(EIO, trigger_stripe_pointer) \ + x(EIO, metadata_bucket_inconsistency) \ + x(EIO, mark_stripe) \ + x(EIO, stripe_reconstruct) \ + x(EIO, key_type_error) \ + x(EIO, no_device_to_read_from) \ + x(EIO, missing_indirect_extent) \ + x(EIO, invalidate_stripe_to_dev) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 324303bf4353..cc0d22085aef 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -115,7 +115,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, int ret = 0; if (k.k->type == KEY_TYPE_error) - return -EIO; + return -BCH_ERR_key_type_error; rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { @@ -133,7 +133,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, * read: */ if (!ret && !p.ptr.cached) - ret = -EIO; + ret = -BCH_ERR_no_device_to_read_from; struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); @@ -146,16 +146,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, ? f->idx : f->idx + 1; - if (!p.idx && !ca) + if (!p.idx && (!ca || !bch2_dev_is_readable(ca))) p.idx++; if (!p.idx && p.has_ec && bch2_force_reconstruct_read) p.idx++; - if (!p.idx && !bch2_dev_is_readable(ca)) - p.idx++; - - if (p.idx >= (unsigned) p.has_ec + 1) + if (p.idx > (unsigned) p.has_ec) continue; if (ret > 0 && !ptr_better(c, p, *pick)) @@ -821,6 +818,18 @@ void bch2_bkey_drop_ptr_noerror(struct bkey_s k, struct bch_extent_ptr *ptr) void bch2_bkey_drop_ptr(struct bkey_s k, struct bch_extent_ptr *ptr) { + if (k.k->type != KEY_TYPE_stripe) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k.s_c); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev == ptr->dev && p.has_ec) { + ptr->dev = BCH_SB_MEMBER_INVALID; + return; + } + } + bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; bch2_bkey_drop_ptr_noerror(k, ptr); @@ -848,10 +857,7 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) { - struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev); - - if (ptr) - bch2_bkey_drop_ptr_noerror(k, ptr); + bch2_bkey_drop_ptrs_noerror(k, ptr, ptr->dev == dev); } const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev) @@ -1021,7 +1027,7 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc { out->atomic++; rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); if (!ca) { prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, (u64) ptr->offset, ptr->gen, @@ -1125,8 +1131,9 @@ static int extent_ptr_validate(struct bch_fs *c, { int ret = 0; + /* bad pointers are repaired by check_fix_ptrs(): */ rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); if (!ca) { rcu_read_unlock(); return 0; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 42a7c6d820a0..ed5001dd662e 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -357,7 +357,7 @@ out: \ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ _ptr, _entry) -#define bkey_crc_next(_k, _start, _end, _crc, _iter) \ +#define bkey_crc_next(_k, _end, _crc, _iter) \ ({ \ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ if (extent_entry_is_crc(_iter)) { \ @@ -372,7 +372,7 @@ out: \ #define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ (_iter) = (_start); \ - bkey_crc_next(_k, _start, _end, _crc, _iter); \ + bkey_crc_next(_k, _end, _crc, _iter); \ (_iter) = extent_entry_next(_iter)) #define bkey_for_each_crc(_k, _p, _crc, _iter) \ @@ -611,9 +611,6 @@ unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_d unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); -void bch2_bkey_drop_device(struct bkey_s, unsigned); -void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); - const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned); static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev) @@ -652,6 +649,23 @@ void bch2_extent_ptr_decoded_append(struct bkey_i *, void bch2_bkey_drop_ptr_noerror(struct bkey_s, struct bch_extent_ptr *); void bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); +void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); +void bch2_bkey_drop_device(struct bkey_s, unsigned); + +#define bch2_bkey_drop_ptrs_noerror(_k, _ptr, _cond) \ +do { \ + __label__ _again; \ + struct bkey_ptrs _ptrs; \ +_again: \ + _ptrs = bch2_bkey_ptrs(_k); \ + \ + bkey_for_each_ptr(_ptrs, _ptr) \ + if (_cond) { \ + bch2_bkey_drop_ptr_noerror(_k, _ptr); \ + goto _again; \ + } \ +} while (0) + #define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ do { \ __label__ _again; \ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 508d029ac53d..7e10a9ddcfd9 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -42,7 +42,8 @@ int bch2_create_trans(struct btree_trans *trans, if (ret) goto err; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_intent); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, + BTREE_ITER_intent|BTREE_ITER_with_updates); if (ret) goto err; @@ -163,7 +164,7 @@ int bch2_create_trans(struct btree_trans *trans, name, dir_target, &dir_offset, - STR_HASH_must_create); + STR_HASH_must_create|BTREE_ITER_with_updates); if (ret) goto err; diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index ff60c041abe5..48a1ab9a649b 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -151,7 +151,6 @@ static void bchfs_read(struct btree_trans *trans, struct bkey_buf sk; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; - u32 snapshot; int ret = 0; rbio->c = c; @@ -159,29 +158,23 @@ static void bchfs_read(struct btree_trans *trans, rbio->subvol = inum.subvol; bch2_bkey_buf_init(&sk); -retry: bch2_trans_begin(trans); - iter = (struct btree_iter) { NULL }; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), + POS(inum.inum, rbio->bio.bi_iter.bi_sector), BTREE_ITER_slots); while (1) { struct bkey_s_c k; unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; - /* - * read_extent -> io_time_reset may cause a transaction restart - * without returning an error, we need to check for that here: - */ - ret = bch2_trans_relock(trans); + bch2_trans_begin(trans); + + u32 snapshot; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) - break; + goto err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); bch2_btree_iter_set_pos(&iter, POS(inum.inum, rbio->bio.bi_iter.bi_sector)); @@ -189,7 +182,7 @@ static void bchfs_read(struct btree_trans *trans, k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) - break; + goto err; offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); @@ -200,7 +193,7 @@ static void bchfs_read(struct btree_trans *trans, ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, &sk); if (ret) - break; + goto err; k = bkey_i_to_s_c(sk.k); @@ -210,7 +203,7 @@ static void bchfs_read(struct btree_trans *trans, ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, extent_partial_reads_expensive(k)); if (ret) - break; + goto err; } bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; @@ -229,17 +222,13 @@ static void bchfs_read(struct btree_trans *trans, swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); - - ret = btree_trans_too_many_iters(trans); - if (ret) +err: + if (ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; } -err: bch2_trans_iter_exit(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - if (ret) { bch_err_inum_offset_ratelimited(c, iter.pos.inode, @@ -486,7 +475,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->nr_replicas = nr_replicas; op->res.nr_replicas = nr_replicas; op->write_point = writepoint_hashed(inode->ei_last_dirtied); - op->subvol = inode->ei_subvol; + op->subvol = inode->ei_inum.subvol; op->pos = POS(inode->v.i_ino, sector); op->end_io = bch2_writepage_io_done; op->devs_need_flush = &inode->ei_devs_need_flush; diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index e246b1e05aa2..ee1c0325f313 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -500,7 +500,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) dio->op.target = dio->op.opts.foreground_target; dio->op.write_point = writepoint_hashed((unsigned long) current); dio->op.nr_replicas = dio->op.opts.data_replicas; - dio->op.subvol = inode->ei_subvol; + dio->op.subvol = inode->ei_inum.subvol; dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); dio->op.devs_need_flush = &inode->ei_devs_need_flush; diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index a9cc5cad9cc9..af3a24546aa3 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -182,18 +182,11 @@ static void __bch2_folio_set(struct folio *folio, int bch2_folio_set(struct bch_fs *c, subvol_inum inum, struct folio **fs, unsigned nr_folios) { - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_folio *s; u64 offset = folio_sector(fs[0]); - unsigned folio_idx; - u32 snapshot; bool need_set = false; - int ret; - for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { - s = bch2_folio_create(fs[folio_idx], GFP_KERNEL); + for (unsigned folio_idx = 0; folio_idx < nr_folios; folio_idx++) { + struct bch_folio *s = bch2_folio_create(fs[folio_idx], GFP_KERNEL); if (!s) return -ENOMEM; @@ -203,53 +196,40 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum, if (!need_set) return 0; - folio_idx = 0; - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); + unsigned folio_idx = 0; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; + return bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + POS(inum.inum, offset), + POS(inum.inum, U64_MAX), + inum.subvol, BTREE_ITER_slots, k, ({ + unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k); - for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, - SPOS(inum.inum, offset, snapshot), - BTREE_ITER_slots, k, ret) { - unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = bkey_to_sector_state(k); + while (folio_idx < nr_folios) { + struct folio *folio = fs[folio_idx]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - + folio_start; + unsigned folio_len = min(k.k->p.offset, folio_end) - + folio_offset - folio_start; - while (folio_idx < nr_folios) { - struct folio *folio = fs[folio_idx]; - u64 folio_start = folio_sector(folio); - u64 folio_end = folio_end_sector(folio); - unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - - folio_start; - unsigned folio_len = min(k.k->p.offset, folio_end) - - folio_offset - folio_start; + BUG_ON(k.k->p.offset < folio_start); + BUG_ON(bkey_start_offset(k.k) > folio_end); - BUG_ON(k.k->p.offset < folio_start); - BUG_ON(bkey_start_offset(k.k) > folio_end); + if (!bch2_folio(folio)->uptodate) + __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); - if (!bch2_folio(folio)->uptodate) - __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); + if (k.k->p.offset < folio_end) + break; + folio_idx++; + } - if (k.k->p.offset < folio_end) + if (folio_idx == nr_folios) break; - folio_idx++; - } - - if (folio_idx == nr_folios) - break; - } - - offset = iter.pos.offset; - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - bch2_trans_put(trans); - - return ret; + 0; + }))); } void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h index fd7d692c087e..fad911cf5068 100644 --- a/fs/bcachefs/fs-io-pagecache.h +++ b/fs/bcachefs/fs-io-pagecache.h @@ -99,9 +99,7 @@ static inline void bch2_folio_release(struct folio *folio) static inline struct bch_folio *__bch2_folio(struct folio *folio) { - return folio_has_private(folio) - ? (struct bch_folio *) folio_get_private(folio) - : NULL; + return folio_get_private(folio); } static inline struct bch_folio *bch2_folio(struct folio *folio) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 77b85da30fb2..71d0fa387509 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -221,30 +221,11 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol, struct bpos start, struct bpos end) { - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot); - if (ret) - goto err; - - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret) - if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) { - ret = 1; - break; - } - start = iter.pos; - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); - return ret; + return bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, start, end, + subvol, 0, k, ({ + bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k); + }))); } static int __bch2_truncate_folio(struct bch_inode_info *inode, @@ -267,7 +248,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, * XXX: we're doing two index lookups when we end up reading the * folio */ - ret = range_has_data(c, inode->ei_subvol, + ret = range_has_data(c, inode->ei_inum.subvol, POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS)); if (ret <= 0) @@ -618,7 +599,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, bch2_trans_begin(trans); ret = bch2_subvolume_get_snapshot(trans, - inode->ei_subvol, &snapshot); + inode->ei_inum.subvol, &snapshot); if (ret) goto bkey_err; @@ -813,41 +794,23 @@ static int quota_reserve_range(struct bch_inode_info *inode, u64 start, u64 end) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; - u32 snapshot; u64 sectors = end - start; - u64 pos = start; - int ret; -retry: - bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); - if (ret) - goto err; + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, + BTREE_ID_extents, + POS(inode->v.i_ino, start), + POS(inode->v.i_ino, end - 1), + inode->ei_inum.subvol, 0, k, ({ + if (bkey_extent_is_allocation(k.k)) { + u64 s = min(end, k.k->p.offset) - + max(start, bkey_start_offset(k.k)); + BUG_ON(s > sectors); + sectors -= s; + } - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(inode->v.i_ino, pos, snapshot), 0); - - while (!(ret = btree_trans_too_many_iters(trans)) && - (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k && - !(ret = bkey_err(k))) { - if (bkey_extent_is_allocation(k.k)) { - u64 s = min(end, k.k->p.offset) - - max(start, bkey_start_offset(k.k)); - BUG_ON(s > sectors); - sectors -= s; - } - bch2_btree_iter_advance(&iter); - } - pos = iter.pos.offset; - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); + 0; + }))); return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true); } @@ -942,42 +905,25 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; subvol_inum inum = inode_inum(inode); u64 isize, next_data = MAX_LFS_FILESIZE; - u32 snapshot; - int ret; isize = i_size_read(&inode->v); if (offset >= isize) return -ENXIO; - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, - SPOS(inode->v.i_ino, offset >> 9, snapshot), - POS(inode->v.i_ino, U64_MAX), - 0, k, ret) { - if (bkey_extent_is_data(k.k)) { - next_data = max(offset, bkey_start_offset(k.k) << 9); - break; - } else if (k.k->p.offset >> 9 > isize) - break; - } - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + POS(inode->v.i_ino, offset >> 9), + POS(inode->v.i_ino, U64_MAX), + inum.subvol, 0, k, ({ + if (bkey_extent_is_data(k.k)) { + next_data = max(offset, bkey_start_offset(k.k) << 9); + break; + } else if (k.k->p.offset >> 9 > isize) + break; + 0; + }))); if (ret) return ret; @@ -995,50 +941,34 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; subvol_inum inum = inode_inum(inode); u64 isize, next_hole = MAX_LFS_FILESIZE; - u32 snapshot; - int ret; isize = i_size_read(&inode->v); if (offset >= isize) return -ENXIO; - trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto err; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, - SPOS(inode->v.i_ino, offset >> 9, snapshot), - BTREE_ITER_slots, k, ret) { - if (k.k->p.inode != inode->v.i_ino) { - next_hole = bch2_seek_pagecache_hole(&inode->v, - offset, MAX_LFS_FILESIZE, 0, false); - break; - } else if (!bkey_extent_is_data(k.k)) { - next_hole = bch2_seek_pagecache_hole(&inode->v, - max(offset, bkey_start_offset(k.k) << 9), - k.k->p.offset << 9, 0, false); - - if (next_hole < k.k->p.offset << 9) + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + POS(inode->v.i_ino, offset >> 9), + POS(inode->v.i_ino, U64_MAX), + inum.subvol, BTREE_ITER_slots, k, ({ + if (k.k->p.inode != inode->v.i_ino) { + next_hole = bch2_seek_pagecache_hole(&inode->v, + offset, MAX_LFS_FILESIZE, 0, false); break; - } else { - offset = max(offset, bkey_start_offset(k.k) << 9); - } - } - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; + } else if (!bkey_extent_is_data(k.k)) { + next_hole = bch2_seek_pagecache_hole(&inode->v, + max(offset, bkey_start_offset(k.k) << 9), + k.k->p.offset << 9, 0, false); - bch2_trans_put(trans); + if (next_hole < k.k->p.offset << 9) + break; + } else { + offset = max(offset, bkey_start_offset(k.k) << 9); + } + 0; + }))); if (ret) return ret; diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 99c7fe987c74..405cf08bda34 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -100,7 +100,7 @@ static int bch2_ioc_setflags(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: bch2_write_inode(c, inode, bch2_inode_flags_set, &s, ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); @@ -184,7 +184,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: bch2_set_projid(c, inode, fa.fsx_projid) ?: bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, ATTR_CTIME); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 011817afc3ad..4a1bb07a2574 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -108,7 +108,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, goto retry; bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, - "%s: inode %u:%llu not found when updating", + "%s: inode %llu:%llu not found when updating", bch2_err_str(ret), inode_inum(inode).subvol, inode_inum(inode).inum); @@ -152,50 +152,106 @@ int bch2_fs_quota_transfer(struct bch_fs *c, return ret; } -static int bch2_iget5_test(struct inode *vinode, void *p) +static bool subvol_inum_eq(subvol_inum a, subvol_inum b) { - struct bch_inode_info *inode = to_bch_ei(vinode); - subvol_inum *inum = p; - - return inode->ei_subvol == inum->subvol && - inode->ei_inode.bi_inum == inum->inum; + return a.subvol == b.subvol && a.inum == b.inum; } -static int bch2_iget5_set(struct inode *vinode, void *p) +static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg, + const void *obj) { - struct bch_inode_info *inode = to_bch_ei(vinode); - subvol_inum *inum = p; + const struct bch_inode_info *inode = obj; + const subvol_inum *v = arg->key; - inode->v.i_ino = inum->inum; - inode->ei_subvol = inum->subvol; - inode->ei_inode.bi_inum = inum->inum; - return 0; + return !subvol_inum_eq(inode->ei_inum, *v); } -static unsigned bch2_inode_hash(subvol_inum inum) +static const struct rhashtable_params bch2_vfs_inodes_params = { + .head_offset = offsetof(struct bch_inode_info, hash), + .key_offset = offsetof(struct bch_inode_info, ei_inum), + .key_len = sizeof(subvol_inum), + .obj_cmpfn = bch2_vfs_inode_cmp_fn, + .automatic_shrinking = true, +}; + +static void __wait_on_freeing_inode(struct inode *inode) { - return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + wq = bit_waitqueue(&inode->i_state, __I_NEW); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + schedule(); + finish_wait(wq, &wait.wq_entry); } struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum) { - return to_bch_ei(ilookup5_nowait(c->vfs_sb, - bch2_inode_hash(inum), - bch2_iget5_test, - &inum)); + return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params); } -static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode) +static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans, + subvol_inum inum) { - subvol_inum inum = inode_inum(inode); - struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v, - bch2_inode_hash(inum), - bch2_iget5_test, - bch2_iget5_set, - &inum)); - BUG_ON(!old); + struct bch_inode_info *inode; +repeat: + inode = __bch2_inode_hash_find(c, inum); + if (inode) { + spin_lock(&inode->v.i_lock); + if (!test_bit(EI_INODE_HASHED, &inode->ei_flags)) { + spin_unlock(&inode->v.i_lock); + return NULL; + } + if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) { + if (!trans) { + __wait_on_freeing_inode(&inode->v); + } else { + bch2_trans_unlock(trans); + __wait_on_freeing_inode(&inode->v); + int ret = bch2_trans_relock(trans); + if (ret) + return ERR_PTR(ret); + } + goto repeat; + } + __iget(&inode->v); + spin_unlock(&inode->v.i_lock); + } + + return inode; +} + +static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode) +{ + spin_lock(&inode->v.i_lock); + bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags); + spin_unlock(&inode->v.i_lock); + + if (remove) { + int ret = rhashtable_remove_fast(&c->vfs_inodes_table, + &inode->hash, bch2_vfs_inodes_params); + BUG_ON(ret); + inode->v.i_hash.pprev = NULL; + } +} + +static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, + struct btree_trans *trans, + struct bch_inode_info *inode) +{ + struct bch_inode_info *old = inode; + + set_bit(EI_INODE_HASHED, &inode->ei_flags); +retry: + if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table, + &inode->hash, + bch2_vfs_inodes_params))) { + old = bch2_inode_hash_find(c, trans, inode->ei_inum); + if (!old) + goto retry; + + clear_bit(EI_INODE_HASHED, &inode->ei_flags); - if (unlikely(old != inode)) { /* * bcachefs doesn't use I_NEW; we have no use for it since we * only insert fully created inodes in the inode hash table. But @@ -209,21 +265,17 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino */ set_nlink(&inode->v, 1); discard_new_inode(&inode->v); - inode = old; + return old; } else { + inode_fake_hash(&inode->v); + + inode_sb_list_add(&inode->v); + mutex_lock(&c->vfs_inodes_lock); list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); mutex_unlock(&c->vfs_inodes_lock); - /* - * Again, I_NEW makes no sense for bcachefs. This is only needed - * for clearing I_NEW, but since the inode was already fully - * created and initialized we didn't actually want - * inode_insert5() to set it for us. - */ - unlock_new_inode(&inode->v); + return inode; } - - return inode; } #define memalloc_flags_do(_flags, _do) \ @@ -241,7 +293,8 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c) { - struct bch_inode_info *inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); + struct bch_inode_info *inode = alloc_inode_sb(c->vfs_sb, + bch2_inode_cache, GFP_NOFS); if (!inode) return NULL; @@ -283,13 +336,24 @@ static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) return inode; } +static struct bch_inode_info *bch2_inode_hash_init_insert(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *bi, + struct bch_subvolume *subvol) +{ + struct bch_inode_info *inode = bch2_new_inode(trans); + if (IS_ERR(inode)) + return inode; + + bch2_vfs_inode_init(trans, inum, inode, bi, subvol); + + return bch2_inode_hash_insert(trans->c, trans, inode); + +} + struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) { - struct bch_inode_info *inode = - to_bch_ei(ilookup5_nowait(c->vfs_sb, - bch2_inode_hash(inum), - bch2_iget5_test, - &inum)); + struct bch_inode_info *inode = bch2_inode_hash_find(c, NULL, inum); if (inode) return &inode->v; @@ -300,11 +364,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) int ret = lockrestart_do(trans, bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: - PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); - if (!ret) { - bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - inode = bch2_inode_insert(c, inode); - } + PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); bch2_trans_put(trans); return ret ? ERR_PTR(ret) : &inode->v; @@ -325,6 +385,8 @@ __bch2_create(struct mnt_idmap *idmap, subvol_inum inum; struct bch_subvolume subvol; u64 journal_seq = 0; + kuid_t kuid; + kgid_t kgid; int ret; /* @@ -351,13 +413,15 @@ __bch2_create(struct mnt_idmap *idmap, retry: bch2_trans_begin(trans); - ret = bch2_subvol_is_ro_trans(trans, dir->ei_subvol) ?: + kuid = mapped_fsuid(idmap, i_user_ns(&dir->v)); + kgid = mapped_fsgid(idmap, i_user_ns(&dir->v)); + ret = bch2_subvol_is_ro_trans(trans, dir->ei_inum.subvol) ?: bch2_create_trans(trans, inode_inum(dir), &dir_u, &inode_u, !(flags & BCH_CREATE_TMPFILE) ? &dentry->d_name : NULL, - from_kuid(i_user_ns(&dir->v), current_fsuid()), - from_kgid(i_user_ns(&dir->v), current_fsgid()), + from_kuid(i_user_ns(&dir->v), kuid), + from_kgid(i_user_ns(&dir->v), kgid), mode, rdev, default_acl, acl, snapshot_src, flags) ?: bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, @@ -365,7 +429,7 @@ __bch2_create(struct mnt_idmap *idmap, if (unlikely(ret)) goto err_before_quota; - inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; + inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol; inum.inum = inode_u.bi_inum; ret = bch2_subvolume_get(trans, inum.subvol, true, @@ -395,8 +459,16 @@ __bch2_create(struct mnt_idmap *idmap, * we must insert the new inode into the inode cache before calling * bch2_trans_exit() and dropping locks, else we could race with another * thread pulling the inode in and modifying it: + * + * also, calling bch2_inode_hash_insert() without passing in the + * transaction object is sketchy - if we could ever end up in + * __wait_on_freeing_inode(), we'd risk deadlock. + * + * But that shouldn't be possible, since we still have the inode locked + * that we just created, and we _really_ can't take a transaction + * restart here. */ - inode = bch2_inode_insert(c, inode); + inode = bch2_inode_hash_insert(c, NULL, inode); bch2_trans_put(trans); err: posix_acl_release(default_acl); @@ -436,11 +508,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, if (ret) goto err; - struct bch_inode_info *inode = - to_bch_ei(ilookup5_nowait(c->vfs_sb, - bch2_inode_hash(inum), - bch2_iget5_test, - &inum)); + struct bch_inode_info *inode = bch2_inode_hash_find(c, trans, inum); if (inode) goto out; @@ -448,7 +516,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, struct bch_inode_unpacked inode_u; ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: - PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); + PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "dirent to missing inode:\n %s", @@ -468,9 +536,6 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, ret = -ENOENT; goto err; } - - bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - inode = bch2_inode_insert(c, inode); out: bch2_trans_iter_exit(trans, &dirent_iter); printbuf_exit(&buf); @@ -557,8 +622,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, lockdep_assert_held(&inode->v.i_rwsem); - ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?: - bch2_subvol_is_ro(c, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: + bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) return bch2_err_class(ret); @@ -614,7 +679,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) struct bch_inode_info *dir= to_bch_ei(vdir); struct bch_fs *c = dir->v.i_sb->s_fs_info; - int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?: + int ret = bch2_subvol_is_ro(c, dir->ei_inum.subvol) ?: __bch2_unlink(vdir, dentry, false); return bch2_err_class(ret); } @@ -671,15 +736,16 @@ static int bch2_rename2(struct mnt_idmap *idmap, struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); struct bch_inode_unpacked dst_dir_u, src_dir_u; - struct bch_inode_unpacked src_inode_u, dst_inode_u; + struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u; struct btree_trans *trans; enum bch_rename_mode mode = flags & RENAME_EXCHANGE ? BCH_RENAME_EXCHANGE : dst_dentry->d_inode ? BCH_RENAME_OVERWRITE : BCH_RENAME; + bool whiteout = !!(flags & RENAME_WHITEOUT); int ret; - if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE|RENAME_WHITEOUT)) return -EINVAL; if (mode == BCH_RENAME_OVERWRITE) { @@ -697,8 +763,8 @@ static int bch2_rename2(struct mnt_idmap *idmap, trans = bch2_trans_get(c); - ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_subvol) ?: - bch2_subvol_is_ro_trans(trans, dst_dir->ei_subvol); + ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?: + bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol); if (ret) goto err; @@ -720,18 +786,48 @@ static int bch2_rename2(struct mnt_idmap *idmap, if (ret) goto err; } +retry: + bch2_trans_begin(trans); - ret = commit_do(trans, NULL, NULL, 0, - bch2_rename_trans(trans, - inode_inum(src_dir), &src_dir_u, - inode_inum(dst_dir), &dst_dir_u, - &src_inode_u, - &dst_inode_u, - &src_dentry->d_name, - &dst_dentry->d_name, - mode)); + ret = bch2_rename_trans(trans, + inode_inum(src_dir), &src_dir_u, + inode_inum(dst_dir), &dst_dir_u, + &src_inode_u, + &dst_inode_u, + &src_dentry->d_name, + &dst_dentry->d_name, + mode); if (unlikely(ret)) + goto err_tx_restart; + + if (whiteout) { + whiteout_inode_u = bch2_trans_kmalloc_nomemzero(trans, sizeof(*whiteout_inode_u)); + ret = PTR_ERR_OR_ZERO(whiteout_inode_u); + if (unlikely(ret)) + goto err_tx_restart; + bch2_inode_init_early(c, whiteout_inode_u); + + ret = bch2_create_trans(trans, + inode_inum(src_dir), &src_dir_u, + whiteout_inode_u, + &src_dentry->d_name, + from_kuid(i_user_ns(&src_dir->v), current_fsuid()), + from_kgid(i_user_ns(&src_dir->v), current_fsgid()), + S_IFCHR|WHITEOUT_MODE, 0, + NULL, NULL, (subvol_inum) { 0 }, 0) ?: + bch2_quota_acct(c, bch_qid(whiteout_inode_u), Q_INO, 1, + KEY_TYPE_QUOTA_PREALLOC); + if (unlikely(ret)) + goto err_tx_restart; + } + + ret = bch2_trans_commit(trans, NULL, NULL, 0); + if (unlikely(ret)) { +err_tx_restart: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; goto err; + } BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); BUG_ON(dst_inode && @@ -779,11 +875,17 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap, { struct bch_fs *c = inode->v.i_sb->s_fs_info; unsigned int ia_valid = attr->ia_valid; + kuid_t kuid; + kgid_t kgid; - if (ia_valid & ATTR_UID) - bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid); - if (ia_valid & ATTR_GID) - bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid); + if (ia_valid & ATTR_UID) { + kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); + bi->bi_uid = from_kuid(i_user_ns(&inode->v), kuid); + } + if (ia_valid & ATTR_GID) { + kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); + bi->bi_gid = from_kgid(i_user_ns(&inode->v), kgid); + } if (ia_valid & ATTR_SIZE) bi->bi_size = attr->ia_size; @@ -798,11 +900,11 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap, if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; kgid_t gid = ia_valid & ATTR_GID - ? attr->ia_gid + ? kgid : inode->v.i_gid; - if (!in_group_p(gid) && - !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID)) + if (!in_group_or_capable(idmap, &inode->v, + make_vfsgid(idmap, i_user_ns(&inode->v), gid))) mode &= ~S_ISGID; bi->bi_mode = mode; } @@ -818,17 +920,23 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap, struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; + kuid_t kuid; + kgid_t kgid; int ret; mutex_lock(&inode->ei_update_lock); qid = inode->ei_qid; - if (attr->ia_valid & ATTR_UID) - qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid); + if (attr->ia_valid & ATTR_UID) { + kuid = from_vfsuid(idmap, i_user_ns(&inode->v), attr->ia_vfsuid); + qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), kuid); + } - if (attr->ia_valid & ATTR_GID) - qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid); + if (attr->ia_valid & ATTR_GID) { + kgid = from_vfsgid(idmap, i_user_ns(&inode->v), attr->ia_vfsgid); + qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), kgid); + } ret = bch2_fs_quota_transfer(c, inode, qid, ~0, KEY_TYPE_QUOTA_PREALLOC); @@ -884,13 +992,15 @@ static int bch2_getattr(struct mnt_idmap *idmap, { struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); struct bch_fs *c = inode->v.i_sb->s_fs_info; + vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, &inode->v); + vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, &inode->v); stat->dev = inode->v.i_sb->s_dev; stat->ino = inode->v.i_ino; stat->mode = inode->v.i_mode; stat->nlink = inode->v.i_nlink; - stat->uid = inode->v.i_uid; - stat->gid = inode->v.i_gid; + stat->uid = vfsuid_into_kuid(vfsuid); + stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->v.i_rdev; stat->size = i_size_read(&inode->v); stat->atime = inode_get_atime(&inode->v); @@ -899,7 +1009,7 @@ static int bch2_getattr(struct mnt_idmap *idmap, stat->blksize = block_bytes(c); stat->blocks = inode->v.i_blocks; - stat->subvol = inode->ei_subvol; + stat->subvol = inode->ei_inum.subvol; stat->result_mask |= STATX_SUBVOL; if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->v.i_mode)) { @@ -941,7 +1051,7 @@ static int bch2_setattr(struct mnt_idmap *idmap, lockdep_assert_held(&inode->v.i_rwsem); - ret = bch2_subvol_is_ro(c, inode->ei_subvol) ?: + ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: setattr_prepare(idmap, dentry, iattr); if (ret) return ret; @@ -1034,7 +1144,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct bkey_buf cur, prev; unsigned offset_into_extent, sectors; bool have_extent = false; - u32 snapshot; int ret = 0; ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); @@ -1050,21 +1159,30 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_bkey_buf_init(&cur); bch2_bkey_buf_init(&prev); trans = bch2_trans_get(c); -retry: - bch2_trans_begin(trans); - - ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot); - if (ret) - goto err; bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - SPOS(ei->v.i_ino, start, snapshot), 0); + POS(ei->v.i_ino, start), 0); - while (!(ret = btree_trans_too_many_iters(trans)) && - (k = bch2_btree_iter_peek_upto(&iter, end)).k && - !(ret = bkey_err(k))) { + while (true) { enum btree_id data_btree = BTREE_ID_extents; + bch2_trans_begin(trans); + + u32 snapshot; + ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); + + k = bch2_btree_iter_peek_upto(&iter, end); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k) + break; + if (!bkey_extent_is_data(k.k) && k.k->type != KEY_TYPE_reservation) { bch2_btree_iter_advance(&iter); @@ -1108,16 +1226,12 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, iter.pos.offset + sectors)); - - ret = bch2_trans_relock(trans); - if (ret) +err: + if (ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart)) break; } - start = iter.pos.offset; bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; if (!ret && have_extent) { bch2_trans_unlock(trans); @@ -1173,7 +1287,7 @@ static int bch2_open(struct inode *vinode, struct file *file) struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret = bch2_subvol_is_ro(c, inode->ei_subvol); + int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol); if (ret) return ret; } @@ -1305,8 +1419,8 @@ static int bcachefs_fid_valid(int fh_len, int fh_type) static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) { return (struct bcachefs_fid) { - .inum = inode->ei_inode.bi_inum, - .subvol = inode->ei_subvol, + .inum = inode->ei_inum.inum, + .subvol = inode->ei_inum.subvol, .gen = inode->ei_inode.bi_generation, }; } @@ -1391,7 +1505,7 @@ static struct dentry *bch2_get_parent(struct dentry *child) struct bch_fs *c = inode->v.i_sb->s_fs_info; subvol_inum parent_inum = { .subvol = inode->ei_inode.bi_parent_subvol ?: - inode->ei_subvol, + inode->ei_inum.subvol, .inum = inode->ei_inode.bi_dir, }; @@ -1427,7 +1541,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child retry: bch2_trans_begin(trans); - ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot); + ret = bch2_subvolume_get_snapshot(trans, dir->ei_inum.subvol, &snapshot); if (ret) goto err; @@ -1458,8 +1572,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child if (ret) goto err; - if (target.subvol == inode->ei_subvol && - target.inum == inode->ei_inode.bi_inum) + if (subvol_inum_eq(target, inode->ei_inum)) goto found; } else { /* @@ -1480,8 +1593,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child if (ret) continue; - if (target.subvol == inode->ei_subvol && - target.inum == inode->ei_inode.bi_inum) + if (subvol_inum_eq(target, inode->ei_inum)) goto found; } } @@ -1513,12 +1625,15 @@ static const struct export_operations bch_export_ops = { .get_name = bch2_get_name, }; -static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, +static void bch2_vfs_inode_init(struct btree_trans *trans, + subvol_inum inum, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, struct bch_subvolume *subvol) { - bch2_iget5_set(&inode->v, &inum); + inode->v.i_ino = inum.inum; + inode->ei_inum = inum; + inode->ei_inode.bi_inum = inum.inum; bch2_inode_update_after_write(trans, inode, bi, ~0); inode->v.i_blocks = bi->bi_sectors; @@ -1530,7 +1645,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, inode->ei_flags = 0; inode->ei_quota_reserved = 0; inode->ei_qid = bch_qid(bi); - inode->ei_subvol = inum.subvol; if (BCH_SUBVOLUME_SNAP(subvol)) set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); @@ -1597,6 +1711,17 @@ static void bch2_evict_inode(struct inode *vinode) { struct bch_fs *c = vinode->i_sb->s_fs_info; struct bch_inode_info *inode = to_bch_ei(vinode); + bool delete = !inode->v.i_nlink && !is_bad_inode(&inode->v); + + /* + * evict() has waited for outstanding writeback, we'll do no more IO + * through this inode: it's safe to remove from VFS inode hashtable here + * + * Do that now so that other threads aren't blocked from pulling it back + * in, there's no reason for them to be: + */ + if (!delete) + bch2_inode_hash_remove(c, inode); truncate_inode_pages_final(&inode->v.i_data); @@ -1604,12 +1729,18 @@ static void bch2_evict_inode(struct inode *vinode) BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); - if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { + if (delete) { bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); bch2_inode_rm(c, inode_inum(inode)); + + /* + * If we are deleting, we need it present in the vfs hash table + * so that fsck can check if unlinked inodes are still open: + */ + bch2_inode_hash_remove(c, inode); } mutex_lock(&c->vfs_inodes_lock); @@ -1639,7 +1770,7 @@ void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) mutex_lock(&c->vfs_inodes_lock); list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) { - if (!snapshot_list_has_id(s, inode->ei_subvol)) + if (!snapshot_list_has_id(s, inode->ei_inum.subvol)) continue; if (!(inode->v.i_state & I_DONTCACHE) && @@ -1801,30 +1932,14 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root) static int bch2_show_options(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; - enum bch_opt_id i; struct printbuf buf = PRINTBUF; - int ret = 0; - for (i = 0; i < bch2_opts_nr; i++) { - const struct bch_option *opt = &bch2_opt_table[i]; - u64 v = bch2_opt_get_by_id(&c->opts, i); + bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb, + OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE); + printbuf_nul_terminate(&buf); + seq_puts(seq, buf.buf); - if ((opt->flags & OPT_HIDDEN) || - !(opt->flags & OPT_MOUNT)) - continue; - - if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) - continue; - - printbuf_reset(&buf); - bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v, - OPT_SHOW_MOUNT_STYLE); - seq_putc(seq, ','); - seq_puts(seq, buf.buf); - } - - if (buf.allocation_failure) - ret = -ENOMEM; + int ret = buf.allocation_failure ? -ENOMEM : 0; printbuf_exit(&buf); return ret; } @@ -2129,12 +2244,23 @@ static int bch2_init_fs_context(struct fs_context *fc) return 0; } +void bch2_fs_vfs_exit(struct bch_fs *c) +{ + if (c->vfs_inodes_table.tbl) + rhashtable_destroy(&c->vfs_inodes_table); +} + +int bch2_fs_vfs_init(struct bch_fs *c) +{ + return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params); +} + static struct file_system_type bcache_fs_type = { .owner = THIS_MODULE, .name = "bcachefs", .init_fs_context = bch2_init_fs_context, .kill_sb = bch2_kill_sb, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("bcachefs"); @@ -2149,7 +2275,8 @@ int __init bch2_vfs_init(void) { int ret = -ENOMEM; - bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT); + bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT); if (!bch2_inode_cache) goto err; diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 990ec43e0365..da74ecc236e7 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -13,6 +13,9 @@ struct bch_inode_info { struct inode v; + struct rhash_head hash; + subvol_inum ei_inum; + struct list_head ei_vfs_inode_list; unsigned long ei_flags; @@ -24,8 +27,6 @@ struct bch_inode_info { struct mutex ei_quota_lock; struct bch_qid ei_qid; - u32 ei_subvol; - /* * When we've been doing nocow writes we'll need to issue flushes to the * underlying block devices @@ -50,10 +51,7 @@ struct bch_inode_info { static inline subvol_inum inode_inum(struct bch_inode_info *inode) { - return (subvol_inum) { - .subvol = inode->ei_subvol, - .inum = inode->ei_inode.bi_inum, - }; + return inode->ei_inum; } struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum); @@ -69,6 +67,7 @@ struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum); * those: */ #define EI_INODE_SNAPSHOT 1 +#define EI_INODE_HASHED 2 #define to_bch_ei(_inode) \ container_of_or_null(_inode, struct bch_inode_info, v) @@ -189,6 +188,9 @@ int __bch2_unlink(struct inode *, struct dentry *, bool); void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); +void bch2_fs_vfs_exit(struct bch_fs *); +int bch2_fs_vfs_init(struct bch_fs *); + void bch2_vfs_exit(void); int bch2_vfs_init(void); @@ -203,6 +205,10 @@ static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, su static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) {} + +static inline void bch2_fs_vfs_exit(struct bch_fs *c) {} +static inline int bch2_fs_vfs_init(struct bch_fs *c) { return 0; } + static inline void bch2_vfs_exit(void) {} static inline int bch2_vfs_init(void) { return 0; } diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 2be6be33afa3..6ac0ff7e074b 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -365,7 +365,7 @@ int bch2_inode_peek(struct btree_trans *trans, subvol_inum inum, unsigned flags) { int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags); - bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum); + bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum); return ret; } diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 7ee3b75480df..b2f50e74bb76 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -286,7 +286,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, */ bool promote_full = (failed || *read_full || - READ_ONCE(c->promote_whole_extents)); + READ_ONCE(c->opts.promote_whole_extents)); /* data might have to be decompressed in the write path: */ unsigned sectors = promote_full ? max(pick->crc.compressed_size, pick->crc.live_size) @@ -777,7 +777,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, orig_k->k->k.size, reflink_offset); bch2_inconsistent_error(trans->c); - ret = -EIO; + ret = -BCH_ERR_missing_indirect_extent; goto err; } @@ -869,9 +869,15 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto hole; if (pick_ret < 0) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, k); + bch_err_inum_offset_ratelimited(c, read_pos.inode, read_pos.offset << 9, - "no device to read from"); + "no device to read from: %s\n %s", + bch2_err_str(pick_ret), + buf.buf); + printbuf_exit(&buf); goto err; } @@ -1086,7 +1092,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, trans->notrace_relock_fail = true; } else { /* Attempting reconstruct read: */ - if (bch2_ec_read_extent(trans, rbio)) { + if (bch2_ec_read_extent(trans, rbio, k)) { bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; } @@ -1214,10 +1220,6 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, swap(bvec_iter.bi_size, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes); - - ret = btree_trans_too_many_iters(trans); - if (ret) - goto err; err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 1d4761d15002..d3b5be7fd9bf 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -1447,9 +1447,7 @@ static void __bch2_write(struct bch_write_op *op) op->nr_replicas_required, op->watermark, op->flags, - (op->flags & (BCH_WRITE_ALLOC_NOWAIT| - BCH_WRITE_ONLY_SPECIFIED_DEVS)) - ? NULL : &op->cl, &wp)); + &op->cl, &wp)); if (unlikely(ret)) { if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) break; @@ -1592,6 +1590,9 @@ CLOSURE_CALLBACK(bch2_write) BUG_ON(!op->write_point.v); BUG_ON(bkey_eq(op->pos, POS_MAX)); + if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) + op->flags |= BCH_WRITE_ALLOC_NOWAIT; + op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); op->start_time = local_clock(); bch2_keylist_init(&op->insert_keys, op->inline_keys); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7664b68e6a15..30460bce04be 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1353,6 +1353,7 @@ int bch2_journal_read(struct bch_fs *c, genradix_for_each(&c->journal_entries, radix_iter, _i) { struct bch_replicas_padded replicas = { .e.data_type = BCH_DATA_journal, + .e.nr_devs = 0, .e.nr_required = 1, }; @@ -1379,7 +1380,7 @@ int bch2_journal_read(struct bch_fs *c, goto err; darray_for_each(i->ptrs, ptr) - replicas.e.devs[replicas.e.nr_devs++] = ptr->dev; + replicas_entry_add_dev(&replicas.e, ptr->dev); bch2_replicas_entry_sort(&replicas.e); @@ -1950,7 +1951,8 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * if (error || w->noflush || (!w->must_flush && - (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && + time_before(jiffies, j->last_flush_write + + msecs_to_jiffies(c->opts.journal_flush_delay)) && test_bit(JOURNAL_may_skip_flush, &j->flags))) { w->noflush = true; SET_JSET_NO_FLUSH(w->data, true); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 70b998d9f19c..ace291f175dd 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -641,6 +641,7 @@ static u64 journal_seq_to_flush(struct journal *j) static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct btree_cache *bc = &c->btree_cache; bool kthread = (current->flags & PF_KTHREAD) != 0; u64 seq_to_flush; size_t min_nr, min_key_cache, nr_flushed; @@ -681,7 +682,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) if (j->watermark != BCH_WATERMARK_stripe) min_nr = 1; - if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) + size_t btree_cache_live = bc->live[0].nr + bc->live[1].nr; + if (atomic_long_read(&bc->nr_dirty) * 2 > btree_cache_live) min_nr = 1; min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); @@ -689,8 +691,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) trace_and_count(c, journal_reclaim_start, c, direct, kicked, min_nr, min_key_cache, - atomic_read(&c->btree_cache.dirty), - c->btree_cache.used, + atomic_long_read(&bc->nr_dirty), btree_cache_live, atomic_long_read(&c->btree_key_cache.nr_dirty), atomic_long_read(&c->btree_key_cache.nr_keys)); diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index e10fc1da71b1..232be8a44051 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -230,6 +230,8 @@ const struct bch_option bch2_opt_table[] = { #define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \ .min = 0, .max = U64_MAX, \ .choices = _choices +#define OPT_BITFIELD(_choices) .type = BCH_OPT_BITFIELD, \ + .choices = _choices #define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn #define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ @@ -376,6 +378,13 @@ int bch2_opt_parse(struct bch_fs *c, *res = ret; break; + case BCH_OPT_BITFIELD: { + s64 v = bch2_read_flag_list(val, opt->choices); + if (v < 0) + return v; + *res = v; + break; + } case BCH_OPT_FN: ret = opt->fn.parse(c, val, res, err); @@ -423,6 +432,9 @@ void bch2_opt_to_text(struct printbuf *out, else prt_str(out, opt->choices[v]); break; + case BCH_OPT_BITFIELD: + prt_bitflags(out, opt->choices, v); + break; case BCH_OPT_FN: opt->fn.to_text(out, c, sb, v); break; @@ -431,6 +443,32 @@ void bch2_opt_to_text(struct printbuf *out, } } +void bch2_opts_to_text(struct printbuf *out, + struct bch_opts opts, + struct bch_fs *c, struct bch_sb *sb, + unsigned show_mask, unsigned hide_mask, + unsigned flags) +{ + bool first = true; + + for (enum bch_opt_id i = 0; i < bch2_opts_nr; i++) { + const struct bch_option *opt = &bch2_opt_table[i]; + + if ((opt->flags & hide_mask) || !(opt->flags & show_mask)) + continue; + + u64 v = bch2_opt_get_by_id(&opts, i); + if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) + continue; + + if (!first) + prt_char(out, ','); + first = false; + + bch2_opt_to_text(out, c, sb, opt, v, flags); + } +} + int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) { int ret = 0; @@ -608,10 +646,20 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) return 0; } -void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) +struct bch_dev_sb_opt_set { + void (*set_sb)(struct bch_member *, u64); +}; + +static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters [] = { +#define x(n, set) [Opt_##n] = { .set_sb = SET_##set }, + BCH_DEV_OPT_SETTERS() +#undef x +}; + +void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, + const struct bch_option *opt, u64 v) { - if (opt->set_sb == SET_BCH2_NO_SB_OPT) - return; + enum bch_opt_id id = opt - bch2_opt_table; if (opt->flags & OPT_SB_FIELD_SECTORS) v >>= 9; @@ -619,16 +667,35 @@ void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) if (opt->flags & OPT_SB_FIELD_ILOG2) v = ilog2(v); - opt->set_sb(sb, v); + if (opt->flags & OPT_SB_FIELD_ONE_BIAS) + v++; + + if (opt->flags & OPT_FS) { + if (opt->set_sb != SET_BCH2_NO_SB_OPT) + opt->set_sb(sb, v); + } + + if ((opt->flags & OPT_DEVICE) && dev_idx >= 0) { + if (WARN(!bch2_member_exists(sb, dev_idx), + "tried to set device option %s on nonexistent device %i", + opt->attr.name, dev_idx)) + return; + + struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx); + + const struct bch_dev_sb_opt_set *set = bch2_dev_sb_opt_setters + id; + if (set->set_sb) + set->set_sb(m, v); + else + pr_err("option %s cannot be set via opt_set_sb()", opt->attr.name); + } } -void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) +void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, + const struct bch_option *opt, u64 v) { - if (opt->set_sb == SET_BCH2_NO_SB_OPT) - return; - mutex_lock(&c->sb_lock); - __bch2_opt_set_sb(c->disk_sb.sb, opt, v); + __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v); bch2_write_super(c); mutex_unlock(&c->sb_lock); } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index cda1725702ea..cb2e244a2429 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -53,23 +53,25 @@ void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64); /* When can be set: */ enum opt_flags { - OPT_FS = (1 << 0), /* Filesystem option */ - OPT_DEVICE = (1 << 1), /* Device option */ - OPT_INODE = (1 << 2), /* Inode option */ - OPT_FORMAT = (1 << 3), /* May be specified at format time */ - OPT_MOUNT = (1 << 4), /* May be specified at mount time */ - OPT_RUNTIME = (1 << 5), /* May be specified at runtime */ - OPT_HUMAN_READABLE = (1 << 6), - OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */ - OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */ - OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */ - OPT_HIDDEN = (1 << 10), + OPT_FS = BIT(0), /* Filesystem option */ + OPT_DEVICE = BIT(1), /* Device option */ + OPT_INODE = BIT(2), /* Inode option */ + OPT_FORMAT = BIT(3), /* May be specified at format time */ + OPT_MOUNT = BIT(4), /* May be specified at mount time */ + OPT_RUNTIME = BIT(5), /* May be specified at runtime */ + OPT_HUMAN_READABLE = BIT(6), + OPT_MUST_BE_POW_2 = BIT(7), /* Must be power of 2 */ + OPT_SB_FIELD_SECTORS = BIT(8), /* Superblock field is >> 9 of actual value */ + OPT_SB_FIELD_ILOG2 = BIT(9), /* Superblock field is ilog2 of actual value */ + OPT_SB_FIELD_ONE_BIAS = BIT(10), /* 0 means default value */ + OPT_HIDDEN = BIT(11), }; enum opt_type { BCH_OPT_BOOL, BCH_OPT_UINT, BCH_OPT_STR, + BCH_OPT_BITFIELD, BCH_OPT_FN, }; @@ -263,6 +265,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Enable inline data extents") \ + x(promote_whole_extents, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_PROMOTE_WHOLE_EXTENTS, true, \ + NULL, "Promote whole extents, instead of just part being read")\ x(acl, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ @@ -366,6 +373,16 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Exit recovery immediately prior to journal replay")\ + x(recovery_passes, u64, \ + OPT_FS|OPT_MOUNT, \ + OPT_BITFIELD(bch2_recovery_passes), \ + BCH2_NO_SB_OPT, 0, \ + NULL, "Recovery passes to run explicitly") \ + x(recovery_passes_exclude, u64, \ + OPT_FS|OPT_MOUNT, \ + OPT_BITFIELD(bch2_recovery_passes), \ + BCH2_NO_SB_OPT, 0, \ + NULL, "Recovery passes to exclude") \ x(recovery_pass_last, u8, \ OPT_FS|OPT_MOUNT, \ OPT_STR_NOLIMIT(bch2_recovery_passes), \ @@ -472,11 +489,16 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, 0, \ "size", "Size of filesystem on device") \ x(durability, u8, \ - OPT_DEVICE, \ + OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \ OPT_UINT(0, BCH_REPLICAS_MAX), \ BCH2_NO_SB_OPT, 1, \ "n", "Data written to this device will be considered\n"\ "to have already been replicated n times") \ + x(data_allowed, u8, \ + OPT_DEVICE, \ + OPT_BITFIELD(__bch2_data_types), \ + BCH2_NO_SB_OPT, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ + "types", "Allowed data types for this device: journal, btree, and/or user")\ x(btree_node_prefetch, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ @@ -484,6 +506,11 @@ enum fsck_err_opts { NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\ " prefetched sequentially") +#define BCH_DEV_OPT_SETTERS() \ + x(discard, BCH_MEMBER_DISCARD) \ + x(durability, BCH_MEMBER_DURABILITY) \ + x(data_allowed, BCH_MEMBER_DATA_ALLOWED) + struct bch_opts { #define x(_name, _bits, ...) unsigned _name##_defined:1; BCH_OPTS() @@ -563,8 +590,10 @@ void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id); int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); -void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); -void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); +void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); + +struct bch_dev; +void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64); int bch2_opt_lookup(const char *); int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); @@ -576,6 +605,10 @@ int bch2_opt_parse(struct bch_fs *, const struct bch_option *, void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, const struct bch_option *, u64, unsigned); +void bch2_opts_to_text(struct printbuf *, + struct bch_opts, + struct bch_fs *, struct bch_sb *, + unsigned, unsigned, unsigned); int bch2_opt_check_may_set(struct bch_fs *, int, u64); int bch2_opts_check_may_set(struct bch_fs *); diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c new file mode 100644 index 000000000000..40a20192eee8 --- /dev/null +++ b/fs/bcachefs/rcu_pending.c @@ -0,0 +1,650 @@ +// SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) "%s() " fmt "\n", __func__ + +#include +#include +#include +#include +#include +#include + +#include "rcu_pending.h" +#include "darray.h" +#include "util.h" + +#define static_array_for_each(_a, _i) \ + for (typeof(&(_a)[0]) _i = _a; \ + _i < (_a) + ARRAY_SIZE(_a); \ + _i++) + +enum rcu_pending_special { + RCU_PENDING_KVFREE = 1, + RCU_PENDING_CALL_RCU = 2, +}; + +#define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE) +#define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU) + +static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp) +{ + return ssp + ? get_state_synchronize_srcu(ssp) + : get_state_synchronize_rcu(); +} + +static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp) +{ + return ssp + ? start_poll_synchronize_srcu(ssp) + : start_poll_synchronize_rcu(); +} + +static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie) +{ + return ssp + ? poll_state_synchronize_srcu(ssp, cookie) + : poll_state_synchronize_rcu(cookie); +} + +static inline void __rcu_barrier(struct srcu_struct *ssp) +{ + return ssp + ? srcu_barrier(ssp) + : rcu_barrier(); +} + +static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp, + rcu_callback_t func) +{ + if (ssp) + call_srcu(ssp, rhp, func); + else + call_rcu(rhp, func); +} + +struct rcu_pending_seq { + /* + * We're using a radix tree like a vector - we're just pushing elements + * onto the end; we're using a radix tree instead of an actual vector to + * avoid reallocation overhead + */ + GENRADIX(struct rcu_head *) objs; + size_t nr; + struct rcu_head **cursor; + unsigned long seq; +}; + +struct rcu_pending_list { + struct rcu_head *head; + struct rcu_head *tail; + unsigned long seq; +}; + +struct rcu_pending_pcpu { + struct rcu_pending *parent; + spinlock_t lock; + int cpu; + + /* + * We can't bound the number of unprocessed gp sequence numbers, and we + * can't efficiently merge radix trees for expired grace periods, so we + * need darray/vector: + */ + DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs; + + /* Third entry is for expired objects: */ + struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1]; + + struct rcu_head cb; + bool cb_armed; + struct work_struct work; +}; + +static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p) +{ + if (p->objs.nr) + return true; + + static_array_for_each(p->lists, i) + if (i->head) + return true; + + return false; +} + +static void rcu_pending_list_merge(struct rcu_pending_list *l1, + struct rcu_pending_list *l2) +{ +#ifdef __KERNEL__ + if (!l1->head) + l1->head = l2->head; + else + l1->tail->next = l2->head; +#else + if (!l1->head) + l1->head = l2->head; + else + l1->tail->next.next = (void *) l2->head; +#endif + + l1->tail = l2->tail; + l2->head = l2->tail = NULL; +} + +static void rcu_pending_list_add(struct rcu_pending_list *l, + struct rcu_head *n) +{ +#ifdef __KERNEL__ + if (!l->head) + l->head = n; + else + l->tail->next = n; + l->tail = n; + n->next = NULL; +#else + if (!l->head) + l->head = n; + else + l->tail->next.next = (void *) n; + l->tail = n; + n->next.next = NULL; +#endif +} + +static void merge_expired_lists(struct rcu_pending_pcpu *p) +{ + struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE]; + + for (struct rcu_pending_list *i = p->lists; i < expired; i++) + if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq)) + rcu_pending_list_merge(expired, i); +} + +#ifndef __KERNEL__ +static inline void kfree_bulk(size_t nr, void ** p) +{ + while (nr--) + kfree(*p); +} + +#define local_irq_save(flags) \ +do { \ + flags = 0; \ +} while (0) +#endif + +static noinline void __process_finished_items(struct rcu_pending *pending, + struct rcu_pending_pcpu *p, + unsigned long flags) +{ + struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE]; + struct rcu_pending_seq objs = {}; + struct rcu_head *list = NULL; + + if (p->objs.nr && + __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) { + objs = p->objs.data[0]; + darray_remove_item(&p->objs, p->objs.data); + } + + merge_expired_lists(p); + + list = expired->head; + expired->head = expired->tail = NULL; + + spin_unlock_irqrestore(&p->lock, flags); + + switch ((ulong) pending->process) { + case RCU_PENDING_KVFREE: + for (size_t i = 0; i < objs.nr; ) { + size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i); + + kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i)); + i += nr_this_node; + } + genradix_free(&objs.objs); + + while (list) { + struct rcu_head *obj = list; +#ifdef __KERNEL__ + list = obj->next; +#else + list = (void *) obj->next.next; +#endif + + /* + * low bit of pointer indicates whether rcu_head needs + * to be freed - kvfree_rcu_mightsleep() + */ + BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0); + + void *ptr = (void *)(((unsigned long) obj->func) & ~1UL); + bool free_head = ((unsigned long) obj->func) & 1UL; + + kvfree(ptr); + if (free_head) + kfree(obj); + } + + break; + + case RCU_PENDING_CALL_RCU: + for (size_t i = 0; i < objs.nr; i++) { + struct rcu_head *obj = *genradix_ptr(&objs.objs, i); + obj->func(obj); + } + genradix_free(&objs.objs); + + while (list) { + struct rcu_head *obj = list; +#ifdef __KERNEL__ + list = obj->next; +#else + list = (void *) obj->next.next; +#endif + obj->func(obj); + } + break; + + default: + for (size_t i = 0; i < objs.nr; i++) + pending->process(pending, *genradix_ptr(&objs.objs, i)); + genradix_free(&objs.objs); + + while (list) { + struct rcu_head *obj = list; +#ifdef __KERNEL__ + list = obj->next; +#else + list = (void *) obj->next.next; +#endif + pending->process(pending, obj); + } + break; + } +} + +static bool process_finished_items(struct rcu_pending *pending, + struct rcu_pending_pcpu *p, + unsigned long flags) +{ + /* + * XXX: we should grab the gp seq once and avoid multiple function + * calls, this is called from __rcu_pending_enqueue() fastpath in + * may_sleep==true mode + */ + if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) || + (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) || + (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) || + p->lists[2].head) { + __process_finished_items(pending, p, flags); + return true; + } + + return false; +} + +static void rcu_pending_work(struct work_struct *work) +{ + struct rcu_pending_pcpu *p = + container_of(work, struct rcu_pending_pcpu, work); + struct rcu_pending *pending = p->parent; + unsigned long flags; + + do { + spin_lock_irqsave(&p->lock, flags); + } while (process_finished_items(pending, p, flags)); + + spin_unlock_irqrestore(&p->lock, flags); +} + +static void rcu_pending_rcu_cb(struct rcu_head *rcu) +{ + struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb); + + schedule_work_on(p->cpu, &p->work); + + unsigned long flags; + spin_lock_irqsave(&p->lock, flags); + if (__rcu_pending_has_pending(p)) { + spin_unlock_irqrestore(&p->lock, flags); + __call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb); + } else { + p->cb_armed = false; + spin_unlock_irqrestore(&p->lock, flags); + } +} + +static __always_inline struct rcu_pending_seq * +get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq) +{ + darray_for_each_reverse(p->objs, objs) + if (objs->seq == seq) + return objs; + + if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC)) + return NULL; + + return &darray_last(p->objs); +} + +static noinline bool +rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq, + struct rcu_head *head, void *ptr, + unsigned long *flags) +{ + if (ptr) { + if (!head) { + /* + * kvfree_rcu_mightsleep(): we weren't passed an + * rcu_head, but we need one: use the low bit of the + * ponter to free to flag that the head needs to be + * freed as well: + */ + ptr = (void *)(((unsigned long) ptr)|1UL); + head = kmalloc(sizeof(*head), __GFP_NOWARN); + if (!head) { + spin_unlock_irqrestore(&p->lock, *flags); + head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL); + /* + * dropped lock, did GFP_KERNEL allocation, + * check for gp expiration + */ + if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) { + kvfree(--ptr); + kfree(head); + spin_lock_irqsave(&p->lock, *flags); + return false; + } + } + } + + head->func = ptr; + } +again: + for (struct rcu_pending_list *i = p->lists; + i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) { + if (i->seq == seq) { + rcu_pending_list_add(i, head); + return false; + } + } + + for (struct rcu_pending_list *i = p->lists; + i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) { + if (!i->head) { + i->seq = seq; + rcu_pending_list_add(i, head); + return true; + } + } + + merge_expired_lists(p); + goto again; +} + +/* + * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via + * pending->pracess) once grace period elapses. + * + * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall + * back to a linked list. + * + * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a + * process callback + * + * - If @ptr and @head are both not NULL, we're kvfree_rcu() + * + * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep() + * + * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process + * expired items. + */ +static __always_inline void +__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, + void *ptr, bool may_sleep) +{ + + struct rcu_pending_pcpu *p; + struct rcu_pending_seq *objs; + struct genradix_node *new_node = NULL; + unsigned long seq, flags; + bool start_gp = false; + + BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN)); + + local_irq_save(flags); + p = this_cpu_ptr(pending->p); + spin_lock(&p->lock); + seq = __get_state_synchronize_rcu(pending->srcu); +restart: + if (may_sleep && + unlikely(process_finished_items(pending, p, flags))) + goto check_expired; + + /* + * In kvfree_rcu() mode, the radix tree is only for slab pointers so + * that we can do kfree_bulk() - vmalloc pointers always use the linked + * list: + */ + if (ptr && unlikely(is_vmalloc_addr(ptr))) + goto list_add; + + objs = get_object_radix(p, seq); + if (unlikely(!objs)) + goto list_add; + + if (unlikely(!objs->cursor)) { + /* + * New radix tree nodes must be added under @p->lock because the + * tree root is in a darray that can be resized (typically, + * genradix supports concurrent unlocked allocation of new + * nodes) - hence preallocation and the retry loop: + */ + objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs, + objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN); + if (unlikely(!objs->cursor)) { + if (may_sleep) { + spin_unlock_irqrestore(&p->lock, flags); + + gfp_t gfp = GFP_KERNEL; + if (!head) + gfp |= __GFP_NOFAIL; + + new_node = genradix_alloc_node(gfp); + if (!new_node) + may_sleep = false; + goto check_expired; + } +list_add: + start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags); + goto start_gp; + } + } + + *objs->cursor++ = ptr ?: head; + /* zero cursor if we hit the end of a radix tree node: */ + if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1))) + objs->cursor = NULL; + start_gp = !objs->nr; + objs->nr++; +start_gp: + if (unlikely(start_gp)) { + /* + * We only have one callback (ideally, we would have one for + * every outstanding graceperiod) - so if our callback is + * already in flight, we may still have to start a grace period + * (since we used get_state() above, not start_poll()) + */ + if (!p->cb_armed) { + p->cb_armed = true; + __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb); + } else { + __start_poll_synchronize_rcu(pending->srcu); + } + } + spin_unlock_irqrestore(&p->lock, flags); +free_node: + if (new_node) + genradix_free_node(new_node); + return; +check_expired: + if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) { + switch ((ulong) pending->process) { + case RCU_PENDING_KVFREE: + kvfree(ptr); + break; + case RCU_PENDING_CALL_RCU: + head->func(head); + break; + default: + pending->process(pending, head); + break; + } + goto free_node; + } + + local_irq_save(flags); + p = this_cpu_ptr(pending->p); + spin_lock(&p->lock); + goto restart; +} + +void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj) +{ + __rcu_pending_enqueue(pending, obj, NULL, true); +} + +static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p) +{ + struct rcu_head *ret = NULL; + + spin_lock_irq(&p->lock); + darray_for_each(p->objs, objs) + if (objs->nr) { + ret = *genradix_ptr(&objs->objs, --objs->nr); + objs->cursor = NULL; + if (!objs->nr) + genradix_free(&objs->objs); + goto out; + } + + static_array_for_each(p->lists, i) + if (i->head) { + ret = i->head; +#ifdef __KERNEL__ + i->head = ret->next; +#else + i->head = (void *) ret->next.next; +#endif + if (!i->head) + i->tail = NULL; + goto out; + } +out: + spin_unlock_irq(&p->lock); + + return ret; +} + +struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending) +{ + return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p)); +} + +struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending) +{ + struct rcu_head *ret = rcu_pending_dequeue(pending); + + if (ret) + return ret; + + int cpu; + for_each_possible_cpu(cpu) { + ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu)); + if (ret) + break; + } + return ret; +} + +static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending) +{ + int cpu; + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + spin_lock_irq(&p->lock); + if (__rcu_pending_has_pending(p) || p->cb_armed) { + spin_unlock_irq(&p->lock); + return true; + } + spin_unlock_irq(&p->lock); + } + + return false; +} + +void rcu_pending_exit(struct rcu_pending *pending) +{ + int cpu; + + if (!pending->p) + return; + + while (rcu_pending_has_pending_or_armed(pending)) { + __rcu_barrier(pending->srcu); + + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + flush_work(&p->work); + } + } + + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + flush_work(&p->work); + } + + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + + static_array_for_each(p->lists, i) + WARN_ON(i->head); + WARN_ON(p->objs.nr); + darray_exit(&p->objs); + } + free_percpu(pending->p); +} + +/** + * rcu_pending_init: - initialize a rcu_pending + * + * @pending: Object to init + * @srcu: May optionally be used with an srcu_struct; if NULL, uses normal + * RCU flavor + * @process: Callback function invoked on objects once their RCU barriers + * have completed; if NULL, kvfree() is used. + */ +int rcu_pending_init(struct rcu_pending *pending, + struct srcu_struct *srcu, + rcu_pending_process_fn process) +{ + pending->p = alloc_percpu(struct rcu_pending_pcpu); + if (!pending->p) + return -ENOMEM; + + int cpu; + for_each_possible_cpu(cpu) { + struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu); + p->parent = pending; + p->cpu = cpu; + spin_lock_init(&p->lock); + darray_init(&p->objs); + INIT_WORK(&p->work, rcu_pending_work); + } + + pending->srcu = srcu; + pending->process = process; + + return 0; +} diff --git a/fs/bcachefs/rcu_pending.h b/fs/bcachefs/rcu_pending.h new file mode 100644 index 000000000000..71a2f4ddaade --- /dev/null +++ b/fs/bcachefs/rcu_pending.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RCU_PENDING_H +#define _LINUX_RCU_PENDING_H + +#include + +struct rcu_pending; +typedef void (*rcu_pending_process_fn)(struct rcu_pending *, struct rcu_head *); + +struct rcu_pending_pcpu; + +struct rcu_pending { + struct rcu_pending_pcpu __percpu *p; + struct srcu_struct *srcu; + rcu_pending_process_fn process; +}; + +void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj); +struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending); +struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending); + +void rcu_pending_exit(struct rcu_pending *pending); +int rcu_pending_init(struct rcu_pending *pending, + struct srcu_struct *srcu, + rcu_pending_process_fn process); + +#endif /* _LINUX_RCU_PENDING_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index cf81e5128c3a..2d299a37cf07 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -13,6 +13,7 @@ #include "errcode.h" #include "error.h" #include "inode.h" +#include "io_write.h" #include "move.h" #include "rebalance.h" #include "subvolume.h" @@ -156,6 +157,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression); data_opts->target = r->target; + data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; if (!data_opts->rewrite_ptrs) { /* @@ -263,6 +265,7 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); data_opts->target = target; + data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; return data_opts->rewrite_ptrs != 0; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 36de1c6fe8c3..be1e7ca4362f 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -97,7 +97,7 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); bch2_shoot_down_journal_keys(c, BTREE_ID_alloc, @@ -525,17 +525,17 @@ static int read_btree_roots(struct bch_fs *c) "error reading btree root %s l=%u: %s", bch2_btree_id_str(i), r->level, bch2_err_str(ret))) { if (btree_id_is_alloc(i)) { - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations); - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info); - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus); - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs); + c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations); + c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info); + c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus); + c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); + c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs); c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); r->error = 0; - } else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) { + } else if (!(c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) { bch_info(c, "will run btree node scan"); - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes); - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); + c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes); + c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); } ret = 0; @@ -706,14 +706,14 @@ int bch2_fs_recovery(struct bch_fs *c) if (check_version_upgrade(c)) write_sb = true; - c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); if (write_sb) bch2_write_super(c); mutex_unlock(&c->sb_lock); if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); + c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); if (c->opts.fsck) set_bit(BCH_FS_fsck_running, &c->flags); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 73339a0a3111..735b8adc8f9d 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -40,7 +40,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c) set_bit(BCH_FS_may_go_rw, &c->flags); - if (keys->nr || c->opts.fsck || !c->sb.clean || c->recovery_passes_explicit) + if (keys->nr || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) return bch2_fs_read_write_early(c); return 0; } @@ -97,14 +97,14 @@ u64 bch2_recovery_passes_from_stable(u64 v) int bch2_run_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - if (c->recovery_passes_explicit & BIT_ULL(pass)) + if (c->opts.recovery_passes & BIT_ULL(pass)) return 0; bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", bch2_recovery_passes[pass], pass, bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); - c->recovery_passes_explicit |= BIT_ULL(pass); + c->opts.recovery_passes |= BIT_ULL(pass); if (c->curr_recovery_pass >= pass) { c->curr_recovery_pass = pass; @@ -161,7 +161,9 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa { struct recovery_pass_fn *p = recovery_pass_fns + pass; - if (c->recovery_passes_explicit & BIT_ULL(pass)) + if (c->opts.recovery_passes_exclude & BIT_ULL(pass)) + return false; + if (c->opts.recovery_passes & BIT_ULL(pass)) return true; if ((p->when & PASS_FSCK) && c->opts.fsck) return true; diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 1f34c92a6d11..998c0bd06802 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -123,7 +123,7 @@ static void extent_to_replicas(struct bkey_s_c k, continue; if (!p.has_ec) - r->devs[r->nr_devs++] = p.ptr.dev; + replicas_entry_add_dev(r, p.ptr.dev); else r->nr_required = 0; } @@ -140,7 +140,7 @@ static void stripe_to_replicas(struct bkey_s_c k, for (ptr = s.v->ptrs; ptr < s.v->ptrs + s.v->nr_blocks; ptr++) - r->devs[r->nr_devs++] = ptr->dev; + replicas_entry_add_dev(r, ptr->dev); } void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e, @@ -181,7 +181,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e, e->nr_required = 1; darray_for_each(devs, i) - e->devs[e->nr_devs++] = *i; + replicas_entry_add_dev(e, *i); bch2_replicas_entry_sort(e); } @@ -795,12 +795,12 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, for (unsigned i = 0; i < e->nr_devs; i++) { nr_online += test_bit(e->devs[i], devs.d); - struct bch_dev *ca = bch2_dev_rcu(c, e->devs[i]); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]); nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed; } rcu_read_unlock(); - if (nr_failed == e->nr_devs) + if (nr_online + nr_failed == e->nr_devs) continue; if (nr_online < e->nr_required) diff --git a/fs/bcachefs/replicas_format.h b/fs/bcachefs/replicas_format.h index b97208195d06..b7eff904acdb 100644 --- a/fs/bcachefs/replicas_format.h +++ b/fs/bcachefs/replicas_format.h @@ -5,7 +5,7 @@ struct bch_replicas_entry_v0 { __u8 data_type; __u8 nr_devs; - __u8 devs[]; + __u8 devs[] __counted_by(nr_devs); } __packed; struct bch_sb_field_replicas_v0 { @@ -17,7 +17,7 @@ struct bch_replicas_entry_v1 { __u8 data_type; __u8 nr_devs; __u8 nr_required; - __u8 devs[]; + __u8 devs[] __counted_by(nr_devs); } __packed; struct bch_sb_field_replicas { @@ -28,4 +28,9 @@ struct bch_sb_field_replicas { #define replicas_entry_bytes(_i) \ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) +#define replicas_entry_add_dev(e, d) ({ \ + (e)->nr_devs++; \ + (e)->devs[(e)->nr_devs - 1] = (d); \ +}) + #endif /* _BCACHEFS_REPLICAS_FORMAT_H */ diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index c57d42bb8d1b..025848a9c4c0 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -155,7 +155,7 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->sb.clean = false; mutex_unlock(&c->sb_lock); - return NULL; + return ERR_PTR(-BCH_ERR_invalid_sb_clean); } clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 4b765422dd77..02bcde3c1b02 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -465,3 +465,60 @@ void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k) __bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c)); } } + +unsigned bch2_sb_nr_devices(const struct bch_sb *sb) +{ + unsigned nr = 0; + + for (unsigned i = 0; i < sb->nr_devices; i++) + nr += bch2_member_exists((struct bch_sb *) sb, i); + return nr; +} + +int bch2_sb_member_alloc(struct bch_fs *c) +{ + unsigned dev_idx = c->sb.nr_devices; + struct bch_sb_field_members_v2 *mi; + unsigned nr_devices; + unsigned u64s; + int best = -1; + u64 best_last_mount = 0; + + if (dev_idx < BCH_SB_MEMBERS_MAX) + goto have_slot; + + for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) { + /* eventually BCH_SB_MEMBERS_MAX will be raised */ + if (dev_idx == BCH_SB_MEMBER_INVALID) + continue; + + struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx); + if (bch2_member_alive(&m)) + continue; + + u64 last_mount = le64_to_cpu(m.last_mount); + if (best < 0 || last_mount < best_last_mount) { + best = dev_idx; + best_last_mount = last_mount; + } + } + if (best >= 0) { + dev_idx = best; + goto have_slot; + } + + return -BCH_ERR_ENOSPC_sb_members; +have_slot: + nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); + + mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) + + le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64)); + + mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); + if (!mi) + return -BCH_ERR_ENOSPC_sb_members; + + c->disk_sb.sb->nr_devices = nr_devices; + return dev_idx; +} diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index dd93192ec065..762083b564ee 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -198,29 +198,37 @@ static inline struct bch_dev *bch2_dev_locked(struct bch_fs *c, unsigned dev) lockdep_is_held(&c->state_lock)); } -static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev) +static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned dev) { return c && dev < c->sb.nr_devices ? rcu_dereference(c->devs[dev]) : NULL; } +void bch2_dev_missing(struct bch_fs *, unsigned); + +static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev) +{ + struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); + if (unlikely(!ca)) + bch2_dev_missing(c, dev); + return ca; +} + static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev) { rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); if (ca) bch2_dev_get(ca); rcu_read_unlock(); return ca; } -void bch2_dev_missing(struct bch_fs *, unsigned); - static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev) { struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); - if (!ca) + if (unlikely(!ca)) bch2_dev_missing(c, dev); return ca; } @@ -307,6 +315,8 @@ static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev) return false; } +unsigned bch2_sb_nr_devices(const struct bch_sb *); + static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) { return (struct bch_member_cpu) { @@ -352,4 +362,6 @@ static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c); void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c); +int bch2_sb_member_alloc(struct bch_fs *); + #endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index c8c266cb5797..215eed4cce6d 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -270,7 +270,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, desc.hash_bkey(info, bkey_i_to_s_c(insert)), snapshot), POS(insert->k.p.inode, U64_MAX), - BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { + BTREE_ITER_slots|BTREE_ITER_intent|flags, k, ret) { if (is_visible_key(desc, inum, k)) { if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) goto found; diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index a8299ba2cab2..e62f876541fe 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -31,6 +31,51 @@ int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); int bch2_subvol_is_ro_trans(struct btree_trans *, u32); int bch2_subvol_is_ro(struct bch_fs *, u32); +static inline struct bkey_s_c +bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos end, + u32 subvolid, unsigned flags) +{ + u32 snapshot; + int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot); + if (ret) + return bkey_s_c_err(ret); + + bch2_btree_iter_set_snapshot(iter, snapshot); + return bch2_btree_iter_peek_upto_type(iter, end, flags); +} + +#define for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \ + _end, _subvolid, _flags, _k, _do) \ +({ \ + struct bkey_s_c _k; \ + int _ret3 = 0; \ + \ + do { \ + _ret3 = lockrestart_do(_trans, ({ \ + (_k) = bch2_btree_iter_peek_in_subvolume_upto_type(&(_iter), \ + _end, _subvolid, (_flags)); \ + if (!(_k).k) \ + break; \ + \ + bkey_err(_k) ?: (_do); \ + })); \ + } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret3; \ +}) + +#define for_each_btree_key_in_subvolume_upto(_trans, _iter, _btree_id, \ + _start, _end, _subvolid, _flags, _k, _do) \ +({ \ + struct btree_iter _iter; \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \ + _end, _subvolid, _flags, _k, _do); \ +}) + int bch2_delete_dead_snapshots(struct bch_fs *); void bch2_delete_dead_snapshots_async(struct bch_fs *); diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 9b10c8947828..f2ec4277c2a5 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -30,7 +30,8 @@ struct snapshot_table { }; typedef struct { - u32 subvol; + /* we can't have padding in this struct: */ + u64 subvol; u64 inum; } subvol_inum; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index c8c2ccbdfbb5..d86d5dae54c9 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -418,6 +418,9 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 && !BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb)) SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30); + + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2) + SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); } for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { @@ -1292,15 +1295,9 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, bool print_layout, unsigned fields) { - u64 fields_have = 0; - unsigned nr_devices = 0; - if (!out->nr_tabstops) printbuf_tabstop_push(out, 44); - for (int i = 0; i < sb->nr_devices; i++) - nr_devices += bch2_member_exists(sb, i); - prt_printf(out, "External UUID:\t"); pr_uuid(out, sb->user_uuid.b); prt_newline(out); @@ -1356,9 +1353,10 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, prt_newline(out); prt_printf(out, "Clean:\t%llu\n", BCH_SB_CLEAN(sb)); - prt_printf(out, "Devices:\t%u\n", nr_devices); + prt_printf(out, "Devices:\t%u\n", bch2_sb_nr_devices(sb)); prt_printf(out, "Sections:\t"); + u64 fields_have = 0; vstruct_for_each(sb, f) fields_have |= 1 << le32_to_cpu(f->type); prt_bitflags(out, bch2_sb_fields, fields_have); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index e7fa2de35014..873e4be7e1dc 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -370,7 +370,7 @@ void bch2_fs_read_only(struct bch_fs *c) test_bit(BCH_FS_clean_shutdown, &c->flags) && c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) { BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); - BUG_ON(atomic_read(&c->btree_cache.dirty)); + BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); BUG_ON(c->btree_write_buffer.inc.keys.nr); BUG_ON(c->btree_write_buffer.flushing.keys.nr); @@ -543,6 +543,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_fs_fs_io_direct_exit(c); bch2_fs_fs_io_buffered_exit(c); bch2_fs_fsio_exit(c); + bch2_fs_vfs_exit(c); bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); bch2_fs_nocow_locking_exit(c); @@ -810,7 +811,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->copy_gc_enabled = 1; c->rebalance.enabled = 1; - c->promote_whole_extents = true; c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; @@ -926,6 +926,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: bch2_fs_ec_init(c) ?: + bch2_fs_vfs_init(c) ?: bch2_fs_fsio_init(c) ?: bch2_fs_fs_io_buffered_init(c) ?: bch2_fs_fs_io_direct_init(c); @@ -1591,33 +1592,6 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, /* Device add/removal: */ -static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) -{ - struct bpos start = POS(ca->dev_idx, 0); - struct bpos end = POS(ca->dev_idx, U64_MAX); - int ret; - - /* - * We clear the LRU and need_discard btrees first so that we don't race - * with bch2_do_invalidates() and bch2_do_discards() - */ - ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, - BTREE_TRIGGER_norun, NULL) ?: - bch2_dev_usage_remove(c, ca->dev_idx); - bch_err_msg(c, ret, "removing dev alloc info"); - return ret; -} - int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) { struct bch_member *m; @@ -1729,9 +1703,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb; struct bch_dev *ca = NULL; - struct bch_sb_field_members_v2 *mi; - struct bch_member dev_mi; - unsigned dev_idx, nr_devices, u64s; struct printbuf errbuf = PRINTBUF; struct printbuf label = PRINTBUF; int ret; @@ -1741,7 +1712,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err; - dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); + struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); if (BCH_MEMBER_GROUP(&dev_mi)) { bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); @@ -1779,55 +1750,19 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err_unlock; if (dynamic_fault("bcachefs:add:no_slot")) - goto no_slot; + goto err_unlock; - if (c->sb.nr_devices < BCH_SB_MEMBERS_MAX) { - dev_idx = c->sb.nr_devices; - goto have_slot; - } - - int best = -1; - u64 best_last_mount = 0; - for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) { - struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx); - if (bch2_member_alive(&m)) - continue; - - u64 last_mount = le64_to_cpu(m.last_mount); - if (best < 0 || last_mount < best_last_mount) { - best = dev_idx; - best_last_mount = last_mount; - } - } - if (best >= 0) { - dev_idx = best; - goto have_slot; - } -no_slot: - ret = -BCH_ERR_ENOSPC_sb_members; - bch_err_msg(c, ret, "setting up new superblock"); - goto err_unlock; - -have_slot: - nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); - - mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) + - le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64)); - - mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); - if (!mi) { - ret = -BCH_ERR_ENOSPC_sb_members; + ret = bch2_sb_member_alloc(c); + if (ret < 0) { bch_err_msg(c, ret, "setting up new superblock"); goto err_unlock; } - struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); + unsigned dev_idx = ret; /* success: */ - *m = dev_mi; - m->last_mount = cpu_to_le64(ktime_get_real_seconds()); - c->disk_sb.sb->nr_devices = nr_devices; + dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds()); + *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi; ca->disk_sb.sb->dev_idx = dev_idx; bch2_dev_attach(c, ca, dev_idx); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 33f2a64c14c9..03e59f86f360 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -219,7 +219,6 @@ read_attribute(copy_gc_wait); rw_attribute(rebalance_enabled); sysfs_pd_controller_attribute(rebalance); read_attribute(rebalance_status); -rw_attribute(promote_whole_extents); read_attribute(new_stripes); @@ -234,7 +233,7 @@ write_attribute(perf_test); #define x(_name) \ static struct attribute sysfs_time_stat_##_name = \ - { .name = #_name, .mode = 0444 }; + { .name = #_name, .mode = 0644 }; BCH_TIME_STATS() #undef x @@ -245,14 +244,18 @@ static struct attribute sysfs_state_rw = { static size_t bch2_btree_cache_size(struct bch_fs *c) { + struct btree_cache *bc = &c->btree_cache; size_t ret = 0; struct btree *b; - mutex_lock(&c->btree_cache.lock); - list_for_each_entry(b, &c->btree_cache.live, list) + mutex_lock(&bc->lock); + list_for_each_entry(b, &bc->live[0].list, list) ret += btree_buf_bytes(b); - - mutex_unlock(&c->btree_cache.lock); + list_for_each_entry(b, &bc->live[1].list, list) + ret += btree_buf_bytes(b); + list_for_each_entry(b, &bc->freeable, list) + ret += btree_buf_bytes(b); + mutex_unlock(&bc->lock); return ret; } @@ -288,7 +291,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c prt_tab_rjust(out); prt_human_readable_u64(out, nr_extents - ? div_u64(sectors_uncompressed << 9, nr_extents) + ? div64_u64(sectors_uncompressed << 9, nr_extents) : 0); prt_tab_rjust(out); prt_newline(out); @@ -347,8 +350,6 @@ SHOW(bch2_fs) if (attr == &sysfs_rebalance_status) bch2_rebalance_status_to_text(out, c); - sysfs_print(promote_whole_extents, c->promote_whole_extents); - /* Debugging: */ if (attr == &sysfs_journal_debug) @@ -436,8 +437,6 @@ STORE(bch2_fs) sysfs_pd_controller_store(rebalance, &c->rebalance.pd); - sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); - /* Debugging: */ if (!test_bit(BCH_FS_started, &c->flags)) @@ -449,11 +448,12 @@ STORE(bch2_fs) return -EROFS; if (attr == &sysfs_trigger_btree_cache_shrink) { + struct btree_cache *bc = &c->btree_cache; struct shrink_control sc; sc.gfp_mask = GFP_KERNEL; sc.nr_to_scan = strtoul_or_return(buf); - c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc); + bc->live[0].shrink->scan_objects(bc->live[0].shrink, &sc); } if (attr == &sysfs_trigger_btree_key_cache_shrink) { @@ -514,7 +514,7 @@ struct attribute *bch2_fs_files[] = { &sysfs_btree_cache_size, &sysfs_btree_write_stats, - &sysfs_promote_whole_extents, + &sysfs_rebalance_status, &sysfs_compression_stats, @@ -614,7 +614,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_copy_gc_wait, &sysfs_rebalance_enabled, - &sysfs_rebalance_status, sysfs_pd_controller_files(rebalance), &sysfs_moving_ctxts, @@ -674,7 +673,7 @@ STORE(bch2_fs_opts_dir) if (ret < 0) goto err; - bch2_opt_set_sb(c, opt, v); + bch2_opt_set_sb(c, NULL, opt, v); bch2_opt_set_by_id(&c->opts, id, v); if (v && @@ -728,6 +727,13 @@ SHOW(bch2_fs_time_stats) STORE(bch2_fs_time_stats) { + struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); + +#define x(name) \ + if (attr == &sysfs_time_stat_##name) \ + bch2_time_stats_reset(&c->times[BCH_TIME_##name]); + BCH_TIME_STATS() +#undef x return size; } SYSFS_OPS(bch2_fs_time_stats); @@ -821,32 +827,17 @@ STORE(bch2_dev) { struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); struct bch_fs *c = ca->fs; - struct bch_member *mi; if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf); - mutex_lock(&c->sb_lock); - mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - - if (v != BCH_MEMBER_DISCARD(mi)) { - SET_BCH_MEMBER_DISCARD(mi, v); - bch2_write_super(c); - } - mutex_unlock(&c->sb_lock); + bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_discard, v); } if (attr == &sysfs_durability) { u64 v = strtoul_or_return(buf); - mutex_lock(&c->sb_lock); - mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - - if (v + 1 != BCH_MEMBER_DURABILITY(mi)) { - SET_BCH_MEMBER_DURABILITY(mi, v + 1); - bch2_write_super(c); - } - mutex_unlock(&c->sb_lock); + bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, v); } if (attr == &sysfs_label) { diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index 0807ce9b171a..fb3442a7c67f 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -387,7 +387,7 @@ int bch2_stdio_redirect_readline_timeout(struct stdio_redirect *stdio, seen = buf->buf.nr; char *n = memchr(buf->buf.data, '\n', seen); - if (!n && timeout != MAX_SCHEDULE_TIMEOUT && jiffies >= until) { + if (!n && timeout != MAX_SCHEDULE_TIMEOUT && time_after_eq(jiffies, until)) { spin_unlock(&buf->lock); return -ETIME; } diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c index 4508e9dcbee2..3fe82757f93a 100644 --- a/fs/bcachefs/time_stats.c +++ b/fs/bcachefs/time_stats.c @@ -151,6 +151,20 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) } } +void bch2_time_stats_reset(struct bch2_time_stats *stats) +{ + spin_lock_irq(&stats->lock); + unsigned offset = offsetof(struct bch2_time_stats, min_duration); + memset((void *) stats + offset, 0, sizeof(*stats) - offset); + + if (stats->buffer) { + int cpu; + for_each_possible_cpu(cpu) + per_cpu_ptr(stats->buffer, cpu)->nr = 0; + } + spin_unlock_irq(&stats->lock); +} + void bch2_time_stats_exit(struct bch2_time_stats *stats) { free_percpu(stats->buffer); diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h index 5df61403744b..dc6493f7bbab 100644 --- a/fs/bcachefs/time_stats.h +++ b/fs/bcachefs/time_stats.h @@ -70,6 +70,7 @@ struct time_stat_buffer { struct bch2_time_stats { spinlock_t lock; bool have_quantiles; + struct time_stat_buffer __percpu *buffer; /* all fields are in nanoseconds */ u64 min_duration; u64 max_duration; @@ -87,7 +88,6 @@ struct bch2_time_stats { struct mean_and_variance_weighted duration_stats_weighted; struct mean_and_variance_weighted freq_stats_weighted; - struct time_stat_buffer __percpu *buffer; }; struct bch2_time_stats_quantiles { @@ -142,6 +142,7 @@ static inline bool track_event_change(struct bch2_time_stats *stats, bool v) return false; } +void bch2_time_stats_reset(struct bch2_time_stats *); void bch2_time_stats_exit(struct bch2_time_stats *); void bch2_time_stats_init(struct bch2_time_stats *); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index c62f00322d1e..5597b9d6297f 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -3,7 +3,6 @@ #define TRACE_SYSTEM bcachefs #if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_BCACHEFS_H #include @@ -558,6 +557,7 @@ TRACE_EVENT(btree_path_relock_fail, __field(unsigned long, caller_ip ) __field(u8, btree_id ) __field(u8, level ) + __field(u8, path_idx) TRACE_BPOS_entries(pos) __array(char, node, 24 ) __field(u8, self_read_count ) @@ -575,7 +575,8 @@ TRACE_EVENT(btree_path_relock_fail, strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; - __entry->level = path->level; + __entry->level = level; + __entry->path_idx = path - trans->paths; TRACE_BPOS_assign(pos, path->pos); c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level); @@ -588,7 +589,7 @@ TRACE_EVENT(btree_path_relock_fail, c = six_lock_counts(&path->l[level].b->c.lock); __entry->read_count = c.n[SIX_LOCK_read]; __entry->intent_count = c.n[SIX_LOCK_intent]; - scnprintf(__entry->node, sizeof(__entry->node), "%px", b); + scnprintf(__entry->node, sizeof(__entry->node), "%px", &b->c); } __entry->iter_lock_seq = path->l[level].lock_seq; __entry->node_lock_seq = is_btree_node(path, level) @@ -596,9 +597,10 @@ TRACE_EVENT(btree_path_relock_fail, : 0; ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u", + TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, + __entry->path_idx, bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, @@ -625,6 +627,7 @@ TRACE_EVENT(btree_path_upgrade_fail, __field(unsigned long, caller_ip ) __field(u8, btree_id ) __field(u8, level ) + __field(u8, path_idx) TRACE_BPOS_entries(pos) __field(u8, locked ) __field(u8, self_read_count ) @@ -642,6 +645,7 @@ TRACE_EVENT(btree_path_upgrade_fail, __entry->caller_ip = caller_ip; __entry->btree_id = path->btree_id; __entry->level = level; + __entry->path_idx = path - trans->paths; TRACE_BPOS_assign(pos, path->pos); __entry->locked = btree_node_locked(path, level); @@ -657,9 +661,10 @@ TRACE_EVENT(btree_path_upgrade_fail, : 0; ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", + TP_printk("%s %pS\nidx %2u btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", __entry->trans_fn, (void *) __entry->caller_ip, + __entry->path_idx, bch2_btree_id_str(__entry->btree_id), __entry->pos_inode, __entry->pos_offset, @@ -1438,6 +1443,456 @@ TRACE_EVENT(error_downcast, TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip) ); +#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS + +TRACE_EVENT(update_by_path, + TP_PROTO(struct btree_trans *trans, struct btree_path *path, + struct btree_insert_entry *i, bool overwrite), + TP_ARGS(trans, path, i, overwrite), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(btree_path_idx_t, path_idx ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + __field(u8, overwrite ) + __field(btree_path_idx_t, update_idx ) + __field(btree_path_idx_t, nr_updates ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->path_idx = path - trans->paths; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos); + __entry->overwrite = overwrite; + __entry->update_idx = i - trans->updates; + __entry->nr_updates = trans->nr_updates; + ), + + TP_printk("%s path %3u btree %s pos %llu:%llu:%u overwrite %u update %u/%u", + __entry->trans_fn, + __entry->path_idx, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->overwrite, + __entry->update_idx, + __entry->nr_updates) +); + +TRACE_EVENT(btree_path_lock, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_bkey_cached_common *b), + TP_ARGS(trans, caller_ip, b), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u8, level ) + __array(char, node, 24 ) + __field(u32, lock_seq ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = b->btree_id; + __entry->level = b->level; + + scnprintf(__entry->node, sizeof(__entry->node), "%px", b); + __entry->lock_seq = six_lock_seq(&b->lock); + ), + + TP_printk("%s %pS\nbtree %s level %u node %s lock seq %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_id_str(__entry->btree_id), + __entry->level, + __entry->node, + __entry->lock_seq) +); + +DECLARE_EVENT_CLASS(btree_path_ev, + TP_PROTO(struct btree_trans *trans, struct btree_path *path), + TP_ARGS(trans, path), + + TP_STRUCT__entry( + __field(u16, idx ) + __field(u8, ref ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + __entry->idx = path - trans->paths; + __entry->ref = path->ref; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos); + ), + + TP_printk("path %3u ref %u btree %s pos %llu:%llu:%u", + __entry->idx, __entry->ref, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot) +); + +DEFINE_EVENT(btree_path_ev, btree_path_get_ll, + TP_PROTO(struct btree_trans *trans, struct btree_path *path), + TP_ARGS(trans, path) +); + +DEFINE_EVENT(btree_path_ev, btree_path_put_ll, + TP_PROTO(struct btree_trans *trans, struct btree_path *path), + TP_ARGS(trans, path) +); + +DEFINE_EVENT(btree_path_ev, btree_path_should_be_locked, + TP_PROTO(struct btree_trans *trans, struct btree_path *path), + TP_ARGS(trans, path) +); + +TRACE_EVENT(btree_path_alloc, + TP_PROTO(struct btree_trans *trans, struct btree_path *path), + TP_ARGS(trans, path), + + TP_STRUCT__entry( + __field(btree_path_idx_t, idx ) + __field(u8, locks_want ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + __entry->idx = path - trans->paths; + __entry->locks_want = path->locks_want; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos); + ), + + TP_printk("path %3u btree %s locks_want %u pos %llu:%llu:%u", + __entry->idx, + bch2_btree_id_str(__entry->btree_id), + __entry->locks_want, + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot) +); + +TRACE_EVENT(btree_path_get, + TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos), + TP_ARGS(trans, path, new_pos), + + TP_STRUCT__entry( + __field(btree_path_idx_t, idx ) + __field(u8, ref ) + __field(u8, preserve ) + __field(u8, locks_want ) + __field(u8, btree_id ) + TRACE_BPOS_entries(old_pos) + TRACE_BPOS_entries(new_pos) + ), + + TP_fast_assign( + __entry->idx = path - trans->paths; + __entry->ref = path->ref; + __entry->preserve = path->preserve; + __entry->locks_want = path->locks_want; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(old_pos, path->pos); + TRACE_BPOS_assign(new_pos, *new_pos); + ), + + TP_printk(" path %3u ref %u preserve %u btree %s locks_want %u pos %llu:%llu:%u -> %llu:%llu:%u", + __entry->idx, + __entry->ref, + __entry->preserve, + bch2_btree_id_str(__entry->btree_id), + __entry->locks_want, + __entry->old_pos_inode, + __entry->old_pos_offset, + __entry->old_pos_snapshot, + __entry->new_pos_inode, + __entry->new_pos_offset, + __entry->new_pos_snapshot) +); + +DECLARE_EVENT_CLASS(btree_path_clone, + TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), + TP_ARGS(trans, path, new), + + TP_STRUCT__entry( + __field(btree_path_idx_t, idx ) + __field(u8, new_idx ) + __field(u8, btree_id ) + __field(u8, ref ) + __field(u8, preserve ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + __entry->idx = path - trans->paths; + __entry->new_idx = new - trans->paths; + __entry->btree_id = path->btree_id; + __entry->ref = path->ref; + __entry->preserve = path->preserve; + TRACE_BPOS_assign(pos, path->pos); + ), + + TP_printk(" path %3u ref %u preserve %u btree %s %llu:%llu:%u -> %u", + __entry->idx, + __entry->ref, + __entry->preserve, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->new_idx) +); + +DEFINE_EVENT(btree_path_clone, btree_path_clone, + TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), + TP_ARGS(trans, path, new) +); + +DEFINE_EVENT(btree_path_clone, btree_path_save_pos, + TP_PROTO(struct btree_trans *trans, struct btree_path *path, struct btree_path *new), + TP_ARGS(trans, path, new) +); + +DECLARE_EVENT_CLASS(btree_path_traverse, + TP_PROTO(struct btree_trans *trans, + struct btree_path *path), + TP_ARGS(trans, path), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(btree_path_idx_t, idx ) + __field(u8, ref ) + __field(u8, preserve ) + __field(u8, should_be_locked ) + __field(u8, btree_id ) + __field(u8, level ) + TRACE_BPOS_entries(pos) + __field(u8, locks_want ) + __field(u8, nodes_locked ) + __array(char, node0, 24 ) + __array(char, node1, 24 ) + __array(char, node2, 24 ) + __array(char, node3, 24 ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + + __entry->idx = path - trans->paths; + __entry->ref = path->ref; + __entry->preserve = path->preserve; + __entry->btree_id = path->btree_id; + __entry->level = path->level; + TRACE_BPOS_assign(pos, path->pos); + + __entry->locks_want = path->locks_want; + __entry->nodes_locked = path->nodes_locked; + struct btree *b = path->l[0].b; + if (IS_ERR(b)) + strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c); + b = path->l[1].b; + if (IS_ERR(b)) + strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c); + b = path->l[2].b; + if (IS_ERR(b)) + strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node2, sizeof(__entry->node0), "%px", &b->c); + b = path->l[3].b; + if (IS_ERR(b)) + strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c); + ), + + TP_printk("%s\npath %3u ref %u preserve %u btree %s %llu:%llu:%u level %u locks_want %u\n" + "locks %u %u %u %u node %s %s %s %s", + __entry->trans_fn, + __entry->idx, + __entry->ref, + __entry->preserve, + bch2_btree_id_str(__entry->btree_id), + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->level, + __entry->locks_want, + (__entry->nodes_locked >> 6) & 3, + (__entry->nodes_locked >> 4) & 3, + (__entry->nodes_locked >> 2) & 3, + (__entry->nodes_locked >> 0) & 3, + __entry->node3, + __entry->node2, + __entry->node1, + __entry->node0) +); + +DEFINE_EVENT(btree_path_traverse, btree_path_traverse_start, + TP_PROTO(struct btree_trans *trans, + struct btree_path *path), + TP_ARGS(trans, path) +); + +DEFINE_EVENT(btree_path_traverse, btree_path_traverse_end, + TP_PROTO(struct btree_trans *trans, struct btree_path *path), + TP_ARGS(trans, path) +); + +TRACE_EVENT(btree_path_set_pos, + TP_PROTO(struct btree_trans *trans, + struct btree_path *path, + struct bpos *new_pos), + TP_ARGS(trans, path, new_pos), + + TP_STRUCT__entry( + __field(btree_path_idx_t, idx ) + __field(u8, ref ) + __field(u8, preserve ) + __field(u8, btree_id ) + TRACE_BPOS_entries(old_pos) + TRACE_BPOS_entries(new_pos) + __field(u8, locks_want ) + __field(u8, nodes_locked ) + __array(char, node0, 24 ) + __array(char, node1, 24 ) + __array(char, node2, 24 ) + __array(char, node3, 24 ) + ), + + TP_fast_assign( + __entry->idx = path - trans->paths; + __entry->ref = path->ref; + __entry->preserve = path->preserve; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(old_pos, path->pos); + TRACE_BPOS_assign(new_pos, *new_pos); + + __entry->nodes_locked = path->nodes_locked; + struct btree *b = path->l[0].b; + if (IS_ERR(b)) + strscpy(__entry->node0, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node0, sizeof(__entry->node0), "%px", &b->c); + b = path->l[1].b; + if (IS_ERR(b)) + strscpy(__entry->node1, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node1, sizeof(__entry->node0), "%px", &b->c); + b = path->l[2].b; + if (IS_ERR(b)) + strscpy(__entry->node2, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node2, sizeof(__entry->node0), "%px", &b->c); + b = path->l[3].b; + if (IS_ERR(b)) + strscpy(__entry->node3, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node0)); + else + scnprintf(__entry->node3, sizeof(__entry->node0), "%px", &b->c); + ), + + TP_printk("\npath %3u ref %u preserve %u btree %s %llu:%llu:%u -> %llu:%llu:%u\n" + "locks %u %u %u %u node %s %s %s %s", + __entry->idx, + __entry->ref, + __entry->preserve, + bch2_btree_id_str(__entry->btree_id), + __entry->old_pos_inode, + __entry->old_pos_offset, + __entry->old_pos_snapshot, + __entry->new_pos_inode, + __entry->new_pos_offset, + __entry->new_pos_snapshot, + (__entry->nodes_locked >> 6) & 3, + (__entry->nodes_locked >> 4) & 3, + (__entry->nodes_locked >> 2) & 3, + (__entry->nodes_locked >> 0) & 3, + __entry->node3, + __entry->node2, + __entry->node1, + __entry->node0) +); + +TRACE_EVENT(btree_path_free, + TP_PROTO(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup), + TP_ARGS(trans, path, dup), + + TP_STRUCT__entry( + __field(btree_path_idx_t, idx ) + __field(u8, preserve ) + __field(u8, should_be_locked) + __field(s8, dup ) + __field(u8, dup_locked ) + ), + + TP_fast_assign( + __entry->idx = path; + __entry->preserve = trans->paths[path].preserve; + __entry->should_be_locked = trans->paths[path].should_be_locked; + __entry->dup = dup ? dup - trans->paths : -1; + __entry->dup_locked = dup ? btree_node_locked(dup, dup->level) : 0; + ), + + TP_printk(" path %3u %c %c dup %2i locked %u", __entry->idx, + __entry->preserve ? 'P' : ' ', + __entry->should_be_locked ? 'S' : ' ', + __entry->dup, + __entry->dup_locked) +); + +TRACE_EVENT(btree_path_free_trans_begin, + TP_PROTO(btree_path_idx_t path), + TP_ARGS(path), + + TP_STRUCT__entry( + __field(btree_path_idx_t, idx ) + ), + + TP_fast_assign( + __entry->idx = path; + ), + + TP_printk(" path %3u", __entry->idx) +); + +#else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ +#ifndef _TRACE_BCACHEFS_H + +static inline void trace_update_by_path(struct btree_trans *trans, struct btree_path *path, + struct btree_insert_entry *i, bool overwrite) {} +static inline void trace_btree_path_lock(struct btree_trans *trans, unsigned long caller_ip, struct btree_bkey_cached_common *b) {} +static inline void trace_btree_path_get_ll(struct btree_trans *trans, struct btree_path *path) {} +static inline void trace_btree_path_put_ll(struct btree_trans *trans, struct btree_path *path) {} +static inline void trace_btree_path_should_be_locked(struct btree_trans *trans, struct btree_path *path) {} +static inline void trace_btree_path_alloc(struct btree_trans *trans, struct btree_path *path) {} +static inline void trace_btree_path_get(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {} +static inline void trace_btree_path_clone(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {} +static inline void trace_btree_path_save_pos(struct btree_trans *trans, struct btree_path *path, struct btree_path *new) {} +static inline void trace_btree_path_traverse_start(struct btree_trans *trans, struct btree_path *path) {} +static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {} +static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {} +static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {} +static inline void trace_btree_path_free_trans_begin(btree_path_idx_t path) {} + +#endif +#endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ + +#define _TRACE_BCACHEFS_H #endif /* _TRACE_BCACHEFS_H */ /* This part must be outside protection */ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 1b8554460af4..42f565c76181 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -64,7 +64,7 @@ static int bch2_pow(u64 n, u64 p, u64 *res) *res = 1; while (p--) { - if (*res > div_u64(U64_MAX, n)) + if (*res > div64_u64(U64_MAX, n)) return -ERANGE; *res *= n; } @@ -140,14 +140,14 @@ static int __bch2_strtou64_h(const char *cp, u64 *res) parse_or_ret(cp, parse_unit_suffix(cp, &b)); - if (v > div_u64(U64_MAX, b)) + if (v > div64_u64(U64_MAX, b)) return -ERANGE; v *= b; - if (f_n > div_u64(U64_MAX, b)) + if (f_n > div64_u64(U64_MAX, b)) return -ERANGE; - f_n = div_u64(f_n * b, f_d); + f_n = div64_u64(f_n * b, f_d); if (v + f_n < v) return -ERANGE; v += f_n; @@ -204,7 +204,7 @@ STRTO_H(strtoll, long long) STRTO_H(strtoull, unsigned long long) STRTO_H(strtou64, u64) -u64 bch2_read_flag_list(char *opt, const char * const list[]) +u64 bch2_read_flag_list(const char *opt, const char * const list[]) { u64 ret = 0; char *p, *s, *d = kstrdup(opt, GFP_KERNEL); @@ -214,7 +214,7 @@ u64 bch2_read_flag_list(char *opt, const char * const list[]) s = strim(d); - while ((p = strsep(&s, ","))) { + while ((p = strsep(&s, ",;"))) { int flag = match_string(list, -1, p); if (flag < 0) { @@ -360,7 +360,7 @@ void bch2_pr_time_units(struct printbuf *out, u64 ns) { const struct time_unit *u = bch2_pick_time_units(ns); - prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); + prt_printf(out, "%llu %s", div64_u64(ns, u->nsecs), u->name); } static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) @@ -477,7 +477,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; u64 q = max(quantiles->entries[i].m, last_q); - prt_printf(out, "%llu ", div_u64(q, u->nsecs)); + prt_printf(out, "%llu ", div64_u64(q, u->nsecs)); if (is_last) prt_newline(out); last_q = q; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 902b7f5406a2..fb02c1c36004 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -195,7 +195,7 @@ static inline int bch2_strtoul_h(const char *cp, long *res) bool bch2_is_zero(const void *, size_t); -u64 bch2_read_flag_list(char *, const char * const[]); +u64 bch2_read_flag_list(const char *, const char * const[]); void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); void bch2_prt_u64_base2(struct printbuf *, u64); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 331f944d73dc..56c8d3fe55a4 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -250,17 +250,27 @@ static int __bch2_xattr_emit(const char *prefix, return 0; } +static inline const char *bch2_xattr_prefix(unsigned type, struct dentry *dentry) +{ + const struct xattr_handler *handler = bch2_xattr_type_to_handler(type); + + if (!xattr_handler_can_list(handler, dentry)) + return NULL; + + return xattr_prefix(handler); +} + static int bch2_xattr_emit(struct dentry *dentry, const struct bch_xattr *xattr, struct xattr_buf *buf) { - const struct xattr_handler *handler = - bch2_xattr_type_to_handler(xattr->x_type); + const char *prefix; - return handler && (!handler->list || handler->list(dentry)) - ? __bch2_xattr_emit(handler->prefix ?: handler->name, - xattr->x_name, xattr->x_name_len, buf) - : 0; + prefix = bch2_xattr_prefix(xattr->x_type, dentry); + if (!prefix) + return 0; + + return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf); } static int bch2_xattr_list_bcachefs(struct bch_fs *c, @@ -295,54 +305,23 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct bch_fs *c = dentry->d_sb->s_fs_info; struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter; - struct bkey_s_c k; struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; u64 offset = 0, inum = inode->ei_inode.bi_inum; - u32 snapshot; - int ret; -retry: - bch2_trans_begin(trans); - iter = (struct btree_iter) { NULL }; - ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot); - if (ret) - goto err; + int ret = bch2_trans_run(c, + for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_xattrs, + POS(inum, offset), + POS(inum, U64_MAX), + inode->ei_inum.subvol, 0, k, ({ + if (k.k->type != KEY_TYPE_xattr) + continue; - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs, - SPOS(inum, offset, snapshot), - POS(inum, U64_MAX), 0, k, ret) { - if (k.k->type != KEY_TYPE_xattr) - continue; + bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); + }))) ?: + bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?: + bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); - ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); - if (ret) - break; - } - - offset = iter.pos.offset; - bch2_trans_iter_exit(trans, &iter); -err: - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); - - if (ret) - goto out; - - ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false); - if (ret) - goto out; - - ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); - if (ret) - goto out; - - return buf.used; -out: - return bch2_err_class(ret); + return ret ? bch2_err_class(ret) : buf.used; } static int bch2_xattr_get_handler(const struct xattr_handler *handler, @@ -632,10 +611,6 @@ static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { const struct xattr_handler *bch2_xattr_handlers[] = { &bch_xattr_user_handler, -#ifdef CONFIG_BCACHEFS_POSIX_ACL - &nop_posix_acl_access, - &nop_posix_acl_default, -#endif &bch_xattr_trusted_handler, &bch_xattr_security_handler, #ifndef NO_BCACHEFS_FS diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h index e9f810539552..c7916011ef34 100644 --- a/fs/bcachefs/xattr_format.h +++ b/fs/bcachefs/xattr_format.h @@ -13,7 +13,7 @@ struct bch_xattr { __u8 x_type; __u8 x_name_len; __le16 x_val_len; - __u8 x_name[]; + __u8 x_name[] __counted_by(x_name_len); } __packed __aligned(8); #endif /* _BCACHEFS_XATTR_FORMAT_H */ diff --git a/fs/inode.c b/fs/inode.c index af78f515403f..471ae4a31549 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -438,14 +438,6 @@ static void init_once(void *foo) inode_init_once(inode); } -/* - * inode->i_lock must be held - */ -void __iget(struct inode *inode) -{ - atomic_inc(&inode->i_count); -} - /* * get additional reference to inode; caller must already hold one. */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 6b8df574729c..776298fbfcb4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3148,7 +3148,14 @@ static inline bool is_zero_ino(ino_t ino) return (u32)ino == 0; } -extern void __iget(struct inode * inode); +/* + * inode->i_lock must be held + */ +static inline void __iget(struct inode *inode) +{ + atomic_inc(&inode->i_count); +} + extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void __destroy_inode(struct inode *); diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index f3512fddf3d7..5b51c3d582d6 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -41,6 +41,7 @@ #include #include #include +#include #include struct genradix_root; @@ -48,10 +49,63 @@ struct genradix_root; #define GENRADIX_NODE_SHIFT 9 #define GENRADIX_NODE_SIZE (1U << GENRADIX_NODE_SHIFT) +#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *)) +#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY) + +/* depth that's needed for a genradix that can address up to ULONG_MAX: */ +#define GENRADIX_MAX_DEPTH \ + DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT) + +#define GENRADIX_DEPTH_MASK \ + ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1)) + +static inline int genradix_depth_shift(unsigned depth) +{ + return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth; +} + +/* + * Returns size (of data, in bytes) that a tree of a given depth holds: + */ +static inline size_t genradix_depth_size(unsigned depth) +{ + return 1UL << genradix_depth_shift(depth); +} + +static inline unsigned genradix_root_to_depth(struct genradix_root *r) +{ + return (unsigned long) r & GENRADIX_DEPTH_MASK; +} + +static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r) +{ + return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK); +} + struct __genradix { struct genradix_root *root; }; +struct genradix_node { + union { + /* Interior node: */ + struct genradix_node *children[GENRADIX_ARY]; + + /* Leaf: */ + u8 data[GENRADIX_NODE_SIZE]; + }; +}; + +static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask) +{ + return kzalloc(GENRADIX_NODE_SIZE, gfp_mask); +} + +static inline void genradix_free_node(struct genradix_node *node) +{ + kfree(node); +} + /* * NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE: */ @@ -128,6 +182,30 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size) #define __genradix_idx_to_offset(_radix, _idx) \ __idx_to_offset(_idx, __genradix_obj_size(_radix)) +static inline void *__genradix_ptr_inlined(struct __genradix *radix, size_t offset) +{ + struct genradix_root *r = READ_ONCE(radix->root); + struct genradix_node *n = genradix_root_to_node(r); + unsigned level = genradix_root_to_depth(r); + unsigned shift = genradix_depth_shift(level); + + if (unlikely(ilog2(offset) >= genradix_depth_shift(level))) + return NULL; + + while (n && shift > GENRADIX_NODE_SHIFT) { + shift -= GENRADIX_ARY_SHIFT; + n = n->children[offset >> shift]; + offset &= (1UL << shift) - 1; + } + + return n ? &n->data[offset] : NULL; +} + +#define genradix_ptr_inlined(_radix, _idx) \ + (__genradix_cast(_radix) \ + __genradix_ptr_inlined(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx))) + void *__genradix_ptr(struct __genradix *, size_t); /** @@ -142,7 +220,24 @@ void *__genradix_ptr(struct __genradix *, size_t); __genradix_ptr(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _idx))) -void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t); +void *__genradix_ptr_alloc(struct __genradix *, size_t, + struct genradix_node **, gfp_t); + +#define genradix_ptr_alloc_inlined(_radix, _idx, _gfp) \ + (__genradix_cast(_radix) \ + (__genradix_ptr_inlined(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx)) ?: \ + __genradix_ptr_alloc(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx), \ + NULL, _gfp))) + +#define genradix_ptr_alloc_preallocated_inlined(_radix, _idx, _new_node, _gfp)\ + (__genradix_cast(_radix) \ + (__genradix_ptr_inlined(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx)) ?: \ + __genradix_ptr_alloc(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx), \ + _new_node, _gfp))) /** * genradix_ptr_alloc - get a pointer to a genradix entry, allocating it @@ -157,7 +252,13 @@ void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t); (__genradix_cast(_radix) \ __genradix_ptr_alloc(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _idx), \ - _gfp)) + NULL, _gfp)) + +#define genradix_ptr_alloc_preallocated(_radix, _idx, _new_node, _gfp)\ + (__genradix_cast(_radix) \ + __genradix_ptr_alloc(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx), \ + _new_node, _gfp)) struct genradix_iter { size_t offset; diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c index fa692c86f069..79e067b51488 100644 --- a/lib/generic-radix-tree.c +++ b/lib/generic-radix-tree.c @@ -5,99 +5,31 @@ #include #include -#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *)) -#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY) - -struct genradix_node { - union { - /* Interior node: */ - struct genradix_node *children[GENRADIX_ARY]; - - /* Leaf: */ - u8 data[GENRADIX_NODE_SIZE]; - }; -}; - -static inline int genradix_depth_shift(unsigned depth) -{ - return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth; -} - -/* - * Returns size (of data, in bytes) that a tree of a given depth holds: - */ -static inline size_t genradix_depth_size(unsigned depth) -{ - return 1UL << genradix_depth_shift(depth); -} - -/* depth that's needed for a genradix that can address up to ULONG_MAX: */ -#define GENRADIX_MAX_DEPTH \ - DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT) - -#define GENRADIX_DEPTH_MASK \ - ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1)) - -static inline unsigned genradix_root_to_depth(struct genradix_root *r) -{ - return (unsigned long) r & GENRADIX_DEPTH_MASK; -} - -static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r) -{ - return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK); -} - /* * Returns pointer to the specified byte @offset within @radix, or NULL if not * allocated */ void *__genradix_ptr(struct __genradix *radix, size_t offset) { - struct genradix_root *r = READ_ONCE(radix->root); - struct genradix_node *n = genradix_root_to_node(r); - unsigned level = genradix_root_to_depth(r); - - if (ilog2(offset) >= genradix_depth_shift(level)) - return NULL; - - while (1) { - if (!n) - return NULL; - if (!level) - break; - - level--; - - n = n->children[offset >> genradix_depth_shift(level)]; - offset &= genradix_depth_size(level) - 1; - } - - return &n->data[offset]; + return __genradix_ptr_inlined(radix, offset); } EXPORT_SYMBOL(__genradix_ptr); -static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask) -{ - return kzalloc(GENRADIX_NODE_SIZE, gfp_mask); -} - -static inline void genradix_free_node(struct genradix_node *node) -{ - kfree(node); -} - /* * Returns pointer to the specified byte @offset within @radix, allocating it if * necessary - newly allocated slots are always zeroed out: */ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset, + struct genradix_node **preallocated, gfp_t gfp_mask) { struct genradix_root *v = READ_ONCE(radix->root); struct genradix_node *n, *new_node = NULL; unsigned level; + if (preallocated) + swap(new_node, *preallocated); + /* Increase tree depth if necessary: */ while (1) { struct genradix_root *r = v, *new_root; @@ -281,7 +213,7 @@ int __genradix_prealloc(struct __genradix *radix, size_t size, size_t offset; for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE) - if (!__genradix_ptr_alloc(radix, offset, gfp_mask)) + if (!__genradix_ptr_alloc(radix, offset, NULL, gfp_mask)) return -ENOMEM; return 0;