From 180fb49dea90dfbac591b9b201a4dfb75159f5f0 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Thu, 21 Jan 2021 21:52:06 -0500
Subject: [PATCH] bcachefs: Journal updates to dev usage

This eliminates the need to scan every bucket to regenerate dev_usage
at mount time.

Signed-off-by: Kent Overstreet
---
 fs/bcachefs/alloc_background.c |   4 --
 fs/bcachefs/bcachefs.h         |   6 +-
 fs/bcachefs/bcachefs_format.h  |  20 ++++++-
 fs/bcachefs/btree_gc.c         |  40 ++++++++-----
 fs/bcachefs/buckets.c          | 102 ++++++++++++++++++++-------------
 fs/bcachefs/buckets.h          |   7 ++-
 fs/bcachefs/journal_io.c       |  37 ++++++++++++
 fs/bcachefs/recovery.c         |  21 +++++++
 fs/bcachefs/super-io.c         |  22 ++++++-
 fs/bcachefs/super.c            |  37 +++++++-----
 10 files changed, 220 insertions(+), 76 deletions(-)

diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index bba83011b18b..aadd878b357d 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -350,10 +350,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
 		return ret;
 	}
 
-	percpu_down_write(&c->mark_lock);
-	bch2_dev_usage_from_buckets(c);
-	percpu_up_write(&c->mark_lock);
-
 	return 0;
 }
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 763cac0efa0c..0c24a5312e49 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -429,7 +429,9 @@ struct bch_dev {
 	unsigned long		*buckets_nouse;
 	struct rw_semaphore	bucket_lock;
 
-	struct bch_dev_usage __percpu *usage[2];
+	struct bch_dev_usage		*usage_base;
+	struct bch_dev_usage __percpu	*usage[JOURNAL_BUF_NR];
+	struct bch_dev_usage __percpu	*usage_gc;
 
 	/* Allocator: */
 	struct task_struct __rcu *alloc_thread;
@@ -582,6 +584,8 @@ struct bch_fs {
 
 	struct journal_entry_res replicas_journal_res;
 
+	struct journal_entry_res dev_usage_journal_res;
+
 	struct bch_disk_groups_cpu __rcu *disk_groups;
 
 	struct bch_opts		opts;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 5dab5bfd228a..9048441cfa55 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1512,7 +1512,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
 	x(blacklist_v2,	4)	\
 	x(usage,	5)	\
 	x(data_usage,	6)	\
-	x(clock,	7)
+	x(clock,	7)	\
+	x(dev_usage,	8)
 
 enum {
 #define x(f, nr)	BCH_JSET_ENTRY_##f	= nr,
@@ -1567,6 +1568,23 @@ struct jset_entry_clock {
 	__le64		time;
 } __attribute__((packed));
 
+struct jset_entry_dev_usage_type {
+	__le64		buckets;
+	__le64		sectors;
+	__le64		fragmented;
+} __attribute__((packed));
+
+struct jset_entry_dev_usage {
+	struct jset_entry	entry;
+	__le32			dev;
+	__u32			pad;
+
+	__le64			buckets_ec;
+	__le64			buckets_unavailable;
+
+	struct jset_entry_dev_usage_type d[];
+} __attribute__((packed));
+
 /*
  * On disk format for a journal entry:
  * seq is monotonically increasing; every journal entry has its own unique
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 5ea9bae09d59..d44b9c079fde 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -706,8 +706,8 @@ static void bch2_gc_free(struct bch_fs *c)
 			ca->mi.nbuckets * sizeof(struct bucket));
 		ca->buckets[1] = NULL;
 
-		free_percpu(ca->usage[1]);
-		ca->usage[1] = NULL;
+		free_percpu(ca->usage_gc);
+		ca->usage_gc = NULL;
 	}
 
 	free_percpu(c->usage_gc);
@@ -720,7 +720,7 @@ static int bch2_gc_done(struct bch_fs *c,
 	struct bch_dev *ca;
 	bool verify = (!initial ||
 		       (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
-	unsigned i;
+	unsigned i, dev;
 	int ret = 0;
 
 #define copy_field(_f, _msg, ...)					\
@@ -786,7 +786,10 @@ static int bch2_gc_done(struct bch_fs *c,
 		}
 	}
 
-	for_each_member_device(ca, c, i) {
+	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+		bch2_fs_usage_acc_to_base(c, i);
+
+	for_each_member_device(ca, c, dev) {
 		struct bucket_array *dst = __bucket_array(ca, 0);
 		struct bucket_array *src = __bucket_array(ca, 1);
 		size_t b;
@@ -801,13 +804,24 @@ static int bch2_gc_done(struct bch_fs *c,
 			dst->b[b].oldest_gen = src->b[b].oldest_gen;
 		}
+
+		{
+			struct bch_dev_usage *dst = ca->usage_base;
+			struct bch_dev_usage *src = (void *)
+				bch2_acc_percpu_u64s((void *) ca->usage_gc,
+						     dev_usage_u64s());
+
+			copy_dev_field(buckets_ec,		"buckets_ec");
+			copy_dev_field(buckets_unavailable,	"buckets_unavailable");
+
+			for (i = 0; i < BCH_DATA_NR; i++) {
+				copy_dev_field(d[i].buckets,	"%s buckets", bch2_data_types[i]);
+				copy_dev_field(d[i].sectors,	"%s sectors", bch2_data_types[i]);
+				copy_dev_field(d[i].fragmented,	"%s fragmented", bch2_data_types[i]);
+			}
+		}
 	};
 
-	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
-		bch2_fs_usage_acc_to_base(c, i);
-
-	bch2_dev_usage_from_buckets(c);
-
 	{
 		unsigned nr = fs_usage_u64s(c);
 		struct bch_fs_usage *dst = c->usage_base;
@@ -862,7 +876,7 @@ static int bch2_gc_start(struct bch_fs *c)
 
 	for_each_member_device(ca, c, i) {
 		BUG_ON(ca->buckets[1]);
-		BUG_ON(ca->usage[1]);
+		BUG_ON(ca->usage_gc);
 
 		ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
 				ca->mi.nbuckets * sizeof(struct bucket),
@@ -873,9 +887,9 @@ static int bch2_gc_start(struct bch_fs *c)
 			return -ENOMEM;
 		}
 
-		ca->usage[1] = alloc_percpu(struct bch_dev_usage);
-		if (!ca->usage[1]) {
-			bch_err(c, "error allocating ca->usage[gc]");
+		ca->usage_gc = alloc_percpu(struct bch_dev_usage);
+		if (!ca->usage_gc) {
+			bch_err(c, "error allocating ca->usage_gc");
 			percpu_ref_put(&ca->ref);
 			return -ENOMEM;
 		}
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 7b60e988df83..65ae89c80590 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -137,6 +137,7 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c)
 void bch2_fs_usage_initialize(struct bch_fs *c)
 {
 	struct bch_fs_usage *usage;
+	struct bch_dev *ca;
 	unsigned i;
 
 	percpu_down_write(&c->mark_lock);
@@ -155,6 +156,14 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
 		fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
 	}
 
+	for_each_member_device(ca, c, i) {
+		struct bch_dev_usage dev = bch2_dev_usage_read(ca);
+
+		usage->hidden += (dev.d[BCH_DATA_sb].buckets +
+				  dev.d[BCH_DATA_journal].buckets) *
+			ca->mi.bucket_size;
+	}
+
 	percpu_up_write(&c->mark_lock);
 }
 
@@ -189,14 +198,27 @@ out_pool:
 	return ret;
 }
 
+static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
+						  unsigned journal_seq,
+						  bool gc)
+{
+	return this_cpu_ptr(gc
+			    ? ca->usage_gc
+			    : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
+}
+
 struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
 {
+	struct bch_fs *c = ca->fs;
 	struct bch_dev_usage ret;
+	unsigned seq, i, u64s = dev_usage_u64s();
 
-	memset(&ret, 0, sizeof(ret));
-	acc_u64s_percpu((u64 *) &ret,
-			(u64 __percpu *) ca->usage[0],
-			sizeof(ret) / sizeof(u64));
+	do {
+		seq = read_seqcount_begin(&c->usage_lock);
+		memcpy(&ret, ca->usage_base, u64s * sizeof(u64));
+		for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+			acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s);
+	} while (read_seqcount_retry(&c->usage_lock, seq));
 
 	return ret;
 }
@@ -264,7 +286,8 @@ retry:
 
 void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
 {
-	unsigned u64s = fs_usage_u64s(c);
+	struct bch_dev *ca;
+	unsigned i, u64s = fs_usage_u64s(c);
 
 	BUG_ON(idx >= ARRAY_SIZE(c->usage));
 
@@ -275,6 +298,16 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
 			(u64 __percpu *) c->usage[idx], u64s);
 	percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
 
+	rcu_read_lock();
+	for_each_member_device_rcu(ca, c, i, NULL) {
+		u64s = dev_usage_u64s();
+
+		acc_u64s_percpu((u64 *) ca->usage_base,
+				(u64 __percpu *) ca->usage[idx], u64s);
+		percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
+	}
+	rcu_read_unlock();
+
 	write_seqcount_end(&c->usage_lock);
 	preempt_enable();
 }
@@ -459,14 +492,14 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
 				  struct bch_fs_usage *fs_usage,
 				  struct bucket_mark old, struct bucket_mark new,
-				  bool gc)
+				  u64 journal_seq, bool gc)
 {
 	struct bch_dev_usage *u;
 
 	percpu_rwsem_assert_held(&c->mark_lock);
 
 	preempt_disable();
-	u = this_cpu_ptr(ca->usage[gc]);
+	u = dev_usage_ptr(ca, journal_seq, gc);
 
 	if (bucket_type(old))
 		account_bucket(fs_usage, u, bucket_type(old),
@@ -493,31 +526,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
 		bch2_wake_allocator(ca);
 }
 
-__flatten
-void bch2_dev_usage_from_buckets(struct bch_fs *c)
-{
-	struct bch_dev *ca;
-	struct bucket_mark old = { .v.counter = 0 };
-	struct bucket_array *buckets;
-	struct bucket *g;
-	unsigned i;
-	int cpu;
-
-	c->usage_base->hidden = 0;
-
-	for_each_member_device(ca, c, i) {
-		for_each_possible_cpu(cpu)
-			memset(per_cpu_ptr(ca->usage[0], cpu), 0,
-			       sizeof(*ca->usage[0]));
-
-		buckets = bucket_array(ca);
-
-		for_each_bucket(g, buckets)
-			bch2_dev_usage_update(c, ca, c->usage_base,
-					      old, g->mark, false);
-	}
-}
-
 static inline int update_replicas(struct bch_fs *c,
 				  struct bch_fs_usage *fs_usage,
 				  struct bch_replicas_entry *r,
@@ -656,7 +664,12 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
 		new.owned_by_allocator	= owned_by_allocator;
 	}));
 
-	bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+	/*
+	 * XXX: this is wrong, this means we'll be doing updates to the percpu
+	 * buckets_alloc counter that don't have an open journal buffer and
+	 * we'll race with the machinery that accumulates that to ca->usage_base
+	 */
+	bch2_dev_usage_update(c, ca, fs_usage, old, new, 0, gc);
 
 	BUG_ON(!gc &&
 	       !owned_by_allocator && !old.owned_by_allocator);
@@ -720,7 +733,7 @@ static int bch2_mark_alloc(struct bch_fs *c,
 		}
 	}));
 
-	bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc);
+	bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc);
 
 	g->io_time[READ]	= u.read_time;
 	g->io_time[WRITE]	= u.write_time;
@@ -785,7 +798,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 
 	if (c)
 		bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc),
-				      old, new, gc);
+				      old, new, 0, gc);
 
 	return 0;
 }
@@ -966,7 +979,7 @@ static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
 	g->stripe		= k.k->p.offset;
 	g->stripe_redundancy	= s->nr_redundant;
 
-	bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+	bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
 	return 0;
 }
@@ -1033,7 +1046,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
 			      old.v.counter,
 			      new.v.counter)) != old.v.counter);
 
-	bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+	bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
 
 	BUG_ON(!gc && bucket_became_unavailable(old, new));
 
@@ -2389,13 +2402,24 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
 		  sizeof(struct bucket_array) +
 		  ca->mi.nbuckets * sizeof(struct bucket));
 
-	free_percpu(ca->usage[0]);
+	for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+		free_percpu(ca->usage[i]);
+	kfree(ca->usage_base);
 }
 
 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
 {
-	if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
+	unsigned i;
+
+	ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
+	if (!ca->usage_base)
 		return -ENOMEM;
 
+	for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
+		ca->usage[i] = alloc_percpu(struct bch_dev_usage);
+		if (!ca->usage[i])
+			return -ENOMEM;
+	}
+
 	return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
 }
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 50989d286190..c965c4d48218 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -162,8 +162,6 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
 
 struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
 
-void bch2_dev_usage_from_buckets(struct bch_fs *);
-
 static inline u64 __dev_buckets_available(struct bch_dev *ca,
 					  struct bch_dev_usage stats)
 {
@@ -207,6 +205,11 @@ static inline unsigned fs_usage_u64s(struct bch_fs *c)
 		READ_ONCE(c->replicas.nr);
 }
 
+static inline unsigned dev_usage_u64s(void)
+{
+	return sizeof(struct bch_dev_usage) / sizeof(u64);
+}
+
 void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage_online *);
 struct bch_fs_usage_online *bch2_fs_usage_scratch_get(struct bch_fs *);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index a82548983dbd..df5b375c367f 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -452,6 +452,43 @@ fsck_err:
 	return ret;
 }
 
+static int journal_entry_validate_dev_usage(struct bch_fs *c,
+					    struct jset *jset,
+					    struct jset_entry *entry,
+					    int write)
+{
+	struct jset_entry_dev_usage *u =
+		container_of(entry, struct jset_entry_dev_usage, entry);
+	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+	unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */
+	unsigned dev;
+	int ret = 0;
+
+	if (journal_entry_err_on(bytes < expected,
+				 c, "invalid journal entry dev usage: bad size (%u < %u)",
+				 bytes, expected)) {
+		journal_entry_null_range(entry, vstruct_next(entry));
+		return ret;
+	}
+
+	dev = le32_to_cpu(u->dev);
+
+	if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
+				 c, "invalid journal entry dev usage: bad dev")) {
+		journal_entry_null_range(entry, vstruct_next(entry));
+		return ret;
+	}
+
+	if (journal_entry_err_on(u->pad,
+				 c, "invalid journal entry dev usage: bad pad")) {
+		journal_entry_null_range(entry, vstruct_next(entry));
+		return ret;
+	}
+
+fsck_err:
+	return ret;
+}
+
 struct jset_entry_ops {
 	int (*validate)(struct bch_fs *, struct jset *,
 			struct jset_entry *, int);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 55f7771e11c8..7ba098adcab9 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -825,10 +825,31 @@ static int journal_replay_entry_early(struct bch_fs *c,
 	case BCH_JSET_ENTRY_data_usage: {
 		struct jset_entry_data_usage *u =
 			container_of(entry, struct jset_entry_data_usage, entry);
+
 		ret = bch2_replicas_set_usage(c, &u->r,
 					      le64_to_cpu(u->v));
 		break;
 	}
+	case BCH_JSET_ENTRY_dev_usage: {
+		struct jset_entry_dev_usage *u =
+			container_of(entry, struct jset_entry_dev_usage, entry);
+		struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
+		unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+		unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
+			sizeof(struct jset_entry_dev_usage_type);
+		unsigned i;
+
+		ca->usage_base->buckets_ec		= le64_to_cpu(u->buckets_ec);
+		ca->usage_base->buckets_unavailable	= le64_to_cpu(u->buckets_unavailable);
+
+		for (i = 0; i < nr_types; i++) {
+			ca->usage_base->d[i].buckets	= le64_to_cpu(u->d[i].buckets);
+			ca->usage_base->d[i].sectors	= le64_to_cpu(u->d[i].sectors);
+			ca->usage_base->d[i].fragmented	= le64_to_cpu(u->d[i].fragmented);
+		}
+
+		break;
+	}
 	case BCH_JSET_ENTRY_blacklist: {
 		struct jset_entry_blacklist *bl_entry =
 			container_of(entry, struct jset_entry_blacklist, entry);
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 3b082da934fb..0356541c00e2 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -986,7 +986,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
 					   struct jset_entry **end,
 					   u64 journal_seq)
 {
-	unsigned i;
+	struct bch_dev *ca;
+	unsigned i, dev;
 
 	percpu_down_read(&c->mark_lock);
 
@@ -1041,6 +1042,25 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
 				  "embedded variable length struct");
 	}
 
+	for_each_member_device(ca, c, dev) {
+		unsigned b = sizeof(struct jset_entry_dev_usage) +
+			sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
+		struct jset_entry_dev_usage *u =
+			container_of(jset_entry_init(end, b),
+				     struct jset_entry_dev_usage, entry);
+
+		u->entry.type = BCH_JSET_ENTRY_dev_usage;
+		u->dev = cpu_to_le32(dev);
+		u->buckets_ec		= cpu_to_le64(ca->usage_base->buckets_ec);
+		u->buckets_unavailable	= cpu_to_le64(ca->usage_base->buckets_unavailable);
+
+		for (i = 0; i < BCH_DATA_NR; i++) {
+			u->d[i].buckets	= cpu_to_le64(ca->usage_base->d[i].buckets);
+			u->d[i].sectors	= cpu_to_le64(ca->usage_base->d[i].sectors);
+			u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
+		}
+	}
+
 	percpu_up_read(&c->mark_lock);
 
 	for (i = 0; i < 2; i++) {
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 5f5893ab9edf..eecabeb08c94 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -155,6 +155,22 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
 	return c;
 }
 
+static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
+{
+	struct bch_dev *ca;
+	unsigned i, nr = 0, u64s =
+		(sizeof(struct jset_entry_dev_usage) +
+		 sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR) / sizeof(u64);
+
+	rcu_read_lock();
+	for_each_member_device_rcu(ca, c, i, NULL)
+		nr++;
+	rcu_read_unlock();
+
+	bch2_journal_entry_res_resize(&c->journal,
+			&c->dev_usage_journal_res, u64s * nr);
+}
+
 /* Filesystem RO/RW: */
 
 /*
@@ -780,6 +796,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	    bch2_fs_fsio_init(c))
 		goto err;
 
+	bch2_dev_usage_journal_reserve(c);
+
 	mi = bch2_sb_get_members(c->disk_sb.sb);
 	for (i = 0; i < c->sb.nr_devices; i++)
 		if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
@@ -1516,6 +1534,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 	mutex_unlock(&c->sb_lock);
 	up_write(&c->state_lock);
+
+	bch2_dev_usage_journal_reserve(c);
 	return 0;
 err:
 	if (ca->mi.state == BCH_MEMBER_STATE_RW &&
@@ -1525,19 +1545,6 @@ err:
 	return ret;
 }
 
-static void dev_usage_clear(struct bch_dev *ca)
-{
-	struct bucket_array *buckets;
-
-	percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0]));
-
-	down_read(&ca->bucket_lock);
-	buckets = bucket_array(ca);
-
-	memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets);
-	up_read(&ca->bucket_lock);
-}
-
 /* Add new device to running filesystem: */
 int bch2_dev_add(struct bch_fs *c, const char *path)
 {
@@ -1595,8 +1602,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 	if (ret)
 		goto err;
 
-	dev_usage_clear(ca);
-
 	down_write(&c->state_lock);
 	mutex_lock(&c->sb_lock);
 
@@ -1650,6 +1655,8 @@ have_slot:
 	bch2_write_super(c);
 	mutex_unlock(&c->sb_lock);
 
+	bch2_dev_usage_journal_reserve(c);
+
 	err = "error marking superblock";
 	ret = bch2_trans_mark_dev_sb(c, NULL, ca);
 	if (ret)
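
A note on the on-disk sizing (not part of the patch itself): each dev_usage journal entry is a fixed header plus one jset_entry_dev_usage_type triple per data type, which is the same arithmetic that bch2_dev_usage_journal_reserve() and journal_entry_validate_dev_usage() both rely on. Below is a minimal standalone sketch of that math; the struct mirrors (with __le32/__le64 replaced by fixed-width integers, and an 8-byte jset_entry header) and BCH_DATA_NR == 7 are assumptions transcribed from the patch, not the shared kernel headers:

#include <stdint.h>
#include <stdio.h>

/* Mirror of the 8-byte jset_entry header (assumption: matches bcachefs_format.h). */
struct jset_entry_hdr {
	uint16_t u64s;
	uint8_t  btree_id, level, type, pad[3];
};

struct jset_entry_dev_usage_type {
	uint64_t buckets, sectors, fragmented;
} __attribute__((packed));

struct jset_entry_dev_usage {
	struct jset_entry_hdr entry;
	uint32_t dev;
	uint32_t pad;
	uint64_t buckets_ec;
	uint64_t buckets_unavailable;
	struct jset_entry_dev_usage_type d[];
} __attribute__((packed));

#define BCH_DATA_NR 7	/* assumption: current number of bch data types */

int main(void)
{
	/* Same expression as bch2_dev_usage_journal_reserve() in the patch: */
	unsigned bytes = sizeof(struct jset_entry_dev_usage) +
		sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;

	/* Journal reservations are counted in u64s, hence the division: */
	printf("%u bytes -> %u u64s per member device\n",
	       bytes, bytes / (unsigned) sizeof(uint64_t));	/* 200 -> 25 */
	return 0;
}

Every member device contributes one such entry, so the reservation scales as u64s * nr, and bch2_dev_usage_journal_reserve() is re-run on device add/remove.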
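Likewise, the new read path in bch2_dev_usage_read() is a seqcount pattern: readers sum usage_base plus the per-journal-buffer percpu deltas, and retry if bch2_fs_usage_acc_to_base() folded a buffer into the base mid-copy. A toy single-file sketch of that retry loop follows; the toy_* names are hypothetical, and the kernel uses seqcount_t, percpu counters, and c->usage_lock rather than this simplified atomic counter:

#include <stdatomic.h>
#include <stdint.h>

#define NR_BUFS 2	/* stands in for JOURNAL_BUF_NR */
#define NR_U64S 4	/* stands in for dev_usage_u64s() */

struct toy_usage {
	atomic_uint seq;			/* odd = writer in progress */
	uint64_t base[NR_U64S];			/* stands in for ca->usage_base */
	uint64_t delta[NR_BUFS][NR_U64S];	/* stands in for ca->usage[i] */
};

/* Reader: sum base + all unflushed deltas; retry if a writer raced us. */
static void toy_usage_read(struct toy_usage *u, uint64_t ret[NR_U64S])
{
	unsigned seq, i, j;

	do {
		seq = atomic_load(&u->seq);
		for (i = 0; i < NR_U64S; i++) {
			ret[i] = u->base[i];
			for (j = 0; j < NR_BUFS; j++)
				ret[i] += u->delta[j][i];
		}
	} while ((seq & 1) || atomic_load(&u->seq) != seq);
}

/* Writer: fold one buffer into the base and clear it, bumping seq around it. */
static void toy_usage_acc_to_base(struct toy_usage *u, unsigned idx)
{
	unsigned i;

	atomic_fetch_add(&u->seq, 1);	/* begin write section (seq now odd) */
	for (i = 0; i < NR_U64S; i++) {
		u->base[i] += u->delta[idx][i];
		u->delta[idx][i] = 0;
	}
	atomic_fetch_add(&u->seq, 1);	/* end write section (seq even again) */
}

int main(void)
{
	struct toy_usage u = { .delta = { { 1, 2, 3, 4 } } };
	uint64_t sum[NR_U64S];

	toy_usage_read(&u, sum);	/* sum[0] == 1: base 0 + pending delta */
	toy_usage_acc_to_base(&u, 0);	/* the same totals now live in base */
	toy_usage_read(&u, sum);
	return !(sum[0] == 1);
}

The totals are invariant across the fold, which is what lets bch2_dev_usage_read() and gc run concurrently with journal-buffer accumulation.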