bcachefs: Journal updates to dev usage

This eliminates the need to scan every bucket to regenerate dev_usage at
mount time.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2021-01-21 21:52:06 -05:00 committed by Kent Overstreet
parent 2abe542087
commit 180fb49dea
10 changed files with 220 additions and 76 deletions

View File

@ -350,10 +350,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
return ret;
}
percpu_down_write(&c->mark_lock);
bch2_dev_usage_from_buckets(c);
percpu_up_write(&c->mark_lock);
return 0;
}

View File

@ -429,7 +429,9 @@ struct bch_dev {
unsigned long *buckets_nouse;
struct rw_semaphore bucket_lock;
struct bch_dev_usage __percpu *usage[2];
struct bch_dev_usage *usage_base;
struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR];
struct bch_dev_usage __percpu *usage_gc;
/* Allocator: */
struct task_struct __rcu *alloc_thread;
@ -582,6 +584,8 @@ struct bch_fs {
struct journal_entry_res replicas_journal_res;
struct journal_entry_res dev_usage_journal_res;
struct bch_disk_groups_cpu __rcu *disk_groups;
struct bch_opts opts;

View File

@ -1512,7 +1512,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(blacklist_v2, 4) \
x(usage, 5) \
x(data_usage, 6) \
x(clock, 7)
x(clock, 7) \
x(dev_usage, 8)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
@ -1567,6 +1568,23 @@ struct jset_entry_clock {
__le64 time;
} __attribute__((packed));
/*
 * Usage counters for a single data type within a dev_usage journal entry.
 * All fields are little-endian; they are copied to and from the matching
 * ca->usage_base->d[i] fields when the entry is written and replayed.
 */
struct jset_entry_dev_usage_type {
__le64 buckets;
__le64 sectors;
__le64 fragmented;
} __attribute__((packed));
/*
 * Journal entry (type BCH_JSET_ENTRY_dev_usage) recording the usage counters
 * of one member device.
 *
 * @entry:               common journal entry header
 * @dev:                 index of the member device the counters belong to
 * @pad:                 must be zero; validation rejects entries where it is set
 * @buckets_ec:          saved copy of the device's buckets_ec counter
 * @buckets_unavailable: saved copy of the device's buckets_unavailable counter
 * @d:                   per-data-type counters, indexed by data type; the
 *                       number of elements is implied by the entry size
 */
struct jset_entry_dev_usage {
struct jset_entry entry;
__le32 dev;
__u32 pad;
__le64 buckets_ec;
__le64 buckets_unavailable;
struct jset_entry_dev_usage_type d[];
} __attribute__((packed));
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique

View File

@ -706,8 +706,8 @@ static void bch2_gc_free(struct bch_fs *c)
ca->mi.nbuckets * sizeof(struct bucket));
ca->buckets[1] = NULL;
free_percpu(ca->usage[1]);
ca->usage[1] = NULL;
free_percpu(ca->usage_gc);
ca->usage_gc = NULL;
}
free_percpu(c->usage_gc);
@ -720,7 +720,7 @@ static int bch2_gc_done(struct bch_fs *c,
struct bch_dev *ca;
bool verify = (!initial ||
(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
unsigned i;
unsigned i, dev;
int ret = 0;
#define copy_field(_f, _msg, ...) \
@ -786,7 +786,10 @@ static int bch2_gc_done(struct bch_fs *c,
}
}
for_each_member_device(ca, c, i) {
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
for_each_member_device(ca, c, dev) {
struct bucket_array *dst = __bucket_array(ca, 0);
struct bucket_array *src = __bucket_array(ca, 1);
size_t b;
@ -801,13 +804,24 @@ static int bch2_gc_done(struct bch_fs *c,
dst->b[b].oldest_gen = src->b[b].oldest_gen;
}
{
struct bch_dev_usage *dst = ca->usage_base;
struct bch_dev_usage *src = (void *)
bch2_acc_percpu_u64s((void *) ca->usage_gc,
dev_usage_u64s());
copy_dev_field(buckets_ec, "buckets_ec");
copy_dev_field(buckets_unavailable, "buckets_unavailable");
for (i = 0; i < BCH_DATA_NR; i++) {
copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
}
}
};
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
bch2_dev_usage_from_buckets(c);
{
unsigned nr = fs_usage_u64s(c);
struct bch_fs_usage *dst = c->usage_base;
@ -862,7 +876,7 @@ static int bch2_gc_start(struct bch_fs *c)
for_each_member_device(ca, c, i) {
BUG_ON(ca->buckets[1]);
BUG_ON(ca->usage[1]);
BUG_ON(ca->usage_gc);
ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
@ -873,9 +887,9 @@ static int bch2_gc_start(struct bch_fs *c)
return -ENOMEM;
}
ca->usage[1] = alloc_percpu(struct bch_dev_usage);
if (!ca->usage[1]) {
bch_err(c, "error allocating ca->usage[gc]");
ca->usage_gc = alloc_percpu(struct bch_dev_usage);
if (!ca->usage_gc) {
bch_err(c, "error allocating ca->usage_gc");
percpu_ref_put(&ca->ref);
return -ENOMEM;
}

View File

@ -137,6 +137,7 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c)
void bch2_fs_usage_initialize(struct bch_fs *c)
{
struct bch_fs_usage *usage;
struct bch_dev *ca;
unsigned i;
percpu_down_write(&c->mark_lock);
@ -155,6 +156,14 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
}
for_each_member_device(ca, c, i) {
struct bch_dev_usage dev = bch2_dev_usage_read(ca);
usage->hidden += (dev.d[BCH_DATA_sb].buckets +
dev.d[BCH_DATA_journal].buckets) *
ca->mi.bucket_size;
}
percpu_up_write(&c->mark_lock);
}
@ -189,14 +198,27 @@ out_pool:
return ret;
}
/*
 * Pick this CPU's dev usage counters for @ca.
 *
 * When @gc is set the GC copy (ca->usage_gc) is used; otherwise the counters
 * are taken from the ca->usage[] slot selected by the low bits of
 * @journal_seq (journal_seq & JOURNAL_BUF_MASK).
 */
static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
						  unsigned journal_seq,
						  bool gc)
{
	if (gc)
		return this_cpu_ptr(ca->usage_gc);

	return this_cpu_ptr(ca->usage[journal_seq & JOURNAL_BUF_MASK]);
}
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
struct bch_dev_usage ret;
unsigned seq, i, u64s = dev_usage_u64s();
memset(&ret, 0, sizeof(ret));
acc_u64s_percpu((u64 *) &ret,
(u64 __percpu *) ca->usage[0],
sizeof(ret) / sizeof(u64));
do {
seq = read_seqcount_begin(&c->usage_lock);
memcpy(&ret, ca->usage_base, u64s * sizeof(u64));
for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s);
} while (read_seqcount_retry(&c->usage_lock, seq));
return ret;
}
@ -264,7 +286,8 @@ retry:
void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
{
unsigned u64s = fs_usage_u64s(c);
struct bch_dev *ca;
unsigned i, u64s = fs_usage_u64s(c);
BUG_ON(idx >= ARRAY_SIZE(c->usage));
@ -275,6 +298,16 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
(u64 __percpu *) c->usage[idx], u64s);
percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
rcu_read_lock();
for_each_member_device_rcu(ca, c, i, NULL) {
u64s = dev_usage_u64s();
acc_u64s_percpu((u64 *) ca->usage_base,
(u64 __percpu *) ca->usage[idx], u64s);
percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
}
rcu_read_unlock();
write_seqcount_end(&c->usage_lock);
preempt_enable();
}
@ -459,14 +492,14 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
struct bch_fs_usage *fs_usage,
struct bucket_mark old, struct bucket_mark new,
bool gc)
u64 journal_seq, bool gc)
{
struct bch_dev_usage *u;
percpu_rwsem_assert_held(&c->mark_lock);
preempt_disable();
u = this_cpu_ptr(ca->usage[gc]);
u = dev_usage_ptr(ca, journal_seq, gc);
if (bucket_type(old))
account_bucket(fs_usage, u, bucket_type(old),
@ -493,31 +526,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
bch2_wake_allocator(ca);
}
/*
 * Rebuild device usage from the in-memory bucket marks.
 *
 * Resets c->usage_base->hidden to 0, then for every member device zeroes each
 * possible CPU's copy of ca->usage[0] and re-accounts every bucket by calling
 * bch2_dev_usage_update() with an all-zero old mark and the bucket's current
 * mark (non-gc).
 *
 * NOTE(review): the callers visible here hold c->mark_lock for write around
 * this call — confirm for any other caller.
 */
__flatten
void bch2_dev_usage_from_buckets(struct bch_fs *c)
{
struct bch_dev *ca;
/* "Old" mark of zero, so each bucket is accounted from scratch */
struct bucket_mark old = { .v.counter = 0 };
struct bucket_array *buckets;
struct bucket *g;
unsigned i;
int cpu;
c->usage_base->hidden = 0;
for_each_member_device(ca, c, i) {
for_each_possible_cpu(cpu)
memset(per_cpu_ptr(ca->usage[0], cpu), 0,
sizeof(*ca->usage[0]));
buckets = bucket_array(ca);
for_each_bucket(g, buckets)
bch2_dev_usage_update(c, ca, c->usage_base,
old, g->mark, false);
}
}
static inline int update_replicas(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
struct bch_replicas_entry *r,
@ -656,7 +664,12 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
new.owned_by_allocator = owned_by_allocator;
}));
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
/*
* XXX: this is wrong, this means we'll be doing updates to the percpu
* buckets_alloc counter that don't have an open journal buffer and
* we'll race with the machinery that accumulates that to ca->usage_base
*/
bch2_dev_usage_update(c, ca, fs_usage, old, new, 0, gc);
BUG_ON(!gc &&
!owned_by_allocator && !old.owned_by_allocator);
@ -720,7 +733,7 @@ static int bch2_mark_alloc(struct bch_fs *c,
}
}));
bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc);
bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc);
g->io_time[READ] = u.read_time;
g->io_time[WRITE] = u.write_time;
@ -785,7 +798,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (c)
bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc),
old, new, gc);
old, new, 0, gc);
return 0;
}
@ -966,7 +979,7 @@ static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
g->stripe = k.k->p.offset;
g->stripe_redundancy = s->nr_redundant;
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
return 0;
}
@ -1033,7 +1046,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
old.v.counter,
new.v.counter)) != old.v.counter);
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
BUG_ON(!gc && bucket_became_unavailable(old, new));
@ -2389,13 +2402,24 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
free_percpu(ca->usage[0]);
for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
free_percpu(ca->usage[i]);
kfree(ca->usage_base);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
unsigned i;
ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
if (!ca->usage_base)
return -ENOMEM;
for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
ca->usage[i] = alloc_percpu(struct bch_dev_usage);
if (!ca->usage[i])
return -ENOMEM;
}
return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);;
}

View File

@ -162,8 +162,6 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
void bch2_dev_usage_from_buckets(struct bch_fs *);
static inline u64 __dev_buckets_available(struct bch_dev *ca,
struct bch_dev_usage stats)
{
@ -207,6 +205,11 @@ static inline unsigned fs_usage_u64s(struct bch_fs *c)
READ_ONCE(c->replicas.nr);
}
/*
 * Size of struct bch_dev_usage expressed as a count of u64 words, for use
 * with the helpers that treat the struct as a flat array of u64 counters.
 */
static inline unsigned dev_usage_u64s(void)
{
	unsigned nr_u64s = sizeof(struct bch_dev_usage) / sizeof(u64);

	return nr_u64s;
}
void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage_online *);
struct bch_fs_usage_online *bch2_fs_usage_scratch_get(struct bch_fs *);

View File

@ -452,6 +452,43 @@ fsck_err:
return ret;
}
/*
 * Validate a BCH_JSET_ENTRY_dev_usage journal entry.
 *
 * Checks, in order, that the entry is at least large enough to hold the fixed
 * header plus seven per-type records, that the device index refers to an
 * existing device, and that the pad field is zero. When a check fails the
 * entry is passed to journal_entry_null_range() and validation stops.
 *
 * Returns ret, which is initialised to 0 here.
 * NOTE(review): journal_entry_err_on() is defined elsewhere; it presumably
 * uses the local ret/write variables and the fsck_err label — confirm before
 * renaming or restructuring any of them.
 */
static int journal_entry_validate_dev_usage(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
int write)
{
struct jset_entry_dev_usage *u =
container_of(entry, struct jset_entry_dev_usage, entry);
/* Total size of the entry in bytes, as derived from its u64s field */
unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
/*
 * Minimum acceptable size. The type count is a literal rather than
 * BCH_DATA_NR, so this lower bound does not move if BCH_DATA_NR changes.
 */
unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */
unsigned dev;
int ret = 0;
/* Entry too small to contain the expected counters */
if (journal_entry_err_on(bytes < expected,
c, "invalid journal entry dev usage: bad size (%u < %u)",
bytes, expected)) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
/* Device index is stored little-endian on disk */
dev = le32_to_cpu(u->dev);
if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
c, "invalid journal entry dev usage: bad dev")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
/* Padding must be zero */
if (journal_entry_err_on(u->pad,
c, "invalid journal entry dev usage: bad pad")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
fsck_err:
return ret;
}
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, int);

View File

@ -825,10 +825,31 @@ static int journal_replay_entry_early(struct bch_fs *c,
case BCH_JSET_ENTRY_data_usage: {
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
ret = bch2_replicas_set_usage(c, &u->r,
le64_to_cpu(u->v));
break;
}
case BCH_JSET_ENTRY_dev_usage: {
	/*
	 * Restore a device's usage_base counters from a dev_usage journal
	 * entry.
	 */
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	/* u->dev is little-endian on disk; convert before using as an index */
	struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	/* Number of per-type records, implied by the size of the entry */
	unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
		sizeof(struct jset_entry_dev_usage_type);
	unsigned i;

	/*
	 * The entry size comes from disk and may describe more data types
	 * than this build knows about; never index past usage_base->d[].
	 */
	if (nr_types > BCH_DATA_NR)
		nr_types = BCH_DATA_NR;

	ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
	ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable);

	for (i = 0; i < nr_types; i++) {
		ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
		ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
		ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
	}
	break;
}
case BCH_JSET_ENTRY_blacklist: {
struct jset_entry_blacklist *bl_entry =
container_of(entry, struct jset_entry_blacklist, entry);

View File

@ -986,7 +986,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry **end,
u64 journal_seq)
{
unsigned i;
struct bch_dev *ca;
unsigned i, dev;
percpu_down_read(&c->mark_lock);
@ -1041,6 +1042,25 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
"embedded variable length struct");
}
for_each_member_device(ca, c, dev) {
unsigned b = sizeof(struct jset_entry_dev_usage) +
sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
struct jset_entry_dev_usage *u =
container_of(jset_entry_init(end, b),
struct jset_entry_dev_usage, entry);
u->entry.type = BCH_JSET_ENTRY_dev_usage;
u->dev = cpu_to_le32(dev);
u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
u->buckets_unavailable = cpu_to_le64(ca->usage_base->buckets_unavailable);
for (i = 0; i < BCH_DATA_NR; i++) {
u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
}
}
percpu_up_read(&c->mark_lock);
for (i = 0; i < 2; i++) {

View File

@ -155,6 +155,22 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
return c;
}
static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i, nr = 0, u64s =
(sizeof(struct jset_entry_dev_usage) +
sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR);
rcu_read_lock();
for_each_member_device_rcu(ca, c, i, NULL)
nr++;
rcu_read_unlock();
bch2_journal_entry_res_resize(&c->journal,
&c->dev_usage_journal_res, u64s * nr);
}
/* Filesystem RO/RW: */
/*
@ -780,6 +796,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_fsio_init(c))
goto err;
bch2_dev_usage_journal_reserve(c);
mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
@ -1516,6 +1534,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
mutex_unlock(&c->sb_lock);
up_write(&c->state_lock);
bch2_dev_usage_journal_reserve(c);
return 0;
err:
if (ca->mi.state == BCH_MEMBER_STATE_RW &&
@ -1525,19 +1545,6 @@ err:
return ret;
}
/*
 * Reset the accounting state of @ca: zero the per-CPU counters in
 * ca->usage[0], then, holding ca->bucket_lock for read, zero every entry of
 * the device's bucket array.
 */
static void dev_usage_clear(struct bch_dev *ca)
{
struct bucket_array *buckets;
percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0]));
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets);
up_read(&ca->bucket_lock);
}
/* Add new device to running filesystem: */
int bch2_dev_add(struct bch_fs *c, const char *path)
{
@ -1595,8 +1602,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (ret)
goto err;
dev_usage_clear(ca);
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
@ -1650,6 +1655,8 @@ have_slot:
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
bch2_dev_usage_journal_reserve(c);
err = "error marking superblock";
ret = bch2_trans_mark_dev_sb(c, NULL, ca);
if (ret)