bcachefs: Nocow support
This adds support for nocow mode, where we do writes in-place when
possible. Patch components:
- New boolean filesystem and inode option, nocow: note that when nocow
is enabled, data checksumming and compression are implicitly disabled
- To prevent in-place writes from racing with data moves
(data_update.c) or bucket reuse (i.e. a bucket being reused and
re-allocated while a nocow write is in flight), we have a new locking
mechanism.
Buckets can be locked for either data update or data move, using a
fixed size hash table of two_state_shared locks. We don't have any
chaining, meaning updates and moves to different buckets that hash to
the same lock will wait unnecessarily - we'll want to watch for this
becoming an issue.
- The allocator path also needs to check for in-place writes in flight
to a given bucket before giving it out: thus we add another counter
to bucket_alloc_state so we can track this.
- Fsync now may need to issue cache flushes to block devices instead of
flushing the journal. We add a device bitmask to bch_inode_info,
ei_devs_need_flush, which tracks devices that need to have flushes
issued - note that this will lead to unnecessary flushes when other
codepaths have already issued flushes, we may want to replace this with
a sequence number.
- New nocow write path: look up extents, and if they're writable write
to them - otherwise fall back to the normal COW write path.
XXX: switch to sequence numbers instead of bitmask for devs needing
journal flush
XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to
run in process context - see if we can improve this
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2022-11-02 17:12:00 -04:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
|
|
|
|
#include "bcachefs.h"
|
|
|
|
#include "nocow_locking.h"
|
|
|
|
#include "util.h"
|
|
|
|
|
2022-12-14 20:52:11 -05:00
|
|
|
#include <linux/closure.h>
|
|
|
|
|
|
|
|
bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket)
|
|
|
|
{
|
|
|
|
u64 dev_bucket = bucket_to_u64(bucket);
|
|
|
|
struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
|
|
|
|
unsigned i;
|
|
|
|
|
|
|
|
for (i = 0; i < ARRAY_SIZE(l->b); i++)
|
|
|
|
if (l->b[i] == dev_bucket && atomic_read(&l->l[i]))
|
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2023-01-26 13:36:30 -05:00
|
|
|
/*
 * sign(v): -1 if v is negative, 1 if v is positive, 0 if v is zero.
 *
 * The argument and the whole expansion are parenthesized so the macro is safe
 * with compound arguments (e.g. sign(a & b)) and inside larger expressions
 * (e.g. sign(v) + 1).  Note that @v may still be evaluated twice, so avoid
 * arguments with side effects.
 */
#define sign(v)		((v) < 0 ? -1 : (v) > 0 ? 1 : 0)
|
|
|
|
|
2022-12-14 20:52:11 -05:00
|
|
|
void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags)
|
|
|
|
{
|
|
|
|
u64 dev_bucket = bucket_to_u64(bucket);
|
|
|
|
struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket);
|
|
|
|
int lock_val = flags ? 1 : -1;
|
|
|
|
unsigned i;
|
|
|
|
|
|
|
|
for (i = 0; i < ARRAY_SIZE(l->b); i++)
|
|
|
|
if (l->b[i] == dev_bucket) {
|
2023-01-26 13:36:30 -05:00
|
|
|
BUG_ON(sign(atomic_read(&l->l[i])) != lock_val);
|
|
|
|
|
2022-12-14 20:52:11 -05:00
|
|
|
if (!atomic_sub_return(lock_val, &l->l[i]))
|
|
|
|
closure_wake_up(&l->wait);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
BUG();
|
|
|
|
}
|
|
|
|
|
|
|
|
bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l,
|
|
|
|
u64 dev_bucket, int flags)
|
|
|
|
{
|
|
|
|
int v, lock_val = flags ? 1 : -1;
|
|
|
|
unsigned i;
|
|
|
|
|
|
|
|
spin_lock(&l->lock);
|
|
|
|
|
|
|
|
for (i = 0; i < ARRAY_SIZE(l->b); i++)
|
|
|
|
if (l->b[i] == dev_bucket)
|
|
|
|
goto got_entry;
|
|
|
|
|
|
|
|
for (i = 0; i < ARRAY_SIZE(l->b); i++)
|
|
|
|
if (!atomic_read(&l->l[i])) {
|
|
|
|
l->b[i] = dev_bucket;
|
|
|
|
goto take_lock;
|
|
|
|
}
|
|
|
|
fail:
|
|
|
|
spin_unlock(&l->lock);
|
|
|
|
return false;
|
|
|
|
got_entry:
|
|
|
|
v = atomic_read(&l->l[i]);
|
|
|
|
if (lock_val > 0 ? v < 0 : v > 0)
|
|
|
|
goto fail;
|
|
|
|
take_lock:
|
|
|
|
atomic_add(lock_val, &l->l[i]);
|
|
|
|
spin_unlock(&l->lock);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
bcachefs: Nocow support
This adds support for nocow mode, where we do writes in-place when
possible. Patch components:
- New boolean filesystem and inode option, nocow: note that when nocow
is enabled, data checksumming and compression are implicitly disabled
- To prevent in-place writes from racing with data moves
(data_update.c) or bucket reuse (i.e. a bucket being reused and
re-allocated while a nocow write is in flight), we have a new locking
mechanism.
Buckets can be locked for either data update or data move, using a
fixed size hash table of two_state_shared locks. We don't have any
chaining, meaning updates and moves to different buckets that hash to
the same lock will wait unnecessarily - we'll want to watch for this
becoming an issue.
- The allocator path also needs to check for in-place writes in flight
to a given bucket before giving it out: thus we add another counter
to bucket_alloc_state so we can track this.
- Fsync now may need to issue cache flushes to block devices instead of
flushing the journal. We add a device bitmask to bch_inode_info,
ei_devs_need_flush, which tracks devices that need to have flushes
issued - note that this will lead to unnecessary flushes when other
codepaths have already issued flushes, we may want to replace this with
a sequence number.
- New nocow write path: look up extents, and if they're writable write
to them - otherwise fall back to the normal COW write path.
XXX: switch to sequence numbers instead of bitmask for devs needing
journal flush
XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to
run in process context - see if we can improve this
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2022-11-02 17:12:00 -04:00
|
|
|
void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
|
2022-12-14 20:52:11 -05:00
|
|
|
struct nocow_lock_bucket *l,
|
|
|
|
u64 dev_bucket, int flags)
|
|
|
|
{
|
|
|
|
if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) {
|
|
|
|
struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks);
|
|
|
|
u64 start_time = local_clock();
|
|
|
|
|
|
|
|
__closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags));
|
|
|
|
bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t)
|
bcachefs: Nocow support
This adds support for nocow mode, where we do writes in-place when
possible. Patch components:
- New boolean filesystem and inode option, nocow: note that when nocow
is enabled, data checksumming and compression are implicitly disabled
- To prevent in-place writes from racing with data moves
(data_update.c) or bucket reuse (i.e. a bucket being reused and
re-allocated while a nocow write is in flight, we have a new locking
mechanism.
Buckets can be locked for either data update or data move, using a
fixed size hash table of two_state_shared locks. We don't have any
chaining, meaning updates and moves to different buckets that hash to
the same lock will wait unnecessarily - we'll want to watch for this
becoming an issue.
- The allocator path also needs to check for in-place writes in flight
to a given bucket before giving it out: thus we add another counter
to bucket_alloc_state so we can track this.
- Fsync now may need to issue cache flushes to block devices instead of
flushing the journal. We add a device bitmask to bch_inode_info,
ei_devs_need_flush, which tracks devices that need to have flushes
issued - note that this will lead to unnecessary flushes when other
codepaths have already issued flushes, we may want to replace this with
a sequence number.
- New nocow write path: look up extents, and if they're writable write
to them - otherwise fall back to the normal COW write path.
XXX: switch to sequence numbers instead of bitmask for devs needing
journal flush
XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to
run in process context - see if we can improve this
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2022-11-02 17:12:00 -04:00
|
|
|
{
|
2022-12-14 20:52:11 -05:00
|
|
|
unsigned i, nr_zero = 0;
|
|
|
|
struct nocow_lock_bucket *l;
|
|
|
|
|
|
|
|
for (l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) {
|
|
|
|
unsigned v = 0;
|
|
|
|
|
|
|
|
for (i = 0; i < ARRAY_SIZE(l->l); i++)
|
|
|
|
v |= atomic_read(&l->l[i]);
|
|
|
|
|
|
|
|
if (!v) {
|
|
|
|
nr_zero++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nr_zero)
|
|
|
|
prt_printf(out, "(%u empty entries)\n", nr_zero);
|
|
|
|
nr_zero = 0;
|
|
|
|
|
|
|
|
for (i = 0; i < ARRAY_SIZE(l->l); i++)
|
|
|
|
if (atomic_read(&l->l[i]))
|
|
|
|
prt_printf(out, "%llu: %i ", l->b[i], atomic_read(&l->l[i]));
|
|
|
|
prt_newline(out);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nr_zero)
|
|
|
|
prt_printf(out, "(%u empty entries)\n", nr_zero);
|
|
|
|
}
|
|
|
|
|
|
|
|
int bch2_fs_nocow_locking_init(struct bch_fs *c)
|
|
|
|
{
|
|
|
|
unsigned i;
|
|
|
|
|
|
|
|
for (i = 0; i < ARRAY_SIZE(c->nocow_locks.l); i++)
|
|
|
|
spin_lock_init(&c->nocow_locks.l[i].lock);
|
bcachefs: Nocow support
This adds support for nocow mode, where we do writes in-place when
possible. Patch components:
- New boolean filesystem and inode option, nocow: note that when nocow
is enabled, data checksumming and compression are implicitly disabled
- To prevent in-place writes from racing with data moves
(data_update.c) or bucket reuse (i.e. a bucket being reused and
re-allocated while a nocow write is in flight, we have a new locking
mechanism.
Buckets can be locked for either data update or data move, using a
fixed size hash table of two_state_shared locks. We don't have any
chaining, meaning updates and moves to different buckets that hash to
the same lock will wait unnecessarily - we'll want to watch for this
becoming an issue.
- The allocator path also needs to check for in-place writes in flight
to a given bucket before giving it out: thus we add another counter
to bucket_alloc_state so we can track this.
- Fsync now may need to issue cache flushes to block devices instead of
flushing the journal. We add a device bitmask to bch_inode_info,
ei_devs_need_flush, which tracks devices that need to have flushes
issued - note that this will lead to unnecessary flushes when other
codepaths have already issued flushes, we may want to replace this with
a sequence number.
- New nocow write path: look up extents, and if they're writable write
to them - otherwise fall back to the normal COW write path.
XXX: switch to sequence numbers instead of bitmask for devs needing
journal flush
XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to
run in process context - see if we can improve this
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2022-11-02 17:12:00 -04:00
|
|
|
|
2022-12-14 20:52:11 -05:00
|
|
|
return 0;
|
bcachefs: Nocow support
This adds support for nocow mode, where we do writes in-place when
possible. Patch components:
- New boolean filesystem and inode option, nocow: note that when nocow
is enabled, data checksumming and compression are implicitly disabled
- To prevent in-place writes from racing with data moves
(data_update.c) or bucket reuse (i.e. a bucket being reused and
re-allocated while a nocow write is in flight, we have a new locking
mechanism.
Buckets can be locked for either data update or data move, using a
fixed size hash table of two_state_shared locks. We don't have any
chaining, meaning updates and moves to different buckets that hash to
the same lock will wait unnecessarily - we'll want to watch for this
becoming an issue.
- The allocator path also needs to check for in-place writes in flight
to a given bucket before giving it out: thus we add another counter
to bucket_alloc_state so we can track this.
- Fsync now may need to issue cache flushes to block devices instead of
flushing the journal. We add a device bitmask to bch_inode_info,
ei_devs_need_flush, which tracks devices that need to have flushes
issued - note that this will lead to unnecessary flushes when other
codepaths have already issued flushes, we may want to replace this with
a sequence number.
- New nocow write path: look up extents, and if they're writable write
to them - otherwise fall back to the normal COW write path.
XXX: switch to sequence numbers instead of bitmask for devs needing
journal flush
XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to
run in process context - see if we can improve this
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2022-11-02 17:12:00 -04:00
|
|
|
}
|