mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-04 04:06:26 +00:00
bcachefs: Run btree updates after write out of write_point
In the write path, after the writes to the block device(s) complete, we have to punt to process context to do the btree update.
Instead of using the work item embedded in op->cl, this patch switches to a per-write-point work item. This helps with two different issues:
- lock contention: btree updates to the same write point will (usually) be updating the same alloc keys
- context switch overhead: when we're bottlenecked on btree updates, having a thread (running out of a work item) check the write point for completed ops is cheaper than queueing up a new work item and waking up a kworker.
In an arbitrary benchmark, 4k random writes with fio running inside a VM, this patch resulted in a 10% improvement in total IOPS.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
parent
5f41739403
commit
b17d3cec14
@ -762,16 +762,17 @@ static struct write_point *writepoint_find(struct bch_fs *c,
|
|||||||
/*
|
/*
|
||||||
* Get us an open_bucket we can allocate from, return with it locked:
|
* Get us an open_bucket we can allocate from, return with it locked:
|
||||||
*/
|
*/
|
||||||
struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
|
int bch2_alloc_sectors_start(struct bch_fs *c,
|
||||||
unsigned target,
|
unsigned target,
|
||||||
unsigned erasure_code,
|
unsigned erasure_code,
|
||||||
struct write_point_specifier write_point,
|
struct write_point_specifier write_point,
|
||||||
struct bch_devs_list *devs_have,
|
struct bch_devs_list *devs_have,
|
||||||
unsigned nr_replicas,
|
unsigned nr_replicas,
|
||||||
unsigned nr_replicas_required,
|
unsigned nr_replicas_required,
|
||||||
enum alloc_reserve reserve,
|
enum alloc_reserve reserve,
|
||||||
unsigned flags,
|
unsigned flags,
|
||||||
struct closure *cl)
|
struct closure *cl,
|
||||||
|
struct write_point **wp_ret)
|
||||||
{
|
{
|
||||||
struct write_point *wp;
|
struct write_point *wp;
|
||||||
struct open_bucket *ob;
|
struct open_bucket *ob;
|
||||||
@ -792,7 +793,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
|
|||||||
write_points_nr = c->write_points_nr;
|
write_points_nr = c->write_points_nr;
|
||||||
have_cache = false;
|
have_cache = false;
|
||||||
|
|
||||||
wp = writepoint_find(c, write_point.v);
|
*wp_ret = wp = writepoint_find(c, write_point.v);
|
||||||
|
|
||||||
if (wp->data_type == BCH_DATA_user)
|
if (wp->data_type == BCH_DATA_user)
|
||||||
ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
|
ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
|
||||||
@ -848,7 +849,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
|
|||||||
|
|
||||||
BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
|
BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
|
||||||
|
|
||||||
return wp;
|
return 0;
|
||||||
err:
|
err:
|
||||||
open_bucket_for_each(c, &wp->ptrs, ob, i)
|
open_bucket_for_each(c, &wp->ptrs, ob, i)
|
||||||
if (ptrs.nr < ARRAY_SIZE(ptrs.v))
|
if (ptrs.nr < ARRAY_SIZE(ptrs.v))
|
||||||
@ -866,9 +867,9 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
|
|||||||
switch (ret) {
|
switch (ret) {
|
||||||
case -OPEN_BUCKETS_EMPTY:
|
case -OPEN_BUCKETS_EMPTY:
|
||||||
case -FREELIST_EMPTY:
|
case -FREELIST_EMPTY:
|
||||||
return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC);
|
return cl ? -EAGAIN : -ENOSPC;
|
||||||
case -INSUFFICIENT_DEVICES:
|
case -INSUFFICIENT_DEVICES:
|
||||||
return ERR_PTR(-EROFS);
|
return -EROFS;
|
||||||
default:
|
default:
|
||||||
BUG();
|
BUG();
|
||||||
}
|
}
|
||||||
@ -895,13 +896,13 @@ struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
|
|||||||
void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
|
void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
|
||||||
struct bkey_i *k, unsigned sectors,
|
struct bkey_i *k, unsigned sectors,
|
||||||
bool cached)
|
bool cached)
|
||||||
|
|
||||||
{
|
{
|
||||||
struct open_bucket *ob;
|
struct open_bucket *ob;
|
||||||
unsigned i;
|
unsigned i;
|
||||||
|
|
||||||
BUG_ON(sectors > wp->sectors_free);
|
BUG_ON(sectors > wp->sectors_free);
|
||||||
wp->sectors_free -= sectors;
|
wp->sectors_free -= sectors;
|
||||||
|
wp->sectors_allocated += sectors;
|
||||||
|
|
||||||
open_bucket_for_each(c, &wp->ptrs, ob, i) {
|
open_bucket_for_each(c, &wp->ptrs, ob, i) {
|
||||||
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
|
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
|
||||||
@ -942,6 +943,10 @@ static inline void writepoint_init(struct write_point *wp,
|
|||||||
{
|
{
|
||||||
mutex_init(&wp->lock);
|
mutex_init(&wp->lock);
|
||||||
wp->data_type = type;
|
wp->data_type = type;
|
||||||
|
|
||||||
|
INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
|
||||||
|
INIT_LIST_HEAD(&wp->writes);
|
||||||
|
spin_lock_init(&wp->writes_lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
void bch2_fs_allocator_foreground_init(struct bch_fs *c)
|
void bch2_fs_allocator_foreground_init(struct bch_fs *c)
|
||||||
@ -997,3 +1002,33 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static const char * const bch2_write_point_states[] = {
|
||||||
|
#define x(n) #n,
|
||||||
|
WRITE_POINT_STATES()
|
||||||
|
#undef x
|
||||||
|
NULL
|
||||||
|
};
|
||||||
|
|
||||||
|
void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
|
||||||
|
{
|
||||||
|
struct write_point *wp;
|
||||||
|
unsigned i;
|
||||||
|
|
||||||
|
for (wp = c->write_points;
|
||||||
|
wp < c->write_points + ARRAY_SIZE(c->write_points);
|
||||||
|
wp++) {
|
||||||
|
pr_buf(out, "%lu: ", wp->write_point);
|
||||||
|
bch2_hprint(out, wp->sectors_allocated);
|
||||||
|
|
||||||
|
pr_buf(out, " last wrote: ");
|
||||||
|
bch2_pr_time_units(out, sched_clock() - wp->last_used);
|
||||||
|
|
||||||
|
for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
|
||||||
|
pr_buf(out, " %s: ", bch2_write_point_states[i]);
|
||||||
|
bch2_pr_time_units(out, wp->time[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
pr_newline(out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -122,14 +122,15 @@ int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
|
|||||||
unsigned, unsigned *, bool *, enum alloc_reserve,
|
unsigned, unsigned *, bool *, enum alloc_reserve,
|
||||||
unsigned, struct closure *);
|
unsigned, struct closure *);
|
||||||
|
|
||||||
struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
|
int bch2_alloc_sectors_start(struct bch_fs *,
|
||||||
unsigned, unsigned,
|
unsigned, unsigned,
|
||||||
struct write_point_specifier,
|
struct write_point_specifier,
|
||||||
struct bch_devs_list *,
|
struct bch_devs_list *,
|
||||||
unsigned, unsigned,
|
unsigned, unsigned,
|
||||||
enum alloc_reserve,
|
enum alloc_reserve,
|
||||||
unsigned,
|
unsigned,
|
||||||
struct closure *);
|
struct closure *,
|
||||||
|
struct write_point **);
|
||||||
|
|
||||||
struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *);
|
struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *);
|
||||||
void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
|
void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
|
||||||
@ -156,4 +157,6 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *);
|
|||||||
|
|
||||||
void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
|
void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
|
||||||
|
|
||||||
|
void bch2_write_points_to_text(struct printbuf *, struct bch_fs *);
|
||||||
|
|
||||||
#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
|
#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
|
||||||
|
@ -81,18 +81,46 @@ struct dev_stripe_state {
|
|||||||
u64 next_alloc[BCH_SB_MEMBERS_MAX];
|
u64 next_alloc[BCH_SB_MEMBERS_MAX];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#define WRITE_POINT_STATES() \
|
||||||
|
x(stopped) \
|
||||||
|
x(waiting_io) \
|
||||||
|
x(waiting_work) \
|
||||||
|
x(running)
|
||||||
|
|
||||||
|
enum write_point_state {
|
||||||
|
#define x(n) WRITE_POINT_##n,
|
||||||
|
WRITE_POINT_STATES()
|
||||||
|
#undef x
|
||||||
|
WRITE_POINT_STATE_NR
|
||||||
|
};
|
||||||
|
|
||||||
struct write_point {
|
struct write_point {
|
||||||
struct hlist_node node;
|
struct {
|
||||||
struct mutex lock;
|
struct hlist_node node;
|
||||||
u64 last_used;
|
struct mutex lock;
|
||||||
unsigned long write_point;
|
u64 last_used;
|
||||||
enum bch_data_type data_type;
|
unsigned long write_point;
|
||||||
|
enum bch_data_type data_type;
|
||||||
|
|
||||||
/* calculated based on how many pointers we're actually going to use: */
|
/* calculated based on how many pointers we're actually going to use: */
|
||||||
unsigned sectors_free;
|
unsigned sectors_free;
|
||||||
|
|
||||||
struct open_buckets ptrs;
|
struct open_buckets ptrs;
|
||||||
struct dev_stripe_state stripe;
|
struct dev_stripe_state stripe;
|
||||||
|
|
||||||
|
u64 sectors_allocated;
|
||||||
|
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
|
||||||
|
|
||||||
|
struct {
|
||||||
|
struct work_struct index_update_work;
|
||||||
|
|
||||||
|
struct list_head writes;
|
||||||
|
spinlock_t writes_lock;
|
||||||
|
|
||||||
|
enum write_point_state state;
|
||||||
|
u64 last_state_change;
|
||||||
|
u64 time[WRITE_POINT_STATE_NR];
|
||||||
|
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
|
||||||
};
|
};
|
||||||
|
|
||||||
struct write_point_specifier {
|
struct write_point_specifier {
|
||||||
|
@ -191,6 +191,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
|
|||||||
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
|
struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
|
||||||
unsigned nr_reserve;
|
unsigned nr_reserve;
|
||||||
enum alloc_reserve alloc_reserve;
|
enum alloc_reserve alloc_reserve;
|
||||||
|
int ret;
|
||||||
|
|
||||||
if (flags & BTREE_INSERT_USE_RESERVE) {
|
if (flags & BTREE_INSERT_USE_RESERVE) {
|
||||||
nr_reserve = 0;
|
nr_reserve = 0;
|
||||||
@ -213,7 +214,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
|
|||||||
mutex_unlock(&c->btree_reserve_cache_lock);
|
mutex_unlock(&c->btree_reserve_cache_lock);
|
||||||
|
|
||||||
retry:
|
retry:
|
||||||
wp = bch2_alloc_sectors_start(c,
|
ret = bch2_alloc_sectors_start(c,
|
||||||
c->opts.metadata_target ?:
|
c->opts.metadata_target ?:
|
||||||
c->opts.foreground_target,
|
c->opts.foreground_target,
|
||||||
0,
|
0,
|
||||||
@ -221,9 +222,9 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
|
|||||||
&devs_have,
|
&devs_have,
|
||||||
res->nr_replicas,
|
res->nr_replicas,
|
||||||
c->opts.metadata_replicas_required,
|
c->opts.metadata_replicas_required,
|
||||||
alloc_reserve, 0, cl);
|
alloc_reserve, 0, cl, &wp);
|
||||||
if (IS_ERR(wp))
|
if (unlikely(ret))
|
||||||
return ERR_CAST(wp);
|
return ERR_PTR(ret);
|
||||||
|
|
||||||
if (wp->sectors_free < btree_sectors(c)) {
|
if (wp->sectors_free < btree_sectors(c)) {
|
||||||
struct open_bucket *ob;
|
struct open_bucket *ob;
|
||||||
|
219
fs/bcachefs/io.c
219
fs/bcachefs/io.c
@ -589,7 +589,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void __bch2_write(struct closure *);
|
static void __bch2_write(struct bch_write_op *);
|
||||||
|
|
||||||
static void bch2_write_done(struct closure *cl)
|
static void bch2_write_done(struct closure *cl)
|
||||||
{
|
{
|
||||||
@ -686,22 +686,86 @@ static void __bch2_write_index(struct bch_write_op *op)
|
|||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
|
||||||
|
{
|
||||||
|
if (state != wp->state) {
|
||||||
|
u64 now = ktime_get_ns();
|
||||||
|
|
||||||
|
if (wp->last_state_change &&
|
||||||
|
time_after64(now, wp->last_state_change))
|
||||||
|
wp->time[wp->state] += now - wp->last_state_change;
|
||||||
|
wp->state = state;
|
||||||
|
wp->last_state_change = now;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void wp_update_state(struct write_point *wp, bool running)
|
||||||
|
{
|
||||||
|
enum write_point_state state;
|
||||||
|
|
||||||
|
state = running ? WRITE_POINT_running :
|
||||||
|
!list_empty(&wp->writes) ? WRITE_POINT_waiting_io
|
||||||
|
: WRITE_POINT_stopped;
|
||||||
|
|
||||||
|
__wp_update_state(wp, state);
|
||||||
|
}
|
||||||
|
|
||||||
static void bch2_write_index(struct closure *cl)
|
static void bch2_write_index(struct closure *cl)
|
||||||
{
|
{
|
||||||
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
|
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
|
||||||
struct bch_fs *c = op->c;
|
struct write_point *wp = op->wp;
|
||||||
|
struct workqueue_struct *wq = index_update_wq(op);
|
||||||
|
|
||||||
__bch2_write_index(op);
|
barrier();
|
||||||
|
|
||||||
if (!(op->flags & BCH_WRITE_DONE)) {
|
/*
|
||||||
continue_at(cl, __bch2_write, index_update_wq(op));
|
* We're not using wp->writes_lock here, so this is racy: that's ok,
|
||||||
} else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
|
* because this is just for diagnostic purposes, and we're running out
|
||||||
bch2_journal_flush_seq_async(&c->journal,
|
* of interrupt context here so if we were to take the lock we'd have to
|
||||||
*op_journal_seq(op),
|
* switch to spin_lock_irq()/irqsave(), which is not free:
|
||||||
cl);
|
*/
|
||||||
continue_at(cl, bch2_write_done, index_update_wq(op));
|
if (wp->state == WRITE_POINT_waiting_io)
|
||||||
} else {
|
__wp_update_state(wp, WRITE_POINT_waiting_work);
|
||||||
continue_at_nobarrier(cl, bch2_write_done, NULL);
|
|
||||||
|
op->btree_update_ready = true;
|
||||||
|
queue_work(wq, &wp->index_update_work);
|
||||||
|
}
|
||||||
|
|
||||||
|
void bch2_write_point_do_index_updates(struct work_struct *work)
|
||||||
|
{
|
||||||
|
struct write_point *wp =
|
||||||
|
container_of(work, struct write_point, index_update_work);
|
||||||
|
struct bch_write_op *op;
|
||||||
|
|
||||||
|
while (1) {
|
||||||
|
spin_lock(&wp->writes_lock);
|
||||||
|
list_for_each_entry(op, &wp->writes, wp_list)
|
||||||
|
if (op->btree_update_ready) {
|
||||||
|
list_del(&op->wp_list);
|
||||||
|
goto unlock;
|
||||||
|
}
|
||||||
|
op = NULL;
|
||||||
|
unlock:
|
||||||
|
wp_update_state(wp, op != NULL);
|
||||||
|
spin_unlock(&wp->writes_lock);
|
||||||
|
|
||||||
|
if (!op)
|
||||||
|
break;
|
||||||
|
|
||||||
|
op->flags |= BCH_WRITE_IN_WORKER;
|
||||||
|
|
||||||
|
__bch2_write_index(op);
|
||||||
|
|
||||||
|
if (!(op->flags & BCH_WRITE_DONE)) {
|
||||||
|
__bch2_write(op);
|
||||||
|
} else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
|
||||||
|
bch2_journal_flush_seq_async(&op->c->journal,
|
||||||
|
*op_journal_seq(op),
|
||||||
|
&op->cl);
|
||||||
|
continue_at(&op->cl, bch2_write_done, index_update_wq(op));
|
||||||
|
} else {
|
||||||
|
bch2_write_done(&op->cl);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -734,10 +798,8 @@ static void bch2_write_endio(struct bio *bio)
|
|||||||
|
|
||||||
if (parent)
|
if (parent)
|
||||||
bio_endio(&parent->bio);
|
bio_endio(&parent->bio);
|
||||||
else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT))
|
|
||||||
closure_put(cl);
|
|
||||||
else
|
else
|
||||||
continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op));
|
closure_put(cl);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void init_append_extent(struct bch_write_op *op,
|
static void init_append_extent(struct bch_write_op *op,
|
||||||
@ -1136,19 +1198,18 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void __bch2_write(struct closure *cl)
|
static void __bch2_write(struct bch_write_op *op)
|
||||||
{
|
{
|
||||||
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
|
|
||||||
struct bch_fs *c = op->c;
|
struct bch_fs *c = op->c;
|
||||||
struct write_point *wp;
|
struct write_point *wp = NULL;
|
||||||
struct bio *bio = NULL;
|
struct bio *bio = NULL;
|
||||||
bool skip_put = true;
|
|
||||||
unsigned nofs_flags;
|
unsigned nofs_flags;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
nofs_flags = memalloc_nofs_save();
|
nofs_flags = memalloc_nofs_save();
|
||||||
again:
|
again:
|
||||||
memset(&op->failed, 0, sizeof(op->failed));
|
memset(&op->failed, 0, sizeof(op->failed));
|
||||||
|
op->btree_update_ready = false;
|
||||||
|
|
||||||
do {
|
do {
|
||||||
struct bkey_i *key_to_write;
|
struct bkey_i *key_to_write;
|
||||||
@ -1158,13 +1219,13 @@ static void __bch2_write(struct closure *cl)
|
|||||||
/* +1 for possible cache device: */
|
/* +1 for possible cache device: */
|
||||||
if (op->open_buckets.nr + op->nr_replicas + 1 >
|
if (op->open_buckets.nr + op->nr_replicas + 1 >
|
||||||
ARRAY_SIZE(op->open_buckets.v))
|
ARRAY_SIZE(op->open_buckets.v))
|
||||||
goto flush_io;
|
break;
|
||||||
|
|
||||||
if (bch2_keylist_realloc(&op->insert_keys,
|
if (bch2_keylist_realloc(&op->insert_keys,
|
||||||
op->inline_keys,
|
op->inline_keys,
|
||||||
ARRAY_SIZE(op->inline_keys),
|
ARRAY_SIZE(op->inline_keys),
|
||||||
BKEY_EXTENT_U64s_MAX))
|
BKEY_EXTENT_U64s_MAX))
|
||||||
goto flush_io;
|
break;
|
||||||
|
|
||||||
if ((op->flags & BCH_WRITE_FROM_INTERNAL) &&
|
if ((op->flags & BCH_WRITE_FROM_INTERNAL) &&
|
||||||
percpu_ref_is_dying(&c->writes)) {
|
percpu_ref_is_dying(&c->writes)) {
|
||||||
@ -1177,7 +1238,7 @@ static void __bch2_write(struct closure *cl)
|
|||||||
* freeing up space on specific disks, which means that
|
* freeing up space on specific disks, which means that
|
||||||
* allocations for specific disks may hang arbitrarily long:
|
* allocations for specific disks may hang arbitrarily long:
|
||||||
*/
|
*/
|
||||||
wp = bch2_alloc_sectors_start(c,
|
ret = bch2_alloc_sectors_start(c,
|
||||||
op->target,
|
op->target,
|
||||||
op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
|
op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
|
||||||
op->write_point,
|
op->write_point,
|
||||||
@ -1187,53 +1248,34 @@ static void __bch2_write(struct closure *cl)
|
|||||||
op->alloc_reserve,
|
op->alloc_reserve,
|
||||||
op->flags,
|
op->flags,
|
||||||
(op->flags & (BCH_WRITE_ALLOC_NOWAIT|
|
(op->flags & (BCH_WRITE_ALLOC_NOWAIT|
|
||||||
BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl);
|
BCH_WRITE_ONLY_SPECIFIED_DEVS))
|
||||||
EBUG_ON(!wp);
|
? NULL : &op->cl,
|
||||||
|
&wp);
|
||||||
if (unlikely(IS_ERR(wp))) {
|
if (unlikely(ret)) {
|
||||||
if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
|
if (unlikely(ret != -EAGAIN))
|
||||||
ret = PTR_ERR(wp);
|
|
||||||
goto err;
|
goto err;
|
||||||
}
|
|
||||||
|
|
||||||
goto flush_io;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
EBUG_ON(!wp);
|
||||||
* It's possible for the allocator to fail, put us on the
|
|
||||||
* freelist waitlist, and then succeed in one of various retry
|
|
||||||
* paths: if that happens, we need to disable the skip_put
|
|
||||||
* optimization because otherwise there won't necessarily be a
|
|
||||||
* barrier before we free the bch_write_op:
|
|
||||||
*/
|
|
||||||
if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
|
|
||||||
skip_put = false;
|
|
||||||
|
|
||||||
bch2_open_bucket_get(c, wp, &op->open_buckets);
|
bch2_open_bucket_get(c, wp, &op->open_buckets);
|
||||||
ret = bch2_write_extent(op, wp, &bio);
|
ret = bch2_write_extent(op, wp, &bio);
|
||||||
|
|
||||||
bch2_alloc_sectors_done(c, wp);
|
bch2_alloc_sectors_done(c, wp);
|
||||||
|
|
||||||
if (ret < 0)
|
if (ret < 0)
|
||||||
goto err;
|
goto err;
|
||||||
|
|
||||||
if (ret) {
|
if (!ret)
|
||||||
skip_put = false;
|
|
||||||
} else {
|
|
||||||
/*
|
|
||||||
* for the skip_put optimization this has to be set
|
|
||||||
* before we submit the bio:
|
|
||||||
*/
|
|
||||||
op->flags |= BCH_WRITE_DONE;
|
op->flags |= BCH_WRITE_DONE;
|
||||||
}
|
|
||||||
|
|
||||||
bio->bi_end_io = bch2_write_endio;
|
bio->bi_end_io = bch2_write_endio;
|
||||||
bio->bi_private = &op->cl;
|
bio->bi_private = &op->cl;
|
||||||
bio->bi_opf |= REQ_OP_WRITE;
|
bio->bi_opf |= REQ_OP_WRITE;
|
||||||
|
|
||||||
if (!skip_put)
|
closure_get(bio->bi_private);
|
||||||
closure_get(bio->bi_private);
|
|
||||||
else
|
|
||||||
op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT;
|
|
||||||
|
|
||||||
key_to_write = (void *) (op->insert_keys.keys_p +
|
key_to_write = (void *) (op->insert_keys.keys_p +
|
||||||
key_to_write_offset);
|
key_to_write_offset);
|
||||||
@ -1241,55 +1283,49 @@ static void __bch2_write(struct closure *cl)
|
|||||||
bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
|
bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
|
||||||
key_to_write);
|
key_to_write);
|
||||||
} while (ret);
|
} while (ret);
|
||||||
|
|
||||||
if (!skip_put)
|
|
||||||
continue_at(cl, bch2_write_index, index_update_wq(op));
|
|
||||||
out:
|
out:
|
||||||
|
/*
|
||||||
|
* If the write can't all be submitted at once, we generally want to
|
||||||
|
* block synchronously as that signals backpressure to the caller.
|
||||||
|
*/
|
||||||
|
if (!(op->flags & BCH_WRITE_DONE) &&
|
||||||
|
!(op->flags & BCH_WRITE_IN_WORKER)) {
|
||||||
|
closure_sync(&op->cl);
|
||||||
|
__bch2_write_index(op);
|
||||||
|
|
||||||
|
if (!(op->flags & BCH_WRITE_DONE))
|
||||||
|
goto again;
|
||||||
|
bch2_write_done(&op->cl);
|
||||||
|
} else {
|
||||||
|
spin_lock(&wp->writes_lock);
|
||||||
|
op->wp = wp;
|
||||||
|
list_add_tail(&op->wp_list, &wp->writes);
|
||||||
|
if (wp->state == WRITE_POINT_stopped)
|
||||||
|
__wp_update_state(wp, WRITE_POINT_waiting_io);
|
||||||
|
spin_unlock(&wp->writes_lock);
|
||||||
|
|
||||||
|
continue_at(&op->cl, bch2_write_index, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
memalloc_nofs_restore(nofs_flags);
|
memalloc_nofs_restore(nofs_flags);
|
||||||
return;
|
return;
|
||||||
err:
|
err:
|
||||||
op->error = ret;
|
op->error = ret;
|
||||||
op->flags |= BCH_WRITE_DONE;
|
op->flags |= BCH_WRITE_DONE;
|
||||||
|
|
||||||
continue_at(cl, bch2_write_index, index_update_wq(op));
|
|
||||||
goto out;
|
goto out;
|
||||||
flush_io:
|
|
||||||
/*
|
|
||||||
* If the write can't all be submitted at once, we generally want to
|
|
||||||
* block synchronously as that signals backpressure to the caller.
|
|
||||||
*
|
|
||||||
* However, if we're running out of a workqueue, we can't block here
|
|
||||||
* because we'll be blocking other work items from completing:
|
|
||||||
*/
|
|
||||||
if (current->flags & PF_WQ_WORKER) {
|
|
||||||
continue_at(cl, bch2_write_index, index_update_wq(op));
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
closure_sync(cl);
|
|
||||||
|
|
||||||
if (!bch2_keylist_empty(&op->insert_keys)) {
|
|
||||||
__bch2_write_index(op);
|
|
||||||
|
|
||||||
if (op->error) {
|
|
||||||
op->flags |= BCH_WRITE_DONE;
|
|
||||||
continue_at_nobarrier(cl, bch2_write_done, NULL);
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
goto again;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
|
static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
|
||||||
{
|
{
|
||||||
struct closure *cl = &op->cl;
|
|
||||||
struct bio *bio = &op->wbio.bio;
|
struct bio *bio = &op->wbio.bio;
|
||||||
struct bvec_iter iter;
|
struct bvec_iter iter;
|
||||||
struct bkey_i_inline_data *id;
|
struct bkey_i_inline_data *id;
|
||||||
unsigned sectors;
|
unsigned sectors;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
|
op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
|
||||||
|
op->flags |= BCH_WRITE_DONE;
|
||||||
|
|
||||||
bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
|
bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
|
||||||
|
|
||||||
ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
|
ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
|
||||||
@ -1317,11 +1353,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
|
|||||||
set_bkey_val_bytes(&id->k, data_len);
|
set_bkey_val_bytes(&id->k, data_len);
|
||||||
bch2_keylist_push(&op->insert_keys);
|
bch2_keylist_push(&op->insert_keys);
|
||||||
|
|
||||||
op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
|
__bch2_write_index(op);
|
||||||
op->flags |= BCH_WRITE_DONE;
|
|
||||||
|
|
||||||
continue_at_nobarrier(cl, bch2_write_index, NULL);
|
|
||||||
return;
|
|
||||||
err:
|
err:
|
||||||
bch2_write_done(&op->cl);
|
bch2_write_done(&op->cl);
|
||||||
}
|
}
|
||||||
@ -1349,6 +1381,7 @@ void bch2_write(struct closure *cl)
|
|||||||
struct bch_fs *c = op->c;
|
struct bch_fs *c = op->c;
|
||||||
unsigned data_len;
|
unsigned data_len;
|
||||||
|
|
||||||
|
EBUG_ON(op->cl.parent);
|
||||||
BUG_ON(!op->nr_replicas);
|
BUG_ON(!op->nr_replicas);
|
||||||
BUG_ON(!op->write_point.v);
|
BUG_ON(!op->write_point.v);
|
||||||
BUG_ON(!bkey_cmp(op->pos, POS_MAX));
|
BUG_ON(!bkey_cmp(op->pos, POS_MAX));
|
||||||
@ -1381,18 +1414,14 @@ void bch2_write(struct closure *cl)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
continue_at_nobarrier(cl, __bch2_write, NULL);
|
__bch2_write(op);
|
||||||
return;
|
return;
|
||||||
err:
|
err:
|
||||||
bch2_disk_reservation_put(c, &op->res);
|
bch2_disk_reservation_put(c, &op->res);
|
||||||
|
|
||||||
if (op->end_io) {
|
closure_debug_destroy(&op->cl);
|
||||||
EBUG_ON(cl->parent);
|
if (op->end_io)
|
||||||
closure_debug_destroy(cl);
|
|
||||||
op->end_io(op);
|
op->end_io(op);
|
||||||
} else {
|
|
||||||
closure_return(cl);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Cache promotion on read */
|
/* Cache promotion on read */
|
||||||
|
@ -41,7 +41,7 @@ enum bch_write_flags {
|
|||||||
__BCH_WRITE_CHECK_ENOSPC,
|
__BCH_WRITE_CHECK_ENOSPC,
|
||||||
__BCH_WRITE_MOVE,
|
__BCH_WRITE_MOVE,
|
||||||
__BCH_WRITE_JOURNAL_SEQ_PTR,
|
__BCH_WRITE_JOURNAL_SEQ_PTR,
|
||||||
__BCH_WRITE_SKIP_CLOSURE_PUT,
|
__BCH_WRITE_IN_WORKER,
|
||||||
__BCH_WRITE_DONE,
|
__BCH_WRITE_DONE,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -59,7 +59,7 @@ enum bch_write_flags {
|
|||||||
|
|
||||||
/* Internal: */
|
/* Internal: */
|
||||||
#define BCH_WRITE_JOURNAL_SEQ_PTR (1U << __BCH_WRITE_JOURNAL_SEQ_PTR)
|
#define BCH_WRITE_JOURNAL_SEQ_PTR (1U << __BCH_WRITE_JOURNAL_SEQ_PTR)
|
||||||
#define BCH_WRITE_SKIP_CLOSURE_PUT (1U << __BCH_WRITE_SKIP_CLOSURE_PUT)
|
#define BCH_WRITE_IN_WORKER (1U << __BCH_WRITE_IN_WORKER)
|
||||||
#define BCH_WRITE_DONE (1U << __BCH_WRITE_DONE)
|
#define BCH_WRITE_DONE (1U << __BCH_WRITE_DONE)
|
||||||
|
|
||||||
static inline u64 *op_journal_seq(struct bch_write_op *op)
|
static inline u64 *op_journal_seq(struct bch_write_op *op)
|
||||||
@ -115,6 +115,8 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
|
|||||||
|
|
||||||
void bch2_write(struct closure *);
|
void bch2_write(struct closure *);
|
||||||
|
|
||||||
|
void bch2_write_point_do_index_updates(struct work_struct *);
|
||||||
|
|
||||||
static inline struct bch_write_bio *wbio_init(struct bio *bio)
|
static inline struct bch_write_bio *wbio_init(struct bio *bio)
|
||||||
{
|
{
|
||||||
struct bch_write_bio *wbio = to_wbio(bio);
|
struct bch_write_bio *wbio = to_wbio(bio);
|
||||||
|
@ -119,6 +119,7 @@ struct bch_write_op {
|
|||||||
unsigned nr_replicas_required:4;
|
unsigned nr_replicas_required:4;
|
||||||
unsigned alloc_reserve:3;
|
unsigned alloc_reserve:3;
|
||||||
unsigned incompressible:1;
|
unsigned incompressible:1;
|
||||||
|
unsigned btree_update_ready:1;
|
||||||
|
|
||||||
struct bch_devs_list devs_have;
|
struct bch_devs_list devs_have;
|
||||||
u16 target;
|
u16 target;
|
||||||
@ -134,6 +135,9 @@ struct bch_write_op {
|
|||||||
|
|
||||||
struct write_point_specifier write_point;
|
struct write_point_specifier write_point;
|
||||||
|
|
||||||
|
struct write_point *wp;
|
||||||
|
struct list_head wp_list;
|
||||||
|
|
||||||
struct disk_reservation res;
|
struct disk_reservation res;
|
||||||
|
|
||||||
struct open_buckets open_buckets;
|
struct open_buckets open_buckets;
|
||||||
|
@ -791,7 +791,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
|||||||
c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
|
c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
|
||||||
|
|
||||||
if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
|
if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
|
||||||
WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
|
WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) ||
|
||||||
!(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
|
!(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
|
||||||
WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
|
WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
|
||||||
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
|
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
|
||||||
|
@ -180,6 +180,7 @@ read_attribute(btree_key_cache);
|
|||||||
read_attribute(btree_transactions);
|
read_attribute(btree_transactions);
|
||||||
read_attribute(stripes_heap);
|
read_attribute(stripes_heap);
|
||||||
read_attribute(open_buckets);
|
read_attribute(open_buckets);
|
||||||
|
read_attribute(write_points);
|
||||||
|
|
||||||
read_attribute(internal_uuid);
|
read_attribute(internal_uuid);
|
||||||
|
|
||||||
@ -418,6 +419,9 @@ SHOW(bch2_fs)
|
|||||||
if (attr == &sysfs_open_buckets)
|
if (attr == &sysfs_open_buckets)
|
||||||
bch2_open_buckets_to_text(out, c);
|
bch2_open_buckets_to_text(out, c);
|
||||||
|
|
||||||
|
if (attr == &sysfs_write_points)
|
||||||
|
bch2_write_points_to_text(out, c);
|
||||||
|
|
||||||
if (attr == &sysfs_compression_stats)
|
if (attr == &sysfs_compression_stats)
|
||||||
bch2_compression_stats_to_text(out, c);
|
bch2_compression_stats_to_text(out, c);
|
||||||
|
|
||||||
@ -563,6 +567,7 @@ struct attribute *bch2_fs_internal_files[] = {
|
|||||||
&sysfs_new_stripes,
|
&sysfs_new_stripes,
|
||||||
&sysfs_stripes_heap,
|
&sysfs_stripes_heap,
|
||||||
&sysfs_open_buckets,
|
&sysfs_open_buckets,
|
||||||
|
&sysfs_write_points,
|
||||||
&sysfs_io_timers_read,
|
&sysfs_io_timers_read,
|
||||||
&sysfs_io_timers_write,
|
&sysfs_io_timers_write,
|
||||||
|
|
||||||
|
@ -390,7 +390,7 @@ static const struct time_unit *pick_time_units(u64 ns)
|
|||||||
return u;
|
return u;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void pr_time_units(struct printbuf *out, u64 ns)
|
void bch2_pr_time_units(struct printbuf *out, u64 ns)
|
||||||
{
|
{
|
||||||
const struct time_unit *u = pick_time_units(ns);
|
const struct time_unit *u = pick_time_units(ns);
|
||||||
|
|
||||||
@ -410,13 +410,13 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
|
|||||||
freq ? div64_u64(NSEC_PER_SEC, freq) : 0);
|
freq ? div64_u64(NSEC_PER_SEC, freq) : 0);
|
||||||
|
|
||||||
pr_buf(out, "frequency:\t");
|
pr_buf(out, "frequency:\t");
|
||||||
pr_time_units(out, freq);
|
bch2_pr_time_units(out, freq);
|
||||||
|
|
||||||
pr_buf(out, "\navg duration:\t");
|
pr_buf(out, "\navg duration:\t");
|
||||||
pr_time_units(out, stats->average_duration);
|
bch2_pr_time_units(out, stats->average_duration);
|
||||||
|
|
||||||
pr_buf(out, "\nmax duration:\t");
|
pr_buf(out, "\nmax duration:\t");
|
||||||
pr_time_units(out, stats->max_duration);
|
bch2_pr_time_units(out, stats->max_duration);
|
||||||
|
|
||||||
i = eytzinger0_first(NR_QUANTILES);
|
i = eytzinger0_first(NR_QUANTILES);
|
||||||
u = pick_time_units(stats->quantiles.entries[i].m);
|
u = pick_time_units(stats->quantiles.entries[i].m);
|
||||||
|
@ -352,6 +352,8 @@ static inline void pr_sectors(struct printbuf *out, u64 v)
|
|||||||
bch2_pr_units(out, v, v << 9);
|
bch2_pr_units(out, v, v << 9);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void bch2_pr_time_units(struct printbuf *, u64);
|
||||||
|
|
||||||
#ifdef __KERNEL__
|
#ifdef __KERNEL__
|
||||||
static inline void pr_time(struct printbuf *out, u64 time)
|
static inline void pr_time(struct printbuf *out, u64 time)
|
||||||
{
|
{
|
||||||
|
Loading…
Reference in New Issue
Block a user