mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-01 18:53:30 +00:00
io-wq: make buffered file write hashed work map per-ctx
Before the io-wq thread change, we maintained a hash work map and lock per-node per-ring. That wasn't ideal, as we really wanted it to be per ring. But now that we have per-task workers, the hash map ends up being just per-task. That'll work just fine for the normal case of having one task use a ring, but if you share the ring between tasks, then it's considerably worse than it was before.

Make the hash map per ctx instead, which provides full per-ctx buffered write serialization on hashed writes.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
parent
eb2de9418d
commit
e941894eae
85
fs/io-wq.c
85
fs/io-wq.c
@ -87,7 +87,6 @@ struct io_wqe {
|
|||||||
struct {
|
struct {
|
||||||
raw_spinlock_t lock;
|
raw_spinlock_t lock;
|
||||||
struct io_wq_work_list work_list;
|
struct io_wq_work_list work_list;
|
||||||
unsigned long hash_map;
|
|
||||||
unsigned flags;
|
unsigned flags;
|
||||||
} ____cacheline_aligned_in_smp;
|
} ____cacheline_aligned_in_smp;
|
||||||
|
|
||||||
@ -97,6 +96,8 @@ struct io_wqe {
|
|||||||
struct hlist_nulls_head free_list;
|
struct hlist_nulls_head free_list;
|
||||||
struct list_head all_list;
|
struct list_head all_list;
|
||||||
|
|
||||||
|
struct wait_queue_entry wait;
|
||||||
|
|
||||||
struct io_wq *wq;
|
struct io_wq *wq;
|
||||||
struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
|
struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS];
|
||||||
};
|
};
|
||||||
@ -113,6 +114,9 @@ struct io_wq {
|
|||||||
|
|
||||||
struct task_struct *manager;
|
struct task_struct *manager;
|
||||||
struct user_struct *user;
|
struct user_struct *user;
|
||||||
|
|
||||||
|
struct io_wq_hash *hash;
|
||||||
|
|
||||||
refcount_t refs;
|
refcount_t refs;
|
||||||
struct completion done;
|
struct completion done;
|
||||||
|
|
||||||
@ -328,14 +332,31 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work)
|
|||||||
return work->flags >> IO_WQ_HASH_SHIFT;
|
return work->flags >> IO_WQ_HASH_SHIFT;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Queue @wqe on the shared hash wait queue (wq->hash->wait) so that it is
 * woken when a hashed-work bit is cleared.
 *
 * @wqe:  the wqe whose embedded wait entry (wqe->wait) is queued
 * @hash: the hash bucket the caller found busy in wq->hash->map
 *
 * Called from io_get_next_work() after it has dropped wqe->lock. All list
 * manipulation here is done under wq->hash->wait.lock.
 */
static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
|
||||||
|
{
|
||||||
|
struct io_wq *wq = wqe->wq;
|
||||||
|
|
||||||
|
spin_lock(&wq->hash->wait.lock);
|
||||||
|
/* only queue if the entry is not already on a wait list */
if (list_empty(&wqe->wait.entry)) {
|
||||||
|
__add_wait_queue(&wq->hash->wait, &wqe->wait);
|
||||||
|
/*
 * Re-check the bit after queueing: if it is no longer set, undo the
 * queueing and mark the task runnable again. NOTE(review): presumably
 * this closes the race with the bit being cleared before the entry
 * was queued, in which case no wakeup would arrive - confirm.
 */
if (!test_bit(hash, &wq->hash->map)) {
|
||||||
|
__set_current_state(TASK_RUNNING);
|
||||||
|
list_del_init(&wqe->wait.entry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
spin_unlock(&wq->hash->wait.lock);
|
||||||
|
}
|
||||||
|
|
||||||
static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
|
static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
|
||||||
__must_hold(wqe->lock)
|
__must_hold(wqe->lock)
|
||||||
{
|
{
|
||||||
struct io_wq_work_node *node, *prev;
|
struct io_wq_work_node *node, *prev;
|
||||||
struct io_wq_work *work, *tail;
|
struct io_wq_work *work, *tail;
|
||||||
unsigned int hash;
|
unsigned int stall_hash = -1U;
|
||||||
|
|
||||||
wq_list_for_each(node, prev, &wqe->work_list) {
|
wq_list_for_each(node, prev, &wqe->work_list) {
|
||||||
|
unsigned int hash;
|
||||||
|
|
||||||
work = container_of(node, struct io_wq_work, list);
|
work = container_of(node, struct io_wq_work, list);
|
||||||
|
|
||||||
/* not hashed, can run anytime */
|
/* not hashed, can run anytime */
|
||||||
@ -344,16 +365,26 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe)
|
|||||||
return work;
|
return work;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* hashed, can run if not already running */
|
|
||||||
hash = io_get_work_hash(work);
|
hash = io_get_work_hash(work);
|
||||||
if (!(wqe->hash_map & BIT(hash))) {
|
/* all items with this hash lie in [work, tail] */
|
||||||
wqe->hash_map |= BIT(hash);
|
tail = wqe->hash_tail[hash];
|
||||||
/* all items with this hash lie in [work, tail] */
|
|
||||||
tail = wqe->hash_tail[hash];
|
/* hashed, can run if not already running */
|
||||||
|
if (!test_and_set_bit(hash, &wqe->wq->hash->map)) {
|
||||||
wqe->hash_tail[hash] = NULL;
|
wqe->hash_tail[hash] = NULL;
|
||||||
wq_list_cut(&wqe->work_list, &tail->list, prev);
|
wq_list_cut(&wqe->work_list, &tail->list, prev);
|
||||||
return work;
|
return work;
|
||||||
}
|
}
|
||||||
|
if (stall_hash == -1U)
|
||||||
|
stall_hash = hash;
|
||||||
|
/* fast forward to a next hash, for-each will fix up @prev */
|
||||||
|
node = &tail->list;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (stall_hash != -1U) {
|
||||||
|
raw_spin_unlock(&wqe->lock);
|
||||||
|
io_wait_on_hash(wqe, stall_hash);
|
||||||
|
raw_spin_lock(&wqe->lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
return NULL;
|
return NULL;
|
||||||
@ -421,6 +452,7 @@ static void io_worker_handle_work(struct io_worker *worker)
|
|||||||
if (!work)
|
if (!work)
|
||||||
break;
|
break;
|
||||||
io_assign_current_work(worker, work);
|
io_assign_current_work(worker, work);
|
||||||
|
__set_current_state(TASK_RUNNING);
|
||||||
|
|
||||||
/* handle a whole dependent link */
|
/* handle a whole dependent link */
|
||||||
do {
|
do {
|
||||||
@ -444,8 +476,10 @@ static void io_worker_handle_work(struct io_worker *worker)
|
|||||||
io_wqe_enqueue(wqe, linked);
|
io_wqe_enqueue(wqe, linked);
|
||||||
|
|
||||||
if (hash != -1U && !next_hashed) {
|
if (hash != -1U && !next_hashed) {
|
||||||
|
clear_bit(hash, &wq->hash->map);
|
||||||
|
if (wq_has_sleeper(&wq->hash->wait))
|
||||||
|
wake_up(&wq->hash->wait);
|
||||||
raw_spin_lock_irq(&wqe->lock);
|
raw_spin_lock_irq(&wqe->lock);
|
||||||
wqe->hash_map &= ~BIT_ULL(hash);
|
|
||||||
wqe->flags &= ~IO_WQE_FLAG_STALLED;
|
wqe->flags &= ~IO_WQE_FLAG_STALLED;
|
||||||
/* skip unnecessary unlock-lock wqe->lock */
|
/* skip unnecessary unlock-lock wqe->lock */
|
||||||
if (!work)
|
if (!work)
|
||||||
@ -471,7 +505,6 @@ static int io_wqe_worker(void *data)
|
|||||||
loop:
|
loop:
|
||||||
raw_spin_lock_irq(&wqe->lock);
|
raw_spin_lock_irq(&wqe->lock);
|
||||||
if (io_wqe_run_queue(wqe)) {
|
if (io_wqe_run_queue(wqe)) {
|
||||||
__set_current_state(TASK_RUNNING);
|
|
||||||
io_worker_handle_work(worker);
|
io_worker_handle_work(worker);
|
||||||
goto loop;
|
goto loop;
|
||||||
}
|
}
|
||||||
@ -928,6 +961,24 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
|
|||||||
return IO_WQ_CANCEL_NOTFOUND;
|
return IO_WQ_CANCEL_NOTFOUND;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Wait-queue callback for a wqe's hash wait entry; io_wq_create() installs
 * it as wqe->wait.func.
 *
 * @wait: the wait entry embedded in a struct io_wqe
 * @mode, @sync, @key: not used by this callback
 *
 * Removes the entry from the wait list, then tries to activate a free
 * worker on the owning wqe. If none could be activated, the manager task
 * is woken instead. Always returns 1.
 */
static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode,
|
||||||
|
int sync, void *key)
|
||||||
|
{
|
||||||
|
/* recover the wqe that embeds this wait entry */
struct io_wqe *wqe = container_of(wait, struct io_wqe, wait);
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
/* dequeue and re-initialise, so list_empty() on the entry is true again */
list_del_init(&wait->entry);
|
||||||
|
|
||||||
|
/* the free-worker lookup is done inside an RCU read-side section */
rcu_read_lock();
|
||||||
|
ret = io_wqe_activate_free_worker(wqe);
|
||||||
|
rcu_read_unlock();
|
||||||
|
|
||||||
|
/* no free worker was activated: hand off to the manager task */
if (!ret)
|
||||||
|
wake_up_process(wqe->wq->manager);
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
|
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
|
||||||
{
|
{
|
||||||
int ret = -ENOMEM, node;
|
int ret = -ENOMEM, node;
|
||||||
@ -948,6 +999,8 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
|
|||||||
if (ret)
|
if (ret)
|
||||||
goto err_wqes;
|
goto err_wqes;
|
||||||
|
|
||||||
|
refcount_inc(&data->hash->refs);
|
||||||
|
wq->hash = data->hash;
|
||||||
wq->free_work = data->free_work;
|
wq->free_work = data->free_work;
|
||||||
wq->do_work = data->do_work;
|
wq->do_work = data->do_work;
|
||||||
|
|
||||||
@ -968,6 +1021,8 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
|
|||||||
wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
|
wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
|
||||||
task_rlimit(current, RLIMIT_NPROC);
|
task_rlimit(current, RLIMIT_NPROC);
|
||||||
atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
|
atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
|
||||||
|
wqe->wait.func = io_wqe_hash_wake;
|
||||||
|
INIT_LIST_HEAD(&wqe->wait.entry);
|
||||||
wqe->wq = wq;
|
wqe->wq = wq;
|
||||||
raw_spin_lock_init(&wqe->lock);
|
raw_spin_lock_init(&wqe->lock);
|
||||||
INIT_WQ_LIST(&wqe->work_list);
|
INIT_WQ_LIST(&wqe->work_list);
|
||||||
@ -989,6 +1044,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
|
|||||||
|
|
||||||
if (refcount_dec_and_test(&wq->refs))
|
if (refcount_dec_and_test(&wq->refs))
|
||||||
complete(&wq->done);
|
complete(&wq->done);
|
||||||
|
io_wq_put_hash(data->hash);
|
||||||
err:
|
err:
|
||||||
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
|
cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node);
|
||||||
for_each_node(node)
|
for_each_node(node)
|
||||||
@ -1017,8 +1073,15 @@ void io_wq_destroy(struct io_wq *wq)
|
|||||||
|
|
||||||
wait_for_completion(&wq->done);
|
wait_for_completion(&wq->done);
|
||||||
|
|
||||||
for_each_node(node)
|
spin_lock_irq(&wq->hash->wait.lock);
|
||||||
kfree(wq->wqes[node]);
|
for_each_node(node) {
|
||||||
|
struct io_wqe *wqe = wq->wqes[node];
|
||||||
|
|
||||||
|
list_del_init(&wqe->wait.entry);
|
||||||
|
kfree(wqe);
|
||||||
|
}
|
||||||
|
spin_unlock_irq(&wq->hash->wait.lock);
|
||||||
|
io_wq_put_hash(wq->hash);
|
||||||
kfree(wq->wqes);
|
kfree(wq->wqes);
|
||||||
kfree(wq);
|
kfree(wq);
|
||||||
}
|
}
|
||||||
|
14
fs/io-wq.h
14
fs/io-wq.h
@ -1,6 +1,7 @@
|
|||||||
#ifndef INTERNAL_IO_WQ_H
|
#ifndef INTERNAL_IO_WQ_H
|
||||||
#define INTERNAL_IO_WQ_H
|
#define INTERNAL_IO_WQ_H
|
||||||
|
|
||||||
|
#include <linux/refcount.h>
|
||||||
#include <linux/io_uring.h>
|
#include <linux/io_uring.h>
|
||||||
|
|
||||||
struct io_wq;
|
struct io_wq;
|
||||||
@ -93,7 +94,20 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
|
|||||||
typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
|
typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
|
||||||
typedef void (io_wq_work_fn)(struct io_wq_work *);
|
typedef void (io_wq_work_fn)(struct io_wq_work *);
|
||||||
|
|
||||||
|
/*
 * Hashed-work serialization state shared between io_wq instances.
 * io_uring allocates one per ring context (ctx->hash_map) and passes it to
 * io_wq_create() through struct io_wq_data.
 */
struct io_wq_hash {
|
||||||
|
/* reference count; the object is freed when io_wq_put_hash() drops the last one */
refcount_t refs;
|
||||||
|
/* one bit per hash value; set while work with that hash is running */
unsigned long map;
|
||||||
|
/* wqes that found their hash busy wait here; woken when a bit is cleared */
struct wait_queue_head wait;
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * Drop one reference on @hash and free it with kfree() once the count
 * reaches zero. @hash must not be NULL; it is dereferenced unconditionally.
 */
static inline void io_wq_put_hash(struct io_wq_hash *hash)
|
||||||
|
{
|
||||||
|
if (refcount_dec_and_test(&hash->refs))
|
||||||
|
kfree(hash);
|
||||||
|
}
|
||||||
|
|
||||||
struct io_wq_data {
|
struct io_wq_data {
|
||||||
|
struct io_wq_hash *hash;
|
||||||
io_wq_work_fn *do_work;
|
io_wq_work_fn *do_work;
|
||||||
free_work_fn *free_work;
|
free_work_fn *free_work;
|
||||||
};
|
};
|
||||||
|
@ -360,6 +360,9 @@ struct io_ring_ctx {
|
|||||||
unsigned cached_cq_overflow;
|
unsigned cached_cq_overflow;
|
||||||
unsigned long sq_check_overflow;
|
unsigned long sq_check_overflow;
|
||||||
|
|
||||||
|
/* hashed buffered write serialization */
|
||||||
|
struct io_wq_hash *hash_map;
|
||||||
|
|
||||||
struct list_head defer_list;
|
struct list_head defer_list;
|
||||||
struct list_head timeout_list;
|
struct list_head timeout_list;
|
||||||
struct list_head cq_overflow_list;
|
struct list_head cq_overflow_list;
|
||||||
@ -454,6 +457,8 @@ struct io_ring_ctx {
|
|||||||
/* exit task_work */
|
/* exit task_work */
|
||||||
struct callback_head *exit_task_work;
|
struct callback_head *exit_task_work;
|
||||||
|
|
||||||
|
struct wait_queue_head hash_wait;
|
||||||
|
|
||||||
/* Keep this last, we don't need it for the fast path */
|
/* Keep this last, we don't need it for the fast path */
|
||||||
struct work_struct exit_work;
|
struct work_struct exit_work;
|
||||||
};
|
};
|
||||||
@ -7763,9 +7768,21 @@ static struct io_wq_work *io_free_work(struct io_wq_work *work)
|
|||||||
|
|
||||||
static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
|
static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx)
|
||||||
{
|
{
|
||||||
|
struct io_wq_hash *hash;
|
||||||
struct io_wq_data data;
|
struct io_wq_data data;
|
||||||
unsigned int concurrency;
|
unsigned int concurrency;
|
||||||
|
|
||||||
|
hash = ctx->hash_map;
|
||||||
|
if (!hash) {
|
||||||
|
hash = kzalloc(sizeof(*hash), GFP_KERNEL);
|
||||||
|
if (!hash)
|
||||||
|
return ERR_PTR(-ENOMEM);
|
||||||
|
refcount_set(&hash->refs, 1);
|
||||||
|
init_waitqueue_head(&hash->wait);
|
||||||
|
ctx->hash_map = hash;
|
||||||
|
}
|
||||||
|
|
||||||
|
data.hash = hash;
|
||||||
data.free_work = io_free_work;
|
data.free_work = io_free_work;
|
||||||
data.do_work = io_wq_submit_work;
|
data.do_work = io_wq_submit_work;
|
||||||
|
|
||||||
@ -8405,6 +8422,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
|
|||||||
percpu_ref_exit(&ctx->refs);
|
percpu_ref_exit(&ctx->refs);
|
||||||
free_uid(ctx->user);
|
free_uid(ctx->user);
|
||||||
io_req_caches_free(ctx, NULL);
|
io_req_caches_free(ctx, NULL);
|
||||||
|
if (ctx->hash_map)
|
||||||
|
io_wq_put_hash(ctx->hash_map);
|
||||||
kfree(ctx->cancel_hash);
|
kfree(ctx->cancel_hash);
|
||||||
kfree(ctx);
|
kfree(ctx);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user