mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-04 04:02:26 +00:00
7029acd8a9
Work in progress, but get rid of the per-ring serialization of resource nodes, like registered buffers and files. Main issue here is that one node can otherwise hold up a bunch of other nodes from getting freed, which is especially a problem for file resource nodes and networked workloads where some descriptors may not see activity in a long time. As an example, instantiate an io_uring ring fd and create a sparse registered file table. Even 2 will do. Then create a socket and register it as fixed file 0, F0. The number of open files in the app is now 5, with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4 being the socket. Register this socket (eg "the listener") in slot 0 of the registered file table. Now add an operation on the socket that uses slot 0. Finally, loop N times, where each loop creates a new socket, registers said socket as a file, then unregisters the socket, and finally closes the socket. This is roughly similar to what a basic accept loop would look like. At the end of this loop, it's not unreasonable to expect that there would still be 5 open files. Each socket created and registered in the loop is also unregistered and closed. But since the listener socket registered first still has references to its resource node due to still being active, each subsequent socket unregistration is stuck behind it for reclaim. Hence 5 + N files are still open at that point, where N is awaiting the final put held up by the listener socket. Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct io_kiocb now gets explicit resource nodes assigned, with each holding a reference to the parent node. A parent node is either of type FILE or BUFFER, which are the two types of nodes that exist. A request can have two nodes assigned, if it's using both registered files and buffers. Since request issue and task_work completion is both under the ring private lock, no atomics are needed to handle these references. It's a simple unlocked inc/dec. As before, the registered buffer or file table each hold a reference as well to the registered nodes. Final put of the node will remove the node and free the underlying resource, eg unmap the buffer or put the file. Outside of removing the stall in resource reclaim described above, it has the following advantages: 1) It's a lot simpler than the previous scheme, and easier to follow. No need to specific quiesce handling anymore. 2) There are no resource node allocations in the fast path, all of that happens at resource registration time. 3) The structs related to resource handling can all get simplified quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can go away completely. 4) Handling of resource tags is much simpler, and doesn't require persistent storage as it can simply get assigned up front at registration time. Just copy them in one-by-one at registration time and assign to the resource node. The only real downside is that a request is now explicitly limited to pinning 2 resources, one file and one buffer, where before just assigning a resource node to a request would pin all of them. The upside is that it's easier to follow now, as an individual resource is explicitly referenced and assigned to the request. With this in place, the above mentioned example will be using exactly 5 files at the end of the loop, not N. Signed-off-by: Jens Axboe <axboe@kernel.dk>
230 lines
7.2 KiB
C
230 lines
7.2 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
#include <linux/kernel.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/file.h>
|
|
#include <linux/proc_fs.h>
|
|
#include <linux/seq_file.h>
|
|
#include <linux/io_uring.h>
|
|
|
|
#include <uapi/linux/io_uring.h>
|
|
|
|
#include "io_uring.h"
|
|
#include "sqpoll.h"
|
|
#include "fdinfo.h"
|
|
#include "cancel.h"
|
|
#include "rsrc.h"
|
|
|
|
#ifdef CONFIG_PROC_FS
|
|
static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
|
|
const struct cred *cred)
|
|
{
|
|
struct user_namespace *uns = seq_user_ns(m);
|
|
struct group_info *gi;
|
|
kernel_cap_t cap;
|
|
int g;
|
|
|
|
seq_printf(m, "%5d\n", id);
|
|
seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
|
|
seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
|
|
seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
|
|
seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
|
|
seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
|
|
seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
|
|
seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
|
|
seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
|
|
seq_puts(m, "\n\tGroups:\t");
|
|
gi = cred->group_info;
|
|
for (g = 0; g < gi->ngroups; g++) {
|
|
seq_put_decimal_ull(m, g ? " " : "",
|
|
from_kgid_munged(uns, gi->gid[g]));
|
|
}
|
|
seq_puts(m, "\n\tCapEff:\t");
|
|
cap = cred->cap_effective;
|
|
seq_put_hex_ll(m, NULL, cap.val, 16);
|
|
seq_putc(m, '\n');
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Caller holds a reference to the file already, we don't need to do
|
|
* anything else to get an extra reference.
|
|
*/
|
|
__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
|
|
{
|
|
struct io_ring_ctx *ctx = file->private_data;
|
|
struct io_overflow_cqe *ocqe;
|
|
struct io_rings *r = ctx->rings;
|
|
struct rusage sq_usage;
|
|
unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
|
|
unsigned int sq_head = READ_ONCE(r->sq.head);
|
|
unsigned int sq_tail = READ_ONCE(r->sq.tail);
|
|
unsigned int cq_head = READ_ONCE(r->cq.head);
|
|
unsigned int cq_tail = READ_ONCE(r->cq.tail);
|
|
unsigned int cq_shift = 0;
|
|
unsigned int sq_shift = 0;
|
|
unsigned int sq_entries, cq_entries;
|
|
int sq_pid = -1, sq_cpu = -1;
|
|
u64 sq_total_time = 0, sq_work_time = 0;
|
|
bool has_lock;
|
|
unsigned int i;
|
|
|
|
if (ctx->flags & IORING_SETUP_CQE32)
|
|
cq_shift = 1;
|
|
if (ctx->flags & IORING_SETUP_SQE128)
|
|
sq_shift = 1;
|
|
|
|
/*
|
|
* we may get imprecise sqe and cqe info if uring is actively running
|
|
* since we get cached_sq_head and cached_cq_tail without uring_lock
|
|
* and sq_tail and cq_head are changed by userspace. But it's ok since
|
|
* we usually use these info when it is stuck.
|
|
*/
|
|
seq_printf(m, "SqMask:\t0x%x\n", sq_mask);
|
|
seq_printf(m, "SqHead:\t%u\n", sq_head);
|
|
seq_printf(m, "SqTail:\t%u\n", sq_tail);
|
|
seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head);
|
|
seq_printf(m, "CqMask:\t0x%x\n", cq_mask);
|
|
seq_printf(m, "CqHead:\t%u\n", cq_head);
|
|
seq_printf(m, "CqTail:\t%u\n", cq_tail);
|
|
seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail);
|
|
seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head);
|
|
sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
|
|
for (i = 0; i < sq_entries; i++) {
|
|
unsigned int entry = i + sq_head;
|
|
struct io_uring_sqe *sqe;
|
|
unsigned int sq_idx;
|
|
|
|
if (ctx->flags & IORING_SETUP_NO_SQARRAY)
|
|
break;
|
|
sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
|
|
if (sq_idx > sq_mask)
|
|
continue;
|
|
sqe = &ctx->sq_sqes[sq_idx << sq_shift];
|
|
seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, "
|
|
"addr:0x%llx, rw_flags:0x%x, buf_index:%d "
|
|
"user_data:%llu",
|
|
sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd,
|
|
sqe->flags, (unsigned long long) sqe->off,
|
|
(unsigned long long) sqe->addr, sqe->rw_flags,
|
|
sqe->buf_index, sqe->user_data);
|
|
if (sq_shift) {
|
|
u64 *sqeb = (void *) (sqe + 1);
|
|
int size = sizeof(struct io_uring_sqe) / sizeof(u64);
|
|
int j;
|
|
|
|
for (j = 0; j < size; j++) {
|
|
seq_printf(m, ", e%d:0x%llx", j,
|
|
(unsigned long long) *sqeb);
|
|
sqeb++;
|
|
}
|
|
}
|
|
seq_printf(m, "\n");
|
|
}
|
|
seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
|
|
cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
|
|
for (i = 0; i < cq_entries; i++) {
|
|
unsigned int entry = i + cq_head;
|
|
struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift];
|
|
|
|
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x",
|
|
entry & cq_mask, cqe->user_data, cqe->res,
|
|
cqe->flags);
|
|
if (cq_shift)
|
|
seq_printf(m, ", extra1:%llu, extra2:%llu\n",
|
|
cqe->big_cqe[0], cqe->big_cqe[1]);
|
|
seq_printf(m, "\n");
|
|
}
|
|
|
|
/*
|
|
* Avoid ABBA deadlock between the seq lock and the io_uring mutex,
|
|
* since fdinfo case grabs it in the opposite direction of normal use
|
|
* cases. If we fail to get the lock, we just don't iterate any
|
|
* structures that could be going away outside the io_uring mutex.
|
|
*/
|
|
has_lock = mutex_trylock(&ctx->uring_lock);
|
|
|
|
if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
|
|
struct io_sq_data *sq = ctx->sq_data;
|
|
|
|
/*
|
|
* sq->thread might be NULL if we raced with the sqpoll
|
|
* thread termination.
|
|
*/
|
|
if (sq->thread) {
|
|
sq_pid = sq->task_pid;
|
|
sq_cpu = sq->sq_cpu;
|
|
getrusage(sq->thread, RUSAGE_SELF, &sq_usage);
|
|
sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
|
|
+ sq_usage.ru_stime.tv_usec);
|
|
sq_work_time = sq->work_time;
|
|
}
|
|
}
|
|
|
|
seq_printf(m, "SqThread:\t%d\n", sq_pid);
|
|
seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu);
|
|
seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time);
|
|
seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time);
|
|
seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
|
|
for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
|
|
struct file *f = io_file_from_index(&ctx->file_table, i);
|
|
|
|
if (f)
|
|
seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
|
|
else
|
|
seq_printf(m, "%5u: <none>\n", i);
|
|
}
|
|
seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
|
|
for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
|
|
struct io_mapped_ubuf *buf = ctx->user_bufs[i]->buf;
|
|
|
|
seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len);
|
|
}
|
|
if (has_lock && !xa_empty(&ctx->personalities)) {
|
|
unsigned long index;
|
|
const struct cred *cred;
|
|
|
|
seq_printf(m, "Personalities:\n");
|
|
xa_for_each(&ctx->personalities, index, cred)
|
|
io_uring_show_cred(m, index, cred);
|
|
}
|
|
|
|
seq_puts(m, "PollList:\n");
|
|
for (i = 0; has_lock && i < (1U << ctx->cancel_table.hash_bits); i++) {
|
|
struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];
|
|
struct io_kiocb *req;
|
|
|
|
hlist_for_each_entry(req, &hb->list, hash_node)
|
|
seq_printf(m, " op=%d, task_works=%d\n", req->opcode,
|
|
task_work_pending(req->task));
|
|
}
|
|
|
|
if (has_lock)
|
|
mutex_unlock(&ctx->uring_lock);
|
|
|
|
seq_puts(m, "CqOverflowList:\n");
|
|
spin_lock(&ctx->completion_lock);
|
|
list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
|
|
struct io_uring_cqe *cqe = &ocqe->cqe;
|
|
|
|
seq_printf(m, " user_data=%llu, res=%d, flags=%x\n",
|
|
cqe->user_data, cqe->res, cqe->flags);
|
|
|
|
}
|
|
spin_unlock(&ctx->completion_lock);
|
|
|
|
#ifdef CONFIG_NET_RX_BUSY_POLL
|
|
if (ctx->napi_enabled) {
|
|
seq_puts(m, "NAPI:\tenabled\n");
|
|
seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt);
|
|
if (ctx->napi_prefer_busy_poll)
|
|
seq_puts(m, "napi_prefer_busy_poll:\ttrue\n");
|
|
else
|
|
seq_puts(m, "napi_prefer_busy_poll:\tfalse\n");
|
|
} else {
|
|
seq_puts(m, "NAPI:\tdisabled\n");
|
|
}
|
|
#endif
|
|
}
|
|
#endif
|