Add io_uring IO interface

The submission queue (SQ) and completion queue (CQ) rings are shared
between the application and the kernel. This eliminates the need to
copy data back and forth to submit and complete IO.

IO submissions use the io_uring_sqe data structure, and completions
are generated in the form of io_uring_cqe data structures. The SQ
ring is an index into the io_uring_sqe array, which makes it possible
to submit a batch of IOs without them being contiguous in the ring.
The CQ ring is always contiguous, as completion events are inherently
unordered, and hence any io_uring_cqe entry can point back to an
arbitrary submission.

Two new system calls are added for this:

io_uring_setup(entries, params)
	Sets up an io_uring instance for doing async IO. On success,
	returns a file descriptor that the application can mmap to
	gain access to the SQ ring, CQ ring, and io_uring_sqes.

io_uring_enter(fd, to_submit, min_complete, flags, sigset, sigsetsize)
	Initiates IO against the rings mapped to this fd, or waits for
	them to complete, or both. The behavior is controlled by the
	parameters passed in. If 'to_submit' is non-zero, then we'll
	try and submit new IO. If IORING_ENTER_GETEVENTS is set, the
	kernel will wait for 'min_complete' events, if they aren't
	already available. It's valid to set IORING_ENTER_GETEVENTS
	and 'min_complete' == 0 at the same time; this allows the
	kernel to return already completed events without waiting
	for them. This is only useful for polled IO, since for IRQ
	driven IO the application can just check the CQ ring
	without entering the kernel.

With this setup, it's possible to do async IO with a single system
call. Future developments will enable polled IO with this interface,
and polled submission as well. The latter will enable an application
to do IO without doing ANY system calls at all.

For IRQ driven IO, an application only needs to enter the kernel for
completions if it wants to wait for them to occur.

Each io_uring is backed by a workqueue, to support buffered async IO
as well. We will only punt to an async context if the command would
need to wait for IO on the device side. Any IO that can be satisfied
directly from the page cache is done inline. This avoids the slowness
issue of usual threadpools, since cached data is accessed as quickly
as with a sync interface.

Sample application: http://git.kernel.dk/cgit/fio/plain/t/io_uring.c

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
Jens Axboe 2019-01-07 10:46:33 -07:00
parent 594b9a89af
commit 2b188cc1bb
12 changed files with 1390 additions and 2 deletions

View File

@ -398,3 +398,5 @@
384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl
385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents 385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents
386 i386 rseq sys_rseq __ia32_sys_rseq 386 i386 rseq sys_rseq __ia32_sys_rseq
425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup
426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter

View File

@ -343,6 +343,8 @@
332 common statx __x64_sys_statx 332 common statx __x64_sys_statx
333 common io_pgetevents __x64_sys_io_pgetevents 333 common io_pgetevents __x64_sys_io_pgetevents
334 common rseq __x64_sys_rseq 334 common rseq __x64_sys_rseq
425 common io_uring_setup __x64_sys_io_uring_setup
426 common io_uring_enter __x64_sys_io_uring_enter
# #
# x32-specific system call numbers start at 512 to avoid cache impact # x32-specific system call numbers start at 512 to avoid cache impact

View File

@ -30,6 +30,7 @@ obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_IO_URING) += io_uring.o
obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_FILE_LOCKING) += locks.o

1255
fs/io_uring.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -3517,4 +3517,13 @@ extern void inode_nohighmem(struct inode *inode);
extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
int advice); int advice);
#if defined(CONFIG_IO_URING)
extern struct sock *io_uring_get_socket(struct file *file);
#else
/*
* Fallback used when CONFIG_IO_URING is not defined: there is no io_uring
* implementation to ask, so this always returns NULL.
*/
static inline struct sock *io_uring_get_socket(struct file *file)
{
return NULL;
}
#endif
#endif /* _LINUX_FS_H */ #endif /* _LINUX_FS_H */

View File

@ -40,7 +40,7 @@ struct user_struct {
kuid_t uid; kuid_t uid;
#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \ #if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \
defined(CONFIG_NET) defined(CONFIG_NET) || defined(CONFIG_IO_URING)
atomic_long_t locked_vm; atomic_long_t locked_vm;
#endif #endif

View File

@ -69,6 +69,7 @@ struct file_handle;
struct sigaltstack; struct sigaltstack;
struct rseq; struct rseq;
union bpf_attr; union bpf_attr;
struct io_uring_params;
#include <linux/types.h> #include <linux/types.h>
#include <linux/aio_abi.h> #include <linux/aio_abi.h>
@ -309,6 +310,11 @@ asmlinkage long sys_io_pgetevents_time32(aio_context_t ctx_id,
struct io_event __user *events, struct io_event __user *events,
struct old_timespec32 __user *timeout, struct old_timespec32 __user *timeout,
const struct __aio_sigset *sig); const struct __aio_sigset *sig);
asmlinkage long sys_io_uring_setup(u32 entries,
struct io_uring_params __user *p);
asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit,
u32 min_complete, u32 flags,
const sigset_t __user *sig, size_t sigsz);
/* fs/xattr.c */ /* fs/xattr.c */
asmlinkage long sys_setxattr(const char __user *path, const char __user *name, asmlinkage long sys_setxattr(const char __user *path, const char __user *name,

View File

@ -740,9 +740,13 @@ __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
__SYSCALL(__NR_rseq, sys_rseq) __SYSCALL(__NR_rseq, sys_rseq)
#define __NR_kexec_file_load 294 #define __NR_kexec_file_load 294
__SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load)
#define __NR_io_uring_setup 425
__SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
#define __NR_io_uring_enter 426
__SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
#undef __NR_syscalls #undef __NR_syscalls
#define __NR_syscalls 295 #define __NR_syscalls 427
/* /*
* 32 bit systems traditionally used different * 32 bit systems traditionally used different

View File

@ -0,0 +1,95 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
* Header file for the io_uring interface.
*
* Copyright (C) 2019 Jens Axboe
* Copyright (C) 2019 Christoph Hellwig
*/
#ifndef LINUX_IO_URING_H
#define LINUX_IO_URING_H
#include <linux/fs.h>
#include <linux/types.h>
/*
* IO submission data structure (Submission Queue Entry)
*
* One entry describes a single IO request. The member layout is part of
* the user/kernel ABI. With the member sizes declared below the structure
* is 64 bytes, assuming __kernel_rwf_t is no wider than 32 bits.
*/
struct io_uring_sqe {
__u8 opcode; /* type of operation for this sqe */
__u8 flags; /* as of now unused */
__u16 ioprio; /* ioprio for the request */
__s32 fd; /* file descriptor to do IO on */
__u64 off; /* offset into file */
__u64 addr; /* pointer to buffer or iovecs */
__u32 len; /* buffer size or number of iovecs */
/* rw_flags and __resv share the same storage */
union {
__kernel_rwf_t rw_flags;
__u32 __resv;
};
__u64 user_data; /* data to be passed back at completion time */
__u64 __pad2[3]; /* padding; no meaning is defined for it in this header */
};
#define IORING_OP_NOP 0
#define IORING_OP_READV 1
#define IORING_OP_WRITEV 2
/*
* IO completion data structure (Completion Queue Entry)
*
* user_data is what ties a completion back to the submission that
* generated it.
*/
struct io_uring_cqe {
__u64 user_data; /* sqe->user_data of the submission, passed back */
__s32 res; /* result code for this event */
__u32 flags; /* no flag values are defined in this header */
};
/*
* Magic offsets for the application to mmap the data it needs
*/
#define IORING_OFF_SQ_RING 0ULL
#define IORING_OFF_CQ_RING 0x8000000ULL
#define IORING_OFF_SQES 0x10000000ULL
/*
* Filled with the offset for mmap(2)
*
* Describes where the individual fields of the submission queue (SQ) ring
* are located. Embedded in struct io_uring_params as sq_off.
* NOTE(review): each member presumably holds the byte offset of the
* like-named SQ ring field, relative to the start of the SQ ring mapping -
* confirm against fs/io_uring.c.
*/
struct io_sqring_offsets {
__u32 head;
__u32 tail;
__u32 ring_mask;
__u32 ring_entries;
__u32 flags;
__u32 dropped;
__u32 array;
__u32 resv1; /* reserved */
__u64 resv2; /* reserved */
};
/*
* Completion queue (CQ) ring counterpart of struct io_sqring_offsets.
* Embedded in struct io_uring_params as cq_off.
* NOTE(review): each member presumably holds the byte offset of the
* like-named CQ ring field, relative to the start of the CQ ring mapping -
* confirm against fs/io_uring.c.
*/
struct io_cqring_offsets {
__u32 head;
__u32 tail;
__u32 ring_mask;
__u32 ring_entries;
__u32 overflow;
__u32 cqes;
__u64 resv[2]; /* reserved */
};
/*
* io_uring_enter(2) flags
*/
#define IORING_ENTER_GETEVENTS (1U << 0)
/*
* Passed in for io_uring_setup(2). Copied back with updated info on success
*
* NOTE(review): sq_entries and cq_entries look like output values (the
* requested entry count is a separate io_uring_setup() argument) - confirm
* against fs/io_uring.c.
*/
struct io_uring_params {
__u32 sq_entries;
__u32 cq_entries;
__u32 flags; /* no setup flag values are defined in this header */
__u32 resv[7]; /* reserved */
struct io_sqring_offsets sq_off; /* SQ ring field offsets */
struct io_cqring_offsets cq_off; /* CQ ring field offsets */
};
#endif

View File

@ -1414,6 +1414,15 @@ config AIO
by some high performance threaded applications. Disabling by some high performance threaded applications. Disabling
this option saves about 7k. this option saves about 7k.
config IO_URING
bool "Enable IO uring support" if EXPERT
select ANON_INODES
default y
help
This option enables support for the io_uring interface, enabling
applications to submit and complete IO through submission and
completion rings that are shared between the kernel and application.
config ADVISE_SYSCALLS config ADVISE_SYSCALLS
bool "Enable madvise/fadvise syscalls" if EXPERT bool "Enable madvise/fadvise syscalls" if EXPERT
default y default y

View File

@ -46,6 +46,8 @@ COND_SYSCALL(io_getevents);
COND_SYSCALL(io_pgetevents); COND_SYSCALL(io_pgetevents);
COND_SYSCALL_COMPAT(io_getevents); COND_SYSCALL_COMPAT(io_getevents);
COND_SYSCALL_COMPAT(io_pgetevents); COND_SYSCALL_COMPAT(io_pgetevents);
COND_SYSCALL(io_uring_setup);
COND_SYSCALL(io_uring_enter);
/* fs/xattr.c */ /* fs/xattr.c */

View File

@ -108,6 +108,9 @@ struct sock *unix_get_socket(struct file *filp)
/* PF_UNIX ? */ /* PF_UNIX ? */
if (s && sock->ops && sock->ops->family == PF_UNIX) if (s && sock->ops && sock->ops->family == PF_UNIX)
u_sock = s; u_sock = s;
} else {
/* Could be an io_uring instance */
u_sock = io_uring_get_socket(filp);
} }
return u_sock; return u_sock;
} }