Merge branch 'for-6.14/io_uring' into for-next

* for-6.14/io_uring: (51 commits)
  io_uring: expose read/write attribute capability
  io_uring/rw: don't gate retry on completion context
  io_uring/rw: handle -EAGAIN retry at IO completion time
  io_uring/rw: use io_rw_recycle() from cleanup path
  io_uring/rsrc: simplify the bvec iter count calculation
  io_uring: ensure io_queue_deferred() is out-of-line
  io_uring/rw: always clear ->bytes_done on io_async_rw setup
  io_uring/rw: use NULL for rw->free_iovec assigment
  io_uring/rw: don't mask in f_iocb_flags
  io_uring/msg_ring: Drop custom destructor
  io_uring: Move old async data allocation helper to header
  io_uring/rw: Allocate async data through helper
  io_uring/net: Allocate msghdr async data through helper
  io_uring/uring_cmd: Allocate async data through generic helper
  io_uring/poll: Allocate apoll with generic alloc_cache helper
  io_uring/futex: Allocate ifd with generic alloc_cache helper
  io_uring: Add generic helper to allocate async data
  io_uring: Fold allocation into alloc_cache helper
  io_uring: prevent reg-wait speculations
  io_uring: don't vmap single page regions
  ...

Signed-off-by: Jens Axboe <axboe@kernel.dk>
Jens Axboe 2025-01-13 07:26:00 -07:00
commit 51f62a45c5
31 changed files with 798 additions and 744 deletions
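
The headline user-visible change in this series is per-I/O read/write attributes, currently used to pass protection information (PI) alongside O_DIRECT reads and writes. As a rough illustration only (not part of this commit), userspace might wire up the new SQE fields as sketched below, assuming a liburing-style setup and the uAPI additions shown in the io_uring.h hunk further down; buffer sizing and error handling are omitted.

#include <stdint.h>
#include <string.h>
#include <liburing.h>	/* assumes headers new enough to carry the uAPI below */
#include <linux/fs.h>	/* IO_INTEGRITY_CHK_* flags added in this series */

/*
 * Illustrative helper, not from this series: attach a PI attribute to an
 * already-prepared read/write SQE. 'pi' must stay valid at least until the
 * SQE is submitted, and 'meta_buf' until the request completes.
 */
static void sqe_attach_pi(struct io_uring_sqe *sqe, struct io_uring_attr_pi *pi,
			  void *meta_buf, __u32 meta_len, __u64 seed)
{
	memset(pi, 0, sizeof(*pi));
	pi->flags = IO_INTEGRITY_CHK_GUARD | IO_INTEGRITY_CHK_REFTAG;
	pi->addr = (__u64)(uintptr_t)meta_buf;	/* user buffer holding the PI tuples */
	pi->len = meta_len;
	pi->seed = seed;			/* starting reference tag */

	sqe->attr_ptr = (__u64)(uintptr_t)pi;		/* new SQE field */
	sqe->attr_type_mask = IORING_RW_ATTR_FLAG_PI;	/* new attribute type bit */
}

Availability is advertised through the new IORING_FEAT_RW_ATTR feature bit, and the block/fops.c hunks below show where the metadata is consumed in the blkdev O_DIRECT paths.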


@ -118,17 +118,18 @@ static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs,
static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip)
{
unsigned short nr_vecs = bip->bip_max_vcnt - 1;
struct bio_vec *copy = &bip->bip_vec[1];
size_t bytes = bip->bip_iter.bi_size;
struct iov_iter iter;
unsigned short orig_nr_vecs = bip->bip_max_vcnt - 1;
struct bio_vec *orig_bvecs = &bip->bip_vec[1];
struct bio_vec *bounce_bvec = &bip->bip_vec[0];
size_t bytes = bounce_bvec->bv_len;
struct iov_iter orig_iter;
int ret;
iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter);
iov_iter_bvec(&orig_iter, ITER_DEST, orig_bvecs, orig_nr_vecs, bytes);
ret = copy_to_iter(bvec_virt(bounce_bvec), bytes, &orig_iter);
WARN_ON_ONCE(ret != bytes);
bio_integrity_unpin_bvec(copy, nr_vecs, true);
bio_integrity_unpin_bvec(orig_bvecs, orig_nr_vecs, true);
}
/**
@ -301,16 +302,15 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
return nr_bvecs;
}
int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits);
struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages;
struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec;
size_t offset, bytes = iter->count;
unsigned int direction, nr_bvecs;
struct iov_iter iter;
int ret, nr_vecs;
size_t offset;
bool copy;
if (bio_integrity(bio))
@ -323,8 +323,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
else
direction = ITER_SOURCE;
iov_iter_ubuf(&iter, direction, ubuf, bytes);
nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS + 1);
if (nr_vecs > BIO_MAX_VECS)
return -E2BIG;
if (nr_vecs > UIO_FASTIOV) {
@ -334,8 +333,8 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
pages = NULL;
}
copy = !iov_iter_is_aligned(&iter, align, align);
ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset);
copy = !iov_iter_is_aligned(iter, align, align);
ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset);
if (unlikely(ret < 0))
goto free_bvec;
@ -365,6 +364,55 @@ free_bvec:
return ret;
}
static void bio_uio_meta_to_bip(struct bio *bio, struct uio_meta *meta)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
if (meta->flags & IO_INTEGRITY_CHK_GUARD)
bip->bip_flags |= BIP_CHECK_GUARD;
if (meta->flags & IO_INTEGRITY_CHK_APPTAG)
bip->bip_flags |= BIP_CHECK_APPTAG;
if (meta->flags & IO_INTEGRITY_CHK_REFTAG)
bip->bip_flags |= BIP_CHECK_REFTAG;
bip->app_tag = meta->app_tag;
}
int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta)
{
struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
unsigned int integrity_bytes;
int ret;
struct iov_iter it;
if (!bi)
return -EINVAL;
/*
* original meta iterator can be bigger.
* process integrity info corresponding to current data buffer only.
*/
it = meta->iter;
integrity_bytes = bio_integrity_bytes(bi, bio_sectors(bio));
if (it.count < integrity_bytes)
return -EINVAL;
/* should fit into two bytes */
BUILD_BUG_ON(IO_INTEGRITY_VALID_FLAGS >= (1 << 16));
if (meta->flags && (meta->flags & ~IO_INTEGRITY_VALID_FLAGS))
return -EINVAL;
it.count = integrity_bytes;
ret = bio_integrity_map_user(bio, &it);
if (!ret) {
bio_uio_meta_to_bip(bio, meta);
bip_set_seed(bio_integrity(bio), meta->seed);
iov_iter_advance(&meta->iter, integrity_bytes);
meta->seed += bio_integrity_intervals(bi, bio_sectors(bio));
}
return ret;
}
/**
* bio_integrity_prep - Prepare bio for integrity I/O
* @bio: bio to prepare
@ -435,6 +483,11 @@ bool bio_integrity_prep(struct bio *bio)
if (bi->csum_type == BLK_INTEGRITY_CSUM_IP)
bip->bip_flags |= BIP_IP_CHECKSUM;
/* describe what tags to check in payload */
if (bi->csum_type)
bip->bip_flags |= BIP_CHECK_GUARD;
if (bi->flags & BLK_INTEGRITY_REF_TAG)
bip->bip_flags |= BIP_CHECK_REFTAG;
if (bio_integrity_add_page(bio, virt_to_page(buf), len,
offset_in_page(buf)) < len) {
printk(KERN_ERR "could not attach integrity payload\n");
@ -559,7 +612,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
bip->bip_vec = bip_src->bip_vec;
bip->bip_iter = bip_src->bip_iter;
bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY;
bip->bip_flags = bip_src->bip_flags & BIP_CLONE_FLAGS;
bip->app_tag = bip_src->app_tag;
return 0;
}


@ -115,8 +115,16 @@ EXPORT_SYMBOL(blk_rq_map_integrity_sg);
int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
ssize_t bytes)
{
int ret = bio_integrity_map_user(rq->bio, ubuf, bytes);
int ret;
struct iov_iter iter;
unsigned int direction;
if (op_is_write(req_op(rq)))
direction = ITER_DEST;
else
direction = ITER_SOURCE;
iov_iter_ubuf(&iter, direction, ubuf, bytes);
ret = bio_integrity_map_user(rq->bio, &iter);
if (ret)
return ret;


@ -54,6 +54,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
struct bio bio;
ssize_t ret;
WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA);
if (nr_pages <= DIO_INLINE_BIO_VECS)
vecs = inline_vecs;
else {
@ -124,12 +125,16 @@ static void blkdev_bio_end_io(struct bio *bio)
{
struct blkdev_dio *dio = bio->bi_private;
bool should_dirty = dio->flags & DIO_SHOULD_DIRTY;
bool is_sync = dio->flags & DIO_IS_SYNC;
if (bio->bi_status && !dio->bio.bi_status)
dio->bio.bi_status = bio->bi_status;
if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA))
bio_integrity_unmap_user(bio);
if (atomic_dec_and_test(&dio->ref)) {
if (!(dio->flags & DIO_IS_SYNC)) {
if (!is_sync) {
struct kiocb *iocb = dio->iocb;
ssize_t ret;
@ -221,14 +226,16 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* a retry of this from blocking context.
*/
if (unlikely(iov_iter_count(iter))) {
bio_release_pages(bio, false);
bio_clear_flag(bio, BIO_REFFED);
bio_put(bio);
blk_finish_plug(&plug);
return -EAGAIN;
ret = -EAGAIN;
goto fail;
}
bio->bi_opf |= REQ_NOWAIT;
}
if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) {
ret = bio_integrity_map_iter(bio, iocb->private);
if (unlikely(ret))
goto fail;
}
if (is_read) {
if (dio->flags & DIO_SHOULD_DIRTY)
@ -269,6 +276,12 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
bio_put(&dio->bio);
return ret;
fail:
bio_release_pages(bio, false);
bio_clear_flag(bio, BIO_REFFED);
bio_put(bio);
blk_finish_plug(&plug);
return ret;
}
static void blkdev_bio_end_io_async(struct bio *bio)
@ -286,6 +299,9 @@ static void blkdev_bio_end_io_async(struct bio *bio)
ret = blk_status_to_errno(bio->bi_status);
}
if (iocb->ki_flags & IOCB_HAS_METADATA)
bio_integrity_unmap_user(bio);
iocb->ki_complete(iocb, ret);
if (dio->flags & DIO_SHOULD_DIRTY) {
@ -330,10 +346,8 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
bio_iov_bvec_set(bio, iter);
} else {
ret = bio_iov_iter_get_pages(bio, iter);
if (unlikely(ret)) {
bio_put(bio);
return ret;
}
if (unlikely(ret))
goto out_bio_put;
}
dio->size = bio->bi_iter.bi_size;
@ -346,6 +360,13 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
task_io_account_write(bio->bi_iter.bi_size);
}
if (iocb->ki_flags & IOCB_HAS_METADATA) {
ret = bio_integrity_map_iter(bio, iocb->private);
WRITE_ONCE(iocb->private, NULL);
if (unlikely(ret))
goto out_bio_put;
}
if (iocb->ki_flags & IOCB_ATOMIC)
bio->bi_opf |= REQ_ATOMIC;
@ -360,6 +381,10 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
submit_bio(bio);
}
return -EIOCBQUEUED;
out_bio_put:
bio_put(bio);
return ret;
}
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)


@ -885,6 +885,12 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
return BLK_STS_OK;
}
static void nvme_set_app_tag(struct request *req, struct nvme_command *cmnd)
{
cmnd->rw.lbat = cpu_to_le16(bio_integrity(req->bio)->app_tag);
cmnd->rw.lbatm = cpu_to_le16(0xffff);
}
static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
struct request *req)
{
@ -1017,18 +1023,17 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
control |= NVME_RW_PRINFO_PRACT;
}
switch (ns->head->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
if (bio_integrity_flagged(req->bio, BIP_CHECK_GUARD))
control |= NVME_RW_PRINFO_PRCHK_GUARD;
break;
case NVME_NS_DPS_PI_TYPE1:
case NVME_NS_DPS_PI_TYPE2:
control |= NVME_RW_PRINFO_PRCHK_GUARD |
NVME_RW_PRINFO_PRCHK_REF;
if (bio_integrity_flagged(req->bio, BIP_CHECK_REFTAG)) {
control |= NVME_RW_PRINFO_PRCHK_REF;
if (op == nvme_cmd_zone_append)
control |= NVME_RW_APPEND_PIREMAP;
nvme_set_ref_tag(ns, cmnd, req);
break;
}
if (bio_integrity_flagged(req->bio, BIP_CHECK_APPTAG)) {
control |= NVME_RW_PRINFO_PRCHK_APP;
nvme_set_app_tag(req, cmnd);
}
}


@ -814,14 +814,14 @@ static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd,
if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM))
scmd->prot_flags |= SCSI_PROT_IP_CHECKSUM;
if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false)
if (bio_integrity_flagged(bio, BIP_CHECK_GUARD))
scmd->prot_flags |= SCSI_PROT_GUARD_CHECK;
}
if (dif != T10_PI_TYPE3_PROTECTION) { /* DIX/DIF Type 0, 1, 2 */
scmd->prot_flags |= SCSI_PROT_REF_INCREMENT;
if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false)
if (bio_integrity_flagged(bio, BIP_CHECK_REFTAG))
scmd->prot_flags |= SCSI_PROT_REF_CHECK;
}


@ -7,10 +7,12 @@
enum bip_flags {
BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */
BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */
BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */
BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */
BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */
BIP_COPY_USER = 1 << 5, /* Kernel bounce buffer in use */
BIP_DISK_NOCHECK = 1 << 2, /* disable disk integrity checking */
BIP_IP_CHECKSUM = 1 << 3, /* IP checksum */
BIP_COPY_USER = 1 << 4, /* Kernel bounce buffer in use */
BIP_CHECK_GUARD = 1 << 5, /* guard check */
BIP_CHECK_REFTAG = 1 << 6, /* reftag check */
BIP_CHECK_APPTAG = 1 << 7, /* apptag check */
};
struct bio_integrity_payload {
@ -21,6 +23,7 @@ struct bio_integrity_payload {
unsigned short bip_vcnt; /* # of integrity bio_vecs */
unsigned short bip_max_vcnt; /* integrity bio_vec slots */
unsigned short bip_flags; /* control flags */
u16 app_tag; /* application tag value */
struct bvec_iter bio_iter; /* for rewinding parent bio */
@ -30,6 +33,9 @@ struct bio_integrity_payload {
struct bio_vec bip_inline_vecs[];/* embedded bvec array */
};
#define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \
BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG)
#ifdef CONFIG_BLK_DEV_INTEGRITY
#define bip_for_each_vec(bvl, bip, iter) \
@ -72,7 +78,8 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp,
unsigned int nr);
int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len,
unsigned int offset);
int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len);
int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter);
int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta);
void bio_integrity_unmap_user(struct bio *bio);
bool bio_integrity_prep(struct bio *bio);
void bio_integrity_advance(struct bio *bio, unsigned int bytes_done);
@ -98,8 +105,12 @@ static inline void bioset_integrity_free(struct bio_set *bs)
{
}
static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
ssize_t len)
static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
{
return -EINVAL;
}
static inline int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta)
{
return -EINVAL;
}


@ -348,6 +348,7 @@ struct readahead_control;
#define IOCB_DIO_CALLER_COMP (1 << 22)
/* kiocb is a read or write operation submitted by fs/aio.c. */
#define IOCB_AIO_RW (1 << 23)
#define IOCB_HAS_METADATA (1 << 24)
/* for use in trace events */
#define TRACE_IOCB_STRINGS \


@ -78,8 +78,9 @@ struct io_hash_table {
struct io_mapped_region {
struct page **pages;
void *vmap_ptr;
size_t nr_pages;
void *ptr;
unsigned nr_pages;
unsigned flags;
};
/*
@ -293,6 +294,11 @@ struct io_ring_ctx {
struct io_submit_state submit_state;
/*
* Modifications are protected by ->uring_lock and ->mmap_lock.
* The flags, buf_pages and buf_nr_pages fields should be stable
* once published.
*/
struct xarray io_bl_xa;
struct io_hash_table cancel_table;
@ -424,17 +430,10 @@ struct io_ring_ctx {
* side will need to grab this lock, to prevent either side from
* being run concurrently with the other.
*/
struct mutex resize_lock;
/*
* If IORING_SETUP_NO_MMAP is used, then the below holds
* the gup'ed pages for the two rings, and the sqes.
*/
unsigned short n_ring_pages;
unsigned short n_sqe_pages;
struct page **ring_pages;
struct page **sqe_pages;
struct mutex mmap_lock;
struct io_mapped_region sq_region;
struct io_mapped_region ring_region;
/* used for optimised request parameter and wait argument passing */
struct io_mapped_region param_region;
};
@ -481,6 +480,7 @@ enum {
REQ_F_BL_NO_RECYCLE_BIT,
REQ_F_BUFFERS_COMMIT_BIT,
REQ_F_BUF_NODE_BIT,
REQ_F_HAS_METADATA_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@ -561,6 +561,8 @@ enum {
REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT),
/* buf node is valid */
REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT),
/* request has read/write metadata assigned */
REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT),
};
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);


@ -82,6 +82,15 @@ struct iov_iter {
};
};
typedef __u16 uio_meta_flags_t;
struct uio_meta {
uio_meta_flags_t flags;
u16 app_tag;
u64 seed;
struct iov_iter iter;
};
static inline const struct iovec *iter_iov(const struct iov_iter *iter)
{
if (iter->iter_type == ITER_UBUF)


@ -40,6 +40,15 @@
#define BLOCK_SIZE_BITS 10
#define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)
/* flags for integrity meta */
#define IO_INTEGRITY_CHK_GUARD (1U << 0) /* enforce guard check */
#define IO_INTEGRITY_CHK_REFTAG (1U << 1) /* enforce ref check */
#define IO_INTEGRITY_CHK_APPTAG (1U << 2) /* enforce app check */
#define IO_INTEGRITY_VALID_FLAGS (IO_INTEGRITY_CHK_GUARD | \
IO_INTEGRITY_CHK_REFTAG | \
IO_INTEGRITY_CHK_APPTAG)
#define SEEK_SET 0 /* seek relative to beginning of file */
#define SEEK_CUR 1 /* seek relative to current file position */
#define SEEK_END 2 /* seek relative to end of file */


@ -98,6 +98,10 @@ struct io_uring_sqe {
__u64 addr3;
__u64 __pad2[1];
};
struct {
__u64 attr_ptr; /* pointer to attribute information */
__u64 attr_type_mask; /* bit mask of attributes */
};
__u64 optval;
/*
* If the ring is initialized with IORING_SETUP_SQE128, then
@ -107,6 +111,18 @@ struct io_uring_sqe {
};
};
/* sqe->attr_type_mask flags */
#define IORING_RW_ATTR_FLAG_PI (1U << 0)
/* PI attribute information */
struct io_uring_attr_pi {
__u16 flags;
__u16 app_tag;
__u32 len;
__u64 addr;
__u64 seed;
__u64 rsvd;
};
/*
* If sqe->file_index is set to this for opcodes that instantiate a new
* direct descriptor (like openat/openat2/accept), then io_uring will allocate
@ -561,6 +577,7 @@ struct io_uring_params {
#define IORING_FEAT_REG_REG_RING (1U << 13)
#define IORING_FEAT_RECVSEND_BUNDLE (1U << 14)
#define IORING_FEAT_MIN_TIMEOUT (1U << 15)
#define IORING_FEAT_RW_ATTR (1U << 16)
/*
* io_uring_register(2) opcodes and arguments

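
On the kernel side, the glue that turns this PI attribute into per-bio integrity metadata lives in io_uring/rw.c (one of the commits listed above, not shown in the hunks here): it builds a struct uio_meta, hangs it off kiocb->private and sets IOCB_HAS_METADATA, which the block/fops.c and bio-integrity hunks above then consume via bio_integrity_map_iter(). The sketch below is a simplified illustration of that translation; names and checks are approximate, not the exact helper from the series, and the real code first copies the attribute in from the user-supplied attr_ptr.

#include <linux/fs.h>
#include <linux/uio.h>
#include <linux/kernel.h>
#include <uapi/linux/io_uring.h>

/* Illustrative only: roughly what the rw prep path does with the attribute. */
static int prep_rw_pi_sketch(struct kiocb *kiocb, const struct io_uring_attr_pi *pi,
			     struct uio_meta *meta, int ddir)
{
	int ret;

	if (pi->rsvd)
		return -EINVAL;

	meta->flags = pi->flags;	/* IO_INTEGRITY_CHK_* bits */
	meta->app_tag = pi->app_tag;
	meta->seed = pi->seed;
	ret = import_ubuf(ddir, u64_to_user_ptr(pi->addr), pi->len, &meta->iter);
	if (unlikely(ret))
		return ret;

	kiocb->private = meta;		/* picked up by __blkdev_direct_IO*() */
	kiocb->ki_flags |= IOCB_HAS_METADATA;
	return 0;
}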

@ -30,6 +30,19 @@ static inline void *io_alloc_cache_get(struct io_alloc_cache *cache)
return NULL;
}
static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp,
void (*init_once)(void *obj))
{
if (unlikely(!cache->nr_cached)) {
void *obj = kmalloc(cache->elem_size, gfp);
if (obj && init_once)
init_once(obj);
return obj;
}
return io_alloc_cache_get(cache);
}
/* returns false if the cache was initialized properly */
static inline bool io_alloc_cache_init(struct io_alloc_cache *cache,
unsigned max_nr, size_t size)


@ -251,17 +251,6 @@ static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q)
io_req_task_work_add(req);
}
static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx)
{
struct io_futex_data *ifd;
ifd = io_alloc_cache_get(&ctx->futex_cache);
if (ifd)
return ifd;
return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT);
}
int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
@ -331,7 +320,7 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags)
}
io_ring_submit_lock(ctx, issue_flags);
ifd = io_alloc_ifd(ctx);
ifd = io_cache_alloc(&ctx->futex_cache, GFP_NOWAIT, NULL);
if (!ifd) {
ret = -ENOMEM;
goto done_unlock;


@ -115,7 +115,7 @@
REQ_F_ASYNC_DATA)
#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
IO_REQ_CLEAN_FLAGS)
REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS)
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
@ -350,7 +350,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
io_napi_init(ctx);
mutex_init(&ctx->resize_lock);
mutex_init(&ctx->mmap_lock);
return ctx;
@ -361,7 +361,7 @@ err:
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
io_alloc_cache_free(&ctx->uring_cache, kfree);
io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
io_alloc_cache_free(&ctx->msg_cache, kfree);
io_futex_cache_free(ctx);
kvfree(ctx->cancel_table.hbs);
xa_destroy(&ctx->io_bl_xa);
@ -550,8 +550,9 @@ void io_req_queue_iowq(struct io_kiocb *req)
io_req_task_work_add(req);
}
static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
{
spin_lock(&ctx->completion_lock);
while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
struct io_defer_entry, list);
@ -562,6 +563,7 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
io_req_task_queue(de->req);
kfree(de);
}
spin_unlock(&ctx->completion_lock);
}
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
@ -570,11 +572,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
io_poll_wq_wake(ctx);
if (ctx->off_timeout_used)
io_flush_timeouts(ctx);
if (ctx->drain_active) {
spin_lock(&ctx->completion_lock);
if (ctx->drain_active)
io_queue_deferred(ctx);
spin_unlock(&ctx->completion_lock);
}
if (ctx->has_evfd)
io_eventfd_flush_signal(ctx);
}
@ -1401,6 +1400,12 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
comp_list);
if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
if (req->flags & REQ_F_REISSUE) {
node = req->comp_list.next;
req->flags &= ~REQ_F_REISSUE;
io_queue_iowq(req);
continue;
}
if (req->flags & REQ_F_REFCOUNT) {
node = req->comp_list.next;
if (!req_ref_put_and_test(req))
@ -1440,7 +1445,12 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
struct io_kiocb *req = container_of(node, struct io_kiocb,
comp_list);
if (!(req->flags & REQ_F_CQE_SKIP) &&
/*
* Requests marked with REQUEUE should not post a CQE, they
* will go through the io-wq retry machinery and post one
* later.
*/
if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) &&
unlikely(!io_fill_cqe_req(ctx, req))) {
if (ctx->lockless_cq) {
spin_lock(&ctx->completion_lock);
@ -1640,19 +1650,6 @@ io_req_flags_t io_file_get_flags(struct file *file)
return res;
}
bool io_alloc_async_data(struct io_kiocb *req)
{
const struct io_issue_def *def = &io_issue_defs[req->opcode];
WARN_ON_ONCE(!def->async_size);
req->async_data = kmalloc(def->async_size, GFP_KERNEL);
if (req->async_data) {
req->flags |= REQ_F_ASYNC_DATA;
return false;
}
return true;
}
static u32 io_get_sequence(struct io_kiocb *req)
{
u32 seq = req->ctx->cached_sq_head;
@ -2631,36 +2628,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
size_t size)
{
return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr,
size);
}
static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
size_t size)
{
return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr,
size);
}
static void io_rings_free(struct io_ring_ctx *ctx)
{
if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages,
true);
io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages,
true);
} else {
io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
ctx->n_ring_pages = 0;
io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
ctx->n_sqe_pages = 0;
vunmap(ctx->rings);
vunmap(ctx->sq_sqes);
}
io_free_region(ctx, &ctx->sq_region);
io_free_region(ctx, &ctx->ring_region);
ctx->rings = NULL;
ctx->sq_sqes = NULL;
}
@ -2732,7 +2703,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
io_alloc_cache_free(&ctx->uring_cache, kfree);
io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
io_alloc_cache_free(&ctx->msg_cache, kfree);
io_futex_cache_free(ctx);
io_destroy_buffers(ctx);
io_free_region(ctx, &ctx->param_region);
@ -3233,6 +3204,7 @@ static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
end > ctx->cq_wait_size))
return ERR_PTR(-EFAULT);
offset = array_index_nospec(offset, ctx->cq_wait_size - size);
return ctx->cq_wait_arg + offset;
}
@ -3477,9 +3449,10 @@ bool io_is_uring_fops(struct file *file)
static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
struct io_uring_region_desc rd;
struct io_rings *rings;
size_t size, sq_array_offset;
void *ptr;
int ret;
/* make sure these are sane, as we already accounted them */
ctx->sq_entries = p->sq_entries;
@ -3490,15 +3463,17 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
if (size == SIZE_MAX)
return -EOVERFLOW;
if (!(ctx->flags & IORING_SETUP_NO_MMAP))
rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size);
else
rings = io_rings_map(ctx, p->cq_off.user_addr, size);
memset(&rd, 0, sizeof(rd));
rd.size = PAGE_ALIGN(size);
if (ctx->flags & IORING_SETUP_NO_MMAP) {
rd.user_addr = p->cq_off.user_addr;
rd.flags |= IORING_MEM_REGION_TYPE_USER;
}
ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING);
if (ret)
return ret;
ctx->rings = rings = io_region_get_ptr(&ctx->ring_region);
if (IS_ERR(rings))
return PTR_ERR(rings);
ctx->rings = rings;
if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
rings->sq_ring_mask = p->sq_entries - 1;
@ -3515,17 +3490,18 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
return -EOVERFLOW;
}
if (!(ctx->flags & IORING_SETUP_NO_MMAP))
ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size);
else
ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);
if (IS_ERR(ptr)) {
io_rings_free(ctx);
return PTR_ERR(ptr);
memset(&rd, 0, sizeof(rd));
rd.size = PAGE_ALIGN(size);
if (ctx->flags & IORING_SETUP_NO_MMAP) {
rd.user_addr = p->sq_off.user_addr;
rd.flags |= IORING_MEM_REGION_TYPE_USER;
}
ctx->sq_sqes = ptr;
ret = io_create_region(ctx, &ctx->sq_region, &rd, IORING_OFF_SQES);
if (ret) {
io_rings_free(ctx);
return ret;
}
ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region);
return 0;
}
@ -3733,7 +3709,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT;
IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT |
IORING_FEAT_RW_ATTR;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
@ -3894,6 +3871,8 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
BUILD_BUG_SQE_ELEM(48, __u64, addr3);
BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
BUILD_BUG_SQE_ELEM(48, __u64, attr_ptr);
BUILD_BUG_SQE_ELEM(56, __u64, attr_type_mask);
BUILD_BUG_SQE_ELEM(56, __u64, __pad2);
BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=


@ -8,9 +8,11 @@
#include <linux/poll.h>
#include <linux/io_uring_types.h>
#include <uapi/linux/eventpoll.h>
#include "alloc_cache.h"
#include "io-wq.h"
#include "slist.h"
#include "filetable.h"
#include "opdef.h"
#ifndef CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@ -223,6 +225,27 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags)
req->cqe.flags = cflags;
}
static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache,
struct io_kiocb *req,
void (*init_once)(void *obj))
{
req->async_data = io_cache_alloc(cache, GFP_KERNEL, init_once);
if (req->async_data)
req->flags |= REQ_F_ASYNC_DATA;
return req->async_data;
}
static inline void *io_uring_alloc_async_data_nocache(struct io_kiocb *req)
{
const struct io_issue_def *def = &io_issue_defs[req->opcode];
WARN_ON_ONCE(!def->async_size);
req->async_data = kmalloc(def->async_size, GFP_KERNEL);
if (req->async_data)
req->flags |= REQ_F_ASYNC_DATA;
return req->async_data;
}
static inline bool req_has_async_data(struct io_kiocb *req)
{
return req->flags & REQ_F_ASYNC_DATA;


@ -45,10 +45,10 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx,
/*
* Store buffer group ID and finally mark the list as visible.
* The normal lookup doesn't care about the visibility as we're
* always under the ->uring_lock, but the RCU lookup from mmap does.
* always under the ->uring_lock, but lookups from mmap do.
*/
bl->bgid = bgid;
atomic_set(&bl->refs, 1);
guard(mutex)(&ctx->mmap_lock);
return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}
@ -353,17 +353,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (bl->flags & IOBL_BUF_RING) {
i = bl->buf_ring->tail - bl->head;
if (bl->buf_nr_pages) {
int j;
if (!(bl->flags & IOBL_MMAP)) {
for (j = 0; j < bl->buf_nr_pages; j++)
unpin_user_page(bl->buf_pages[j]);
}
io_pages_unmap(bl->buf_ring, &bl->buf_pages,
&bl->buf_nr_pages, bl->flags & IOBL_MMAP);
bl->flags &= ~IOBL_MMAP;
}
io_free_region(ctx, &bl->region);
/* make sure it's seen as empty */
INIT_LIST_HEAD(&bl->buf_list);
bl->flags &= ~IOBL_BUF_RING;
@ -386,12 +376,10 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
return i;
}
void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
if (atomic_dec_and_test(&bl->refs)) {
__io_remove_buffers(ctx, bl, -1U);
kfree_rcu(bl, rcu);
}
__io_remove_buffers(ctx, bl, -1U);
kfree(bl);
}
void io_destroy_buffers(struct io_ring_ctx *ctx)
@ -399,10 +387,17 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
struct io_buffer_list *bl;
struct list_head *item, *tmp;
struct io_buffer *buf;
unsigned long index;
xa_for_each(&ctx->io_bl_xa, index, bl) {
xa_erase(&ctx->io_bl_xa, bl->bgid);
while (1) {
unsigned long index = 0;
scoped_guard(mutex, &ctx->mmap_lock) {
bl = xa_find(&ctx->io_bl_xa, &index, ULONG_MAX, XA_PRESENT);
if (bl)
xa_erase(&ctx->io_bl_xa, bl->bgid);
}
if (!bl)
break;
io_put_bl(ctx, bl);
}
@ -591,11 +586,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
INIT_LIST_HEAD(&bl->buf_list);
ret = io_buffer_add_list(ctx, bl, p->bgid);
if (ret) {
/*
* Doesn't need rcu free as it was never visible, but
* let's keep it consistent throughout.
*/
kfree_rcu(bl, rcu);
kfree(bl);
goto err;
}
}
@ -615,75 +606,14 @@ err:
return IOU_OK;
}
static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
struct io_uring_buf_ring *br = NULL;
struct page **pages;
int nr_pages, ret;
pages = io_pin_pages(reg->ring_addr,
flex_array_size(br, bufs, reg->ring_entries),
&nr_pages);
if (IS_ERR(pages))
return PTR_ERR(pages);
br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
if (!br) {
ret = -ENOMEM;
goto error_unpin;
}
#ifdef SHM_COLOUR
/*
* On platforms that have specific aliasing requirements, SHM_COLOUR
* is set and we must guarantee that the kernel and user side align
* nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
* the application mmap's the provided ring buffer. Fail the request
* if we, by chance, don't end up with aligned addresses. The app
* should use IOU_PBUF_RING_MMAP instead, and liburing will handle
* this transparently.
*/
if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
ret = -EINVAL;
goto error_unpin;
}
#endif
bl->buf_pages = pages;
bl->buf_nr_pages = nr_pages;
bl->buf_ring = br;
bl->flags |= IOBL_BUF_RING;
bl->flags &= ~IOBL_MMAP;
return 0;
error_unpin:
unpin_user_pages(pages, nr_pages);
kvfree(pages);
vunmap(br);
return ret;
}
static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
size_t ring_size;
ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size);
if (IS_ERR(bl->buf_ring)) {
bl->buf_ring = NULL;
return -ENOMEM;
}
bl->flags |= (IOBL_BUF_RING | IOBL_MMAP);
return 0;
}
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_reg reg;
struct io_buffer_list *bl, *free_bl = NULL;
struct io_uring_region_desc rd;
struct io_uring_buf_ring *br;
unsigned long mmap_offset;
unsigned long ring_size;
int ret;
lockdep_assert_held(&ctx->uring_lock);
@ -695,19 +625,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return -EINVAL;
if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
return -EINVAL;
if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
if (!reg.ring_addr)
return -EFAULT;
if (reg.ring_addr & ~PAGE_MASK)
return -EINVAL;
} else {
if (reg.ring_addr)
return -EINVAL;
}
if (!is_power_of_2(reg.ring_entries))
return -EINVAL;
/* cannot disambiguate full vs empty due to head/tail size */
if (reg.ring_entries >= 65536)
return -EINVAL;
@ -723,22 +642,48 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return -ENOMEM;
}
if (!(reg.flags & IOU_PBUF_RING_MMAP))
ret = io_pin_pbuf_ring(&reg, bl);
else
ret = io_alloc_pbuf_ring(ctx, &reg, bl);
mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT;
ring_size = flex_array_size(br, bufs, reg.ring_entries);
if (!ret) {
bl->nr_entries = reg.ring_entries;
bl->mask = reg.ring_entries - 1;
if (reg.flags & IOU_PBUF_RING_INC)
bl->flags |= IOBL_INC;
io_buffer_add_list(ctx, bl, reg.bgid);
return 0;
memset(&rd, 0, sizeof(rd));
rd.size = PAGE_ALIGN(ring_size);
if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
rd.user_addr = reg.ring_addr;
rd.flags |= IORING_MEM_REGION_TYPE_USER;
}
ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset);
if (ret)
goto fail;
br = io_region_get_ptr(&bl->region);
kfree_rcu(free_bl, rcu);
#ifdef SHM_COLOUR
/*
* On platforms that have specific aliasing requirements, SHM_COLOUR
* is set and we must guarantee that the kernel and user side align
* nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
* the application mmap's the provided ring buffer. Fail the request
* if we, by chance, don't end up with aligned addresses. The app
* should use IOU_PBUF_RING_MMAP instead, and liburing will handle
* this transparently.
*/
if (!(reg.flags & IOU_PBUF_RING_MMAP) &&
((reg.ring_addr | (unsigned long)br) & (SHM_COLOUR - 1))) {
ret = -EINVAL;
goto fail;
}
#endif
bl->nr_entries = reg.ring_entries;
bl->mask = reg.ring_entries - 1;
bl->flags |= IOBL_BUF_RING;
bl->buf_ring = br;
if (reg.flags & IOU_PBUF_RING_INC)
bl->flags |= IOBL_INC;
io_buffer_add_list(ctx, bl, reg.bgid);
return 0;
fail:
io_free_region(ctx, &bl->region);
kfree(free_bl);
return ret;
}
@ -762,7 +707,9 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (!(bl->flags & IOBL_BUF_RING))
return -EINVAL;
xa_erase(&ctx->io_bl_xa, bl->bgid);
scoped_guard(mutex, &ctx->mmap_lock)
xa_erase(&ctx->io_bl_xa, bl->bgid);
io_put_bl(ctx, bl);
return 0;
}
@ -793,50 +740,15 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
return 0;
}
struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
unsigned long bgid)
struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
unsigned int bgid)
{
struct io_buffer_list *bl;
bool ret;
/*
* We have to be a bit careful here - we're inside mmap and cannot grab
* the uring_lock. This means the buffer_list could be simultaneously
* going away, if someone is trying to be sneaky. Look it up under rcu
* so we know it's not going away, and attempt to grab a reference to
* it. If the ref is already zero, then fail the mapping. If successful,
* the caller will call io_put_bl() to drop the the reference at at the
* end. This may then safely free the buffer_list (and drop the pages)
* at that point, vm_insert_pages() would've already grabbed the
* necessary vma references.
*/
rcu_read_lock();
lockdep_assert_held(&ctx->mmap_lock);
bl = xa_load(&ctx->io_bl_xa, bgid);
/* must be a mmap'able buffer ring and have pages */
ret = false;
if (bl && bl->flags & IOBL_MMAP)
ret = atomic_inc_not_zero(&bl->refs);
rcu_read_unlock();
if (ret)
return bl;
return ERR_PTR(-EINVAL);
}
int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
{
struct io_ring_ctx *ctx = file->private_data;
loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT;
struct io_buffer_list *bl;
int bgid, ret;
bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
bl = io_pbuf_get_bl(ctx, bgid);
if (IS_ERR(bl))
return PTR_ERR(bl);
ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages);
io_put_bl(ctx, bl);
return ret;
if (!bl || !(bl->flags & IOBL_BUF_RING))
return NULL;
return &bl->region;
}


@ -3,15 +3,13 @@
#define IOU_KBUF_H
#include <uapi/linux/io_uring.h>
#include <linux/io_uring_types.h>
enum {
/* ring mapped provided buffers */
IOBL_BUF_RING = 1,
/* ring mapped provided buffers, but mmap'ed by application */
IOBL_MMAP = 2,
/* buffers are consumed incrementally rather than always fully */
IOBL_INC = 4,
IOBL_INC = 2,
};
struct io_buffer_list {
@ -21,11 +19,7 @@ struct io_buffer_list {
*/
union {
struct list_head buf_list;
struct {
struct page **buf_pages;
struct io_uring_buf_ring *buf_ring;
};
struct rcu_head rcu;
struct io_uring_buf_ring *buf_ring;
};
__u16 bgid;
@ -37,7 +31,7 @@ struct io_buffer_list {
__u16 flags;
atomic_t refs;
struct io_mapped_region region;
};
struct io_buffer {
@ -84,10 +78,8 @@ void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags);
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl);
struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
unsigned long bgid);
int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma);
struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
unsigned int bgid);
static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
{


@ -36,102 +36,6 @@ static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
return page_address(page);
}
static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
gfp_t gfp)
{
void *ret;
int i;
for (i = 0; i < nr_pages; i++) {
pages[i] = alloc_page(gfp);
if (!pages[i])
goto err;
}
ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
if (ret)
return ret;
err:
while (i--)
put_page(pages[i]);
return ERR_PTR(-ENOMEM);
}
void *io_pages_map(struct page ***out_pages, unsigned short *npages,
size_t size)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
struct page **pages;
int nr_pages;
void *ret;
nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
if (!pages)
return ERR_PTR(-ENOMEM);
ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
if (!IS_ERR(ret))
goto done;
if (nr_pages == 1)
goto fail;
ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
if (!IS_ERR(ret)) {
done:
*out_pages = pages;
*npages = nr_pages;
return ret;
}
fail:
kvfree(pages);
*out_pages = NULL;
*npages = 0;
return ret;
}
void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
bool put_pages)
{
bool do_vunmap = false;
if (!ptr)
return;
if (put_pages && *npages) {
struct page **to_free = *pages;
int i;
/*
* Only did vmap for the non-compound multiple page case.
* For the compound page, we just need to put the head.
*/
if (PageCompound(to_free[0]))
*npages = 1;
else if (*npages > 1)
do_vunmap = true;
for (i = 0; i < *npages; i++)
put_page(to_free[i]);
}
if (do_vunmap)
vunmap(ptr);
kvfree(*pages);
*pages = NULL;
*npages = 0;
}
void io_pages_free(struct page ***pages, int npages)
{
struct page **page_array = *pages;
if (!page_array)
return;
unpin_user_pages(page_array, npages);
kvfree(page_array);
*pages = NULL;
}
struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
{
unsigned long start, end, nr_pages;
@ -174,64 +78,127 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
return ERR_PTR(ret);
}
void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
unsigned long uaddr, size_t size)
{
struct page **page_array;
unsigned int nr_pages;
void *page_addr;
*npages = 0;
if (uaddr & (PAGE_SIZE - 1) || !size)
return ERR_PTR(-EINVAL);
nr_pages = 0;
page_array = io_pin_pages(uaddr, size, &nr_pages);
if (IS_ERR(page_array))
return page_array;
page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL);
if (page_addr) {
*pages = page_array;
*npages = nr_pages;
return page_addr;
}
io_pages_free(&page_array, nr_pages);
return ERR_PTR(-ENOMEM);
}
enum {
/* memory was vmap'ed for the kernel, freeing the region vunmap's it */
IO_REGION_F_VMAP = 1,
/* memory is provided by user and pinned by the kernel */
IO_REGION_F_USER_PROVIDED = 2,
/* only the first page in the array is ref'ed */
IO_REGION_F_SINGLE_REF = 4,
};
void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr)
{
if (mr->pages) {
unpin_user_pages(mr->pages, mr->nr_pages);
long nr_refs = mr->nr_pages;
if (mr->flags & IO_REGION_F_SINGLE_REF)
nr_refs = 1;
if (mr->flags & IO_REGION_F_USER_PROVIDED)
unpin_user_pages(mr->pages, nr_refs);
else
release_pages(mr->pages, nr_refs);
kvfree(mr->pages);
}
if (mr->vmap_ptr)
vunmap(mr->vmap_ptr);
if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr)
vunmap(mr->ptr);
if (mr->nr_pages && ctx->user)
__io_unaccount_mem(ctx->user, mr->nr_pages);
memset(mr, 0, sizeof(*mr));
}
int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
struct io_uring_region_desc *reg)
static int io_region_init_ptr(struct io_mapped_region *mr)
{
int pages_accounted = 0;
struct io_imu_folio_data ifd;
void *ptr;
if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) {
if (ifd.nr_folios == 1) {
mr->ptr = page_address(mr->pages[0]);
return 0;
}
}
ptr = vmap(mr->pages, mr->nr_pages, VM_MAP, PAGE_KERNEL);
if (!ptr)
return -ENOMEM;
mr->ptr = ptr;
mr->flags |= IO_REGION_F_VMAP;
return 0;
}
static int io_region_pin_pages(struct io_ring_ctx *ctx,
struct io_mapped_region *mr,
struct io_uring_region_desc *reg)
{
unsigned long size = mr->nr_pages << PAGE_SHIFT;
struct page **pages;
int nr_pages;
pages = io_pin_pages(reg->user_addr, size, &nr_pages);
if (IS_ERR(pages))
return PTR_ERR(pages);
if (WARN_ON_ONCE(nr_pages != mr->nr_pages))
return -EFAULT;
mr->pages = pages;
mr->flags |= IO_REGION_F_USER_PROVIDED;
return 0;
}
static int io_region_allocate_pages(struct io_ring_ctx *ctx,
struct io_mapped_region *mr,
struct io_uring_region_desc *reg,
unsigned long mmap_offset)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
unsigned long size = mr->nr_pages << PAGE_SHIFT;
unsigned long nr_allocated;
struct page **pages;
void *p;
pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp);
if (!pages)
return -ENOMEM;
p = io_mem_alloc_compound(pages, mr->nr_pages, size, gfp);
if (!IS_ERR(p)) {
mr->flags |= IO_REGION_F_SINGLE_REF;
goto done;
}
nr_allocated = alloc_pages_bulk_array_node(gfp, NUMA_NO_NODE,
mr->nr_pages, pages);
if (nr_allocated != mr->nr_pages) {
if (nr_allocated)
release_pages(pages, nr_allocated);
kvfree(pages);
return -ENOMEM;
}
done:
reg->mmap_offset = mmap_offset;
mr->pages = pages;
return 0;
}
int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
struct io_uring_region_desc *reg,
unsigned long mmap_offset)
{
int nr_pages, ret;
void *vptr;
u64 end;
if (WARN_ON_ONCE(mr->pages || mr->vmap_ptr || mr->nr_pages))
if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages))
return -EFAULT;
if (memchr_inv(&reg->__resv, 0, sizeof(reg->__resv)))
return -EINVAL;
if (reg->flags != IORING_MEM_REGION_TYPE_USER)
if (reg->flags & ~IORING_MEM_REGION_TYPE_USER)
return -EINVAL;
if (!reg->user_addr)
/* user_addr should be set IFF it's a user memory backed region */
if ((reg->flags & IORING_MEM_REGION_TYPE_USER) != !!reg->user_addr)
return -EFAULT;
if (!reg->size || reg->mmap_offset || reg->id)
return -EINVAL;
@ -242,94 +209,120 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
if (check_add_overflow(reg->user_addr, reg->size, &end))
return -EOVERFLOW;
pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages);
if (IS_ERR(pages))
return PTR_ERR(pages);
nr_pages = reg->size >> PAGE_SHIFT;
if (ctx->user) {
ret = __io_account_mem(ctx->user, nr_pages);
if (ret)
goto out_free;
pages_accounted = nr_pages;
return ret;
}
vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
if (!vptr) {
ret = -ENOMEM;
goto out_free;
}
mr->pages = pages;
mr->vmap_ptr = vptr;
mr->nr_pages = nr_pages;
if (reg->flags & IORING_MEM_REGION_TYPE_USER)
ret = io_region_pin_pages(ctx, mr, reg);
else
ret = io_region_allocate_pages(ctx, mr, reg, mmap_offset);
if (ret)
goto out_free;
ret = io_region_init_ptr(mr);
if (ret)
goto out_free;
return 0;
out_free:
if (pages_accounted)
__io_unaccount_mem(ctx->user, pages_accounted);
io_pages_free(&pages, nr_pages);
io_free_region(ctx, mr);
return ret;
}
int io_create_region_mmap_safe(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
struct io_uring_region_desc *reg,
unsigned long mmap_offset)
{
struct io_mapped_region tmp_mr;
int ret;
memcpy(&tmp_mr, mr, sizeof(tmp_mr));
ret = io_create_region(ctx, &tmp_mr, reg, mmap_offset);
if (ret)
return ret;
/*
* Once published mmap can find it without holding only the ->mmap_lock
* and not ->uring_lock.
*/
guard(mutex)(&ctx->mmap_lock);
memcpy(mr, &tmp_mr, sizeof(tmp_mr));
return 0;
}
static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx,
loff_t pgoff)
{
loff_t offset = pgoff << PAGE_SHIFT;
unsigned int bgid;
switch (offset & IORING_OFF_MMAP_MASK) {
case IORING_OFF_SQ_RING:
case IORING_OFF_CQ_RING:
return &ctx->ring_region;
case IORING_OFF_SQES:
return &ctx->sq_region;
case IORING_OFF_PBUF_RING:
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
return io_pbuf_get_region(ctx, bgid);
case IORING_MAP_OFF_PARAM_REGION:
return &ctx->param_region;
}
return NULL;
}
static void *io_region_validate_mmap(struct io_ring_ctx *ctx,
struct io_mapped_region *mr)
{
lockdep_assert_held(&ctx->mmap_lock);
if (!io_region_is_set(mr))
return ERR_PTR(-EINVAL);
if (mr->flags & IO_REGION_F_USER_PROVIDED)
return ERR_PTR(-EINVAL);
return io_region_get_ptr(mr);
}
static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
size_t sz)
{
struct io_ring_ctx *ctx = file->private_data;
loff_t offset = pgoff << PAGE_SHIFT;
struct io_mapped_region *region;
switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) {
case IORING_OFF_SQ_RING:
case IORING_OFF_CQ_RING:
/* Don't allow mmap if the ring was setup without it */
if (ctx->flags & IORING_SETUP_NO_MMAP)
return ERR_PTR(-EINVAL);
if (!ctx->rings)
return ERR_PTR(-EFAULT);
return ctx->rings;
case IORING_OFF_SQES:
/* Don't allow mmap if the ring was setup without it */
if (ctx->flags & IORING_SETUP_NO_MMAP)
return ERR_PTR(-EINVAL);
if (!ctx->sq_sqes)
return ERR_PTR(-EFAULT);
return ctx->sq_sqes;
case IORING_OFF_PBUF_RING: {
struct io_buffer_list *bl;
unsigned int bgid;
void *ptr;
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
bl = io_pbuf_get_bl(ctx, bgid);
if (IS_ERR(bl))
return bl;
ptr = bl->buf_ring;
io_put_bl(ctx, bl);
return ptr;
}
}
return ERR_PTR(-EINVAL);
}
int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
struct page **pages, int npages)
{
unsigned long nr_pages = npages;
vm_flags_set(vma, VM_DONTEXPAND);
return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
region = io_mmap_get_region(ctx, pgoff);
if (!region)
return ERR_PTR(-EINVAL);
return io_region_validate_mmap(ctx, region);
}
#ifdef CONFIG_MMU
static int io_region_mmap(struct io_ring_ctx *ctx,
struct io_mapped_region *mr,
struct vm_area_struct *vma,
unsigned max_pages)
{
unsigned long nr_pages = min(mr->nr_pages, max_pages);
vm_flags_set(vma, VM_DONTEXPAND);
return vm_insert_pages(vma, vma->vm_start, mr->pages, &nr_pages);
}
__cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
struct io_ring_ctx *ctx = file->private_data;
size_t sz = vma->vm_end - vma->vm_start;
long offset = vma->vm_pgoff << PAGE_SHIFT;
unsigned int npages;
unsigned int page_limit = UINT_MAX;
struct io_mapped_region *region;
void *ptr;
guard(mutex)(&ctx->resize_lock);
guard(mutex)(&ctx->mmap_lock);
ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
if (IS_ERR(ptr))
@ -338,16 +331,12 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
switch (offset & IORING_OFF_MMAP_MASK) {
case IORING_OFF_SQ_RING:
case IORING_OFF_CQ_RING:
npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT);
return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages);
case IORING_OFF_SQES:
return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
ctx->n_sqe_pages);
case IORING_OFF_PBUF_RING:
return io_pbuf_mmap(file, vma);
page_limit = (sz + PAGE_SIZE - 1) >> PAGE_SHIFT;
break;
}
return -EINVAL;
region = io_mmap_get_region(ctx, vma->vm_pgoff);
return io_region_mmap(ctx, region, vma, page_limit);
}
unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
@ -365,7 +354,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
if (addr)
return -EINVAL;
guard(mutex)(&ctx->resize_lock);
guard(mutex)(&ctx->mmap_lock);
ptr = io_uring_validate_mmap_request(filp, pgoff, len);
if (IS_ERR(ptr))
@ -415,7 +404,7 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
struct io_ring_ctx *ctx = file->private_data;
void *ptr;
guard(mutex)(&ctx->resize_lock);
guard(mutex)(&ctx->mmap_lock);
ptr = io_uring_validate_mmap_request(file, pgoff, len);
if (IS_ERR(ptr))


@ -1,18 +1,9 @@
#ifndef IO_URING_MEMMAP_H
#define IO_URING_MEMMAP_H
#define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
void io_pages_free(struct page ***pages, int npages);
int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
struct page **pages, int npages);
void *io_pages_map(struct page ***out_pages, unsigned short *npages,
size_t size);
void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
bool put_pages);
void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
unsigned long uaddr, size_t size);
#ifndef CONFIG_MMU
unsigned int io_uring_nommu_mmap_capabilities(struct file *file);
@ -24,11 +15,17 @@ int io_uring_mmap(struct file *file, struct vm_area_struct *vma);
void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr);
int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
struct io_uring_region_desc *reg);
struct io_uring_region_desc *reg,
unsigned long mmap_offset);
int io_create_region_mmap_safe(struct io_ring_ctx *ctx,
struct io_mapped_region *mr,
struct io_uring_region_desc *reg,
unsigned long mmap_offset);
static inline void *io_region_get_ptr(struct io_mapped_region *mr)
{
return mr->vmap_ptr;
return mr->ptr;
}
static inline bool io_region_is_set(struct io_mapped_region *mr)


@ -354,10 +354,3 @@ int io_uring_sync_msg_ring(struct io_uring_sqe *sqe)
return __io_msg_ring_data(fd_file(f)->private_data,
&io_msg, IO_URING_F_UNLOCKED);
}
void io_msg_cache_free(const void *entry)
{
struct io_kiocb *req = (struct io_kiocb *) entry;
kmem_cache_free(req_cachep, req);
}


@ -4,4 +4,3 @@ int io_uring_sync_msg_ring(struct io_uring_sqe *sqe);
int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags);
void io_msg_ring_cleanup(struct io_kiocb *req);
void io_msg_cache_free(const void *entry);


@ -155,30 +155,31 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
}
}
static void io_msg_async_data_init(void *obj)
{
struct io_async_msghdr *hdr = (struct io_async_msghdr *)obj;
hdr->free_iov = NULL;
hdr->free_iov_nr = 0;
}
static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_async_msghdr *hdr;
hdr = io_alloc_cache_get(&ctx->netmsg_cache);
if (hdr) {
if (hdr->free_iov) {
kasan_mempool_unpoison_object(hdr->free_iov,
hdr->free_iov_nr * sizeof(struct iovec));
req->flags |= REQ_F_NEED_CLEANUP;
}
req->flags |= REQ_F_ASYNC_DATA;
req->async_data = hdr;
return hdr;
}
hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req,
io_msg_async_data_init);
if (!hdr)
return NULL;
if (!io_alloc_async_data(req)) {
hdr = req->async_data;
hdr->free_iov_nr = 0;
hdr->free_iov = NULL;
return hdr;
/* If the async data was cached, we might have an iov cached inside. */
if (hdr->free_iov) {
kasan_mempool_unpoison_object(hdr->free_iov,
hdr->free_iov_nr * sizeof(struct iovec));
req->flags |= REQ_F_NEED_CLEANUP;
}
return NULL;
return hdr;
}
/* assign new iovec to kmsg, if we need to */


@ -648,15 +648,12 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
if (req->flags & REQ_F_POLLED) {
apoll = req->apoll;
kfree(apoll->double_poll);
} else if (!(issue_flags & IO_URING_F_UNLOCKED)) {
apoll = io_alloc_cache_get(&ctx->apoll_cache);
if (!apoll)
goto alloc_apoll;
apoll->poll.retries = APOLL_MAX_RETRY;
} else {
alloc_apoll:
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
if (unlikely(!apoll))
if (!(issue_flags & IO_URING_F_UNLOCKED))
apoll = io_cache_alloc(&ctx->apoll_cache, GFP_ATOMIC, NULL);
else
apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
if (!apoll)
return NULL;
apoll->poll.retries = APOLL_MAX_RETRY;
}


@ -367,28 +367,19 @@ static int io_register_clock(struct io_ring_ctx *ctx,
* either mapping or freeing.
*/
struct io_ring_ctx_rings {
unsigned short n_ring_pages;
unsigned short n_sqe_pages;
struct page **ring_pages;
struct page **sqe_pages;
struct io_uring_sqe *sq_sqes;
struct io_rings *rings;
struct io_uring_sqe *sq_sqes;
struct io_mapped_region sq_region;
struct io_mapped_region ring_region;
};
static void io_register_free_rings(struct io_uring_params *p,
static void io_register_free_rings(struct io_ring_ctx *ctx,
struct io_uring_params *p,
struct io_ring_ctx_rings *r)
{
if (!(p->flags & IORING_SETUP_NO_MMAP)) {
io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages,
true);
io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages,
true);
} else {
io_pages_free(&r->ring_pages, r->n_ring_pages);
io_pages_free(&r->sqe_pages, r->n_sqe_pages);
vunmap(r->rings);
vunmap(r->sq_sqes);
}
io_free_region(ctx, &r->sq_region);
io_free_region(ctx, &r->ring_region);
}
#define swap_old(ctx, o, n, field) \
@ -403,11 +394,11 @@ static void io_register_free_rings(struct io_uring_params *p,
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_region_desc rd;
struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
size_t size, sq_array_offset;
struct io_uring_params p;
unsigned i, tail;
void *ptr;
int ret;
/* for single issuer, must be owner resizing */
@ -441,13 +432,18 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
if (size == SIZE_MAX)
return -EOVERFLOW;
if (!(p.flags & IORING_SETUP_NO_MMAP))
n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
else
n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
p.cq_off.user_addr, size);
if (IS_ERR(n.rings))
return PTR_ERR(n.rings);
memset(&rd, 0, sizeof(rd));
rd.size = PAGE_ALIGN(size);
if (p.flags & IORING_SETUP_NO_MMAP) {
rd.user_addr = p.cq_off.user_addr;
rd.flags |= IORING_MEM_REGION_TYPE_USER;
}
ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
if (ret) {
io_register_free_rings(ctx, &p, &n);
return ret;
}
n.rings = io_region_get_ptr(&n.ring_region);
n.rings->sq_ring_mask = p.sq_entries - 1;
n.rings->cq_ring_mask = p.cq_entries - 1;
@ -455,7 +451,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
n.rings->cq_ring_entries = p.cq_entries;
if (copy_to_user(arg, &p, sizeof(p))) {
io_register_free_rings(&p, &n);
io_register_free_rings(ctx, &p, &n);
return -EFAULT;
}
@ -464,20 +460,22 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
else
size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
if (size == SIZE_MAX) {
io_register_free_rings(&p, &n);
io_register_free_rings(ctx, &p, &n);
return -EOVERFLOW;
}
if (!(p.flags & IORING_SETUP_NO_MMAP))
ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size);
else
ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages,
p.sq_off.user_addr,
size);
if (IS_ERR(ptr)) {
io_register_free_rings(&p, &n);
return PTR_ERR(ptr);
memset(&rd, 0, sizeof(rd));
rd.size = PAGE_ALIGN(size);
if (p.flags & IORING_SETUP_NO_MMAP) {
rd.user_addr = p.sq_off.user_addr;
rd.flags |= IORING_MEM_REGION_TYPE_USER;
}
ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
if (ret) {
io_register_free_rings(ctx, &p, &n);
return ret;
}
n.sq_sqes = io_region_get_ptr(&n.sq_region);
/*
* If using SQPOLL, park the thread
@ -489,15 +487,15 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
}
/*
* We'll do the swap. Grab the ctx->resize_lock, which will exclude
* We'll do the swap. Grab the ctx->mmap_lock, which will exclude
* any new mmap's on the ring fd. Clear out existing mappings to prevent
* mmap from seeing them, as we'll unmap them. Any attempt to mmap
* existing rings beyond this point will fail. Not that it could proceed
* at this point anyway, as the io_uring mmap side needs go grab the
* ctx->resize_lock as well. Likewise, hold the completion lock over the
* ctx->mmap_lock as well. Likewise, hold the completion lock over the
* duration of the actual swap.
*/
mutex_lock(&ctx->resize_lock);
mutex_lock(&ctx->mmap_lock);
spin_lock(&ctx->completion_lock);
o.rings = ctx->rings;
ctx->rings = NULL;
@ -508,7 +506,6 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
* Now copy SQ and CQ entries, if any. If either of the destination
* rings can't hold what is already there, then fail the operation.
*/
n.sq_sqes = ptr;
tail = o.rings->sq.tail;
if (tail - o.rings->sq.head > p.sq_entries)
goto overflow;
@ -556,16 +553,14 @@ overflow:
ctx->rings = n.rings;
ctx->sq_sqes = n.sq_sqes;
swap_old(ctx, o, n, n_ring_pages);
swap_old(ctx, o, n, n_sqe_pages);
swap_old(ctx, o, n, ring_pages);
swap_old(ctx, o, n, sqe_pages);
swap_old(ctx, o, n, ring_region);
swap_old(ctx, o, n, sq_region);
to_free = &o;
ret = 0;
out:
spin_unlock(&ctx->completion_lock);
mutex_unlock(&ctx->resize_lock);
io_register_free_rings(&p, to_free);
mutex_unlock(&ctx->mmap_lock);
io_register_free_rings(ctx, &p, to_free);
if (ctx->sq_data)
io_sq_thread_unpark(ctx->sq_data);
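The resize path above repeats the same three-step pattern for the CQ/SQ rings and again for the SQE array: fill in a region descriptor, create the region under the mmap-safe helper, then fetch the kernel-side pointer. The following condensed sketch is illustrative only; the descriptor and region type names (struct io_uring_region_desc, struct io_mapped_region) are assumptions taken from the surrounding headers, while the calls and flags mirror the hunks above.

static int resize_create_region(struct io_ring_ctx *ctx,
				struct io_mapped_region *region,
				struct io_uring_params *p, u64 user_addr,
				size_t size, unsigned long mmap_offset,
				void **out_ptr)
{
	/* descriptor type assumed from the uapi header */
	struct io_uring_region_desc rd = { .size = PAGE_ALIGN(size) };
	int ret;

	if (p->flags & IORING_SETUP_NO_MMAP) {
		/* the application supplies the backing memory */
		rd.user_addr = user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}

	ret = io_create_region_mmap_safe(ctx, region, &rd, mmap_offset);
	if (ret)
		return ret;

	*out_ptr = io_region_get_ptr(region);
	return 0;
}

With a helper shaped like this, the two call sites above reduce to one call each for IORING_OFF_CQ_RING and IORING_OFF_SQES, with io_register_free_rings() still handling the error unwind.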
@ -588,7 +583,6 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
rd_uptr = u64_to_user_ptr(reg.region_uptr);
if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
return -EFAULT;
if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
return -EINVAL;
if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
@ -603,7 +597,8 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
!(ctx->flags & IORING_SETUP_R_DISABLED))
return -EINVAL;
ret = io_create_region(ctx, &ctx->param_region, &rd);
ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd,
IORING_MAP_OFF_PARAM_REGION);
if (ret)
return ret;
if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {

View File

@ -626,11 +626,12 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
return ret;
}
static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
struct io_imu_folio_data *data, int nr_folios)
static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
struct io_imu_folio_data *data)
{
struct page **page_array = *pages, **new_array = NULL;
int nr_pages_left = *nr_pages, i, j;
int nr_folios = data->nr_folios;
/* Store head pages only */
new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
@ -667,27 +668,21 @@ static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
return true;
}
static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
struct io_imu_folio_data *data)
bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
struct io_imu_folio_data *data)
{
struct page **page_array = *pages;
struct folio *folio = page_folio(page_array[0]);
unsigned int count = 1, nr_folios = 1;
int i;
if (*nr_pages <= 1)
return false;
data->nr_pages_mid = folio_nr_pages(folio);
if (data->nr_pages_mid == 1)
return false;
data->folio_shift = folio_shift(folio);
/*
* Check if pages are contiguous inside a folio, and all folios have
* the same page count except for the head and tail.
*/
for (i = 1; i < *nr_pages; i++) {
for (i = 1; i < nr_pages; i++) {
if (page_folio(page_array[i]) == folio &&
page_array[i] == page_array[i-1] + 1) {
count++;
@ -715,7 +710,8 @@ static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
if (nr_folios == 1)
data->nr_pages_head = count;
return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
data->nr_folios = nr_folios;
return true;
}
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
@ -729,7 +725,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
size_t size;
int ret, nr_pages, i;
struct io_imu_folio_data data;
bool coalesced;
bool coalesced = false;
if (!iov->iov_base)
return NULL;
@ -749,7 +745,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
}
/* If it's huge page(s), try to coalesce them into fewer bvec entries */
coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);
if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
if (data.nr_pages_mid != 1)
coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
}
imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
if (!imu)
@ -883,7 +882,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
* and advance us to the beginning.
*/
offset = buf_addr - imu->ubuf;
iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, len);
if (offset) {
/*
@ -905,7 +904,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
const struct bio_vec *bvec = imu->bvec;
if (offset < bvec->bv_len) {
iter->count -= offset;
iter->iov_offset = offset;
} else {
unsigned long seg_skip;
@ -916,7 +914,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
iter->bvec += seg_skip;
iter->nr_segs -= seg_skip;
iter->count -= bvec->bv_len + offset;
iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
}
}

View File

@ -40,6 +40,7 @@ struct io_imu_folio_data {
/* For non-head/tail folios, has to be fully included */
unsigned int nr_pages_mid;
unsigned int folio_shift;
unsigned int nr_folios;
};
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
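To make the coalescing bookkeeping concrete, here is a hedged, purely illustrative filling of io_imu_folio_data for a hypothetical 4 MiB registered buffer that exactly covers two 2 MiB huge-page folios; the numbers are invented for the example, while the field meanings come from the struct above and the io_check_coalesce_buffer()/io_coalesce_buffer() pair in the previous file.

/*
 * Hypothetical 4 MiB buffer backed by two fully-covered 2 MiB folios.
 * io_check_coalesce_buffer() would report roughly:
 */
struct io_imu_folio_data data = {
	.nr_pages_head = 512,	/* head folio fully covered: 2 MiB / 4 KiB */
	.nr_pages_mid  = 512,	/* pages per folio (2 MiB / 4 KiB) */
	.folio_shift   = 21,	/* 2 MiB folios */
	.nr_folios     = 2,
};

/*
 * io_coalesce_buffer() then shrinks the pinned page array from 1024
 * entries to the 2 folio head pages, so io_sqe_buffer_register() ends
 * up allocating an imu with 2 bvecs instead of 1024.
 */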
@ -66,6 +67,9 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type);
bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
struct io_imu_folio_data *data);
static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data,
int index)
{

View File

@ -202,45 +202,40 @@ static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags)
* mean that the underlying data can be gone at any time. But that
should be fixed separately, and then this check could be killed.
*/
if (!(req->flags & REQ_F_REFCOUNT)) {
if (!(req->flags & (REQ_F_REISSUE | REQ_F_REFCOUNT))) {
req->flags &= ~REQ_F_NEED_CLEANUP;
io_rw_recycle(req, issue_flags);
}
}
static void io_rw_async_data_init(void *obj)
{
struct io_async_rw *rw = (struct io_async_rw *)obj;
rw->free_iovec = NULL;
rw->bytes_done = 0;
}
static int io_rw_alloc_async(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_async_rw *rw;
rw = io_alloc_cache_get(&ctx->rw_cache);
if (rw) {
if (rw->free_iovec) {
kasan_mempool_unpoison_object(rw->free_iovec,
rw->free_iov_nr * sizeof(struct iovec));
req->flags |= REQ_F_NEED_CLEANUP;
}
req->flags |= REQ_F_ASYNC_DATA;
req->async_data = rw;
goto done;
rw = io_uring_alloc_async_data(&ctx->rw_cache, req, io_rw_async_data_init);
if (!rw)
return -ENOMEM;
if (rw->free_iovec) {
kasan_mempool_unpoison_object(rw->free_iovec,
rw->free_iov_nr * sizeof(struct iovec));
req->flags |= REQ_F_NEED_CLEANUP;
}
if (!io_alloc_async_data(req)) {
rw = req->async_data;
rw->free_iovec = NULL;
rw->free_iov_nr = 0;
done:
rw->bytes_done = 0;
return 0;
}
return -ENOMEM;
rw->bytes_done = 0;
return 0;
}
static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
{
struct io_async_rw *rw;
int ret;
if (io_rw_alloc_async(req))
return -ENOMEM;
@ -249,12 +244,48 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
return 0;
rw = req->async_data;
ret = io_import_iovec(ddir, req, rw, 0);
return io_import_iovec(ddir, req, rw, 0);
}
static inline void io_meta_save_state(struct io_async_rw *io)
{
io->meta_state.seed = io->meta.seed;
iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta);
}
static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb)
{
if (kiocb->ki_flags & IOCB_HAS_METADATA) {
io->meta.seed = io->meta_state.seed;
iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta);
}
}
static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir,
u64 attr_ptr, u64 attr_type_mask)
{
struct io_uring_attr_pi pi_attr;
struct io_async_rw *io;
int ret;
if (copy_from_user(&pi_attr, u64_to_user_ptr(attr_ptr),
sizeof(pi_attr)))
return -EFAULT;
if (pi_attr.rsvd)
return -EINVAL;
io = req->async_data;
io->meta.flags = pi_attr.flags;
io->meta.app_tag = pi_attr.app_tag;
io->meta.seed = pi_attr.seed;
ret = import_ubuf(ddir, u64_to_user_ptr(pi_attr.addr),
pi_attr.len, &io->meta.iter);
if (unlikely(ret < 0))
return ret;
iov_iter_save_state(&rw->iter, &rw->iter_state);
return 0;
req->flags |= REQ_F_HAS_METADATA;
io_meta_save_state(io);
return ret;
}
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
@ -262,6 +293,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
unsigned ioprio;
u64 attr_type_mask;
int ret;
rw->kiocb.ki_pos = READ_ONCE(sqe->off);
@ -279,11 +311,28 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
rw->kiocb.ki_ioprio = get_current_ioprio();
}
rw->kiocb.dio_complete = NULL;
rw->kiocb.ki_flags = 0;
rw->addr = READ_ONCE(sqe->addr);
rw->len = READ_ONCE(sqe->len);
rw->flags = READ_ONCE(sqe->rw_flags);
return io_prep_rw_setup(req, ddir, do_import);
ret = io_prep_rw_setup(req, ddir, do_import);
if (unlikely(ret))
return ret;
attr_type_mask = READ_ONCE(sqe->attr_type_mask);
if (attr_type_mask) {
u64 attr_ptr;
/* only PI attribute is supported currently */
if (attr_type_mask != IORING_RW_ATTR_FLAG_PI)
return -EINVAL;
attr_ptr = READ_ONCE(sqe->attr_ptr);
ret = io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask);
}
return ret;
}
int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe)
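From the application side, the new attr_type_mask/attr_ptr SQE fields are what feed io_prep_rw_pi() above. The sketch below is a hedged userspace example, assuming liburing for queue management and a uapi header recent enough to carry struct io_uring_attr_pi and the attr_* SQE fields; the field names are taken from the hunks above, while exact widths and the PI check flags live in the header.

#include <liburing.h>
#include <stdint.h>
#include <string.h>

/* Queue one O_DIRECT read that also returns protection information. */
static int queue_pi_read(struct io_uring *ring, int fd,
			 void *data, unsigned data_len,
			 void *pi_buf, unsigned pi_len)
{
	struct io_uring_attr_pi pi;
	struct io_uring_sqe *sqe;

	memset(&pi, 0, sizeof(pi));		/* rsvd must be zero or prep fails */
	pi.addr = (uint64_t)(uintptr_t)pi_buf;	/* integrity buffer */
	pi.len = pi_len;
	/* pi.flags, pi.app_tag, pi.seed: whatever checks the device needs */

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -1;
	io_uring_prep_read(sqe, fd, data, data_len, 0);
	sqe->attr_type_mask = IORING_RW_ATTR_FLAG_PI;	/* only PI is defined so far */
	sqe->attr_ptr = (uint64_t)(uintptr_t)&pi;
	return io_uring_submit(ring);
}

The kernel copies the attribute struct at prep time (the copy_from_user() above), but pi_buf itself is imported as the metadata iterator and must stay valid until the request completes.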
@ -385,7 +434,8 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
void io_readv_writev_cleanup(struct io_kiocb *req)
{
io_rw_iovec_free(req->async_data);
lockdep_assert_held(&req->ctx->uring_lock);
io_rw_recycle(req, 0);
}
static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
@ -405,17 +455,12 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
return NULL;
}
#ifdef CONFIG_BLOCK
static void io_resubmit_prep(struct io_kiocb *req)
{
struct io_async_rw *io = req->async_data;
iov_iter_restore(&io->iter, &io->iter_state);
}
static bool io_rw_should_reissue(struct io_kiocb *req)
{
#ifdef CONFIG_BLOCK
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
umode_t mode = file_inode(req->file)->i_mode;
struct io_async_rw *io = req->async_data;
struct io_ring_ctx *ctx = req->ctx;
if (!S_ISBLK(mode) && !S_ISREG(mode))
@ -430,23 +475,14 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
*/
if (percpu_ref_is_dying(&ctx->refs))
return false;
/*
* Play it safe and assume not safe to re-import and reissue if we're
* not in the original thread group (or in task context).
*/
if (!same_thread_group(req->tctx->task, current) || !in_task())
return false;
io_meta_restore(io, &rw->kiocb);
iov_iter_restore(&io->iter, &io->iter_state);
return true;
}
#else
static void io_resubmit_prep(struct io_kiocb *req)
{
}
static bool io_rw_should_reissue(struct io_kiocb *req)
{
return false;
}
#endif
}
static void io_req_end_write(struct io_kiocb *req)
{
@ -473,22 +509,16 @@ static void io_req_io_end(struct io_kiocb *req)
}
}
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
static void __io_complete_rw_common(struct io_kiocb *req, long res)
{
if (unlikely(res != req->cqe.res)) {
if (res == -EAGAIN && io_rw_should_reissue(req)) {
/*
* Reissue will start accounting again, finish the
* current cycle.
*/
io_req_io_end(req);
req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
return true;
}
if (res == req->cqe.res)
return;
if (res == -EAGAIN && io_rw_should_reissue(req)) {
req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
} else {
req_set_fail(req);
req->cqe.res = res;
}
return false;
}
static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
@ -531,8 +561,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
struct io_kiocb *req = cmd_to_io_kiocb(rw);
if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) {
if (__io_complete_rw_common(req, res))
return;
__io_complete_rw_common(req, res);
io_req_set_res(req, io_fixup_rw_res(req, res), 0);
}
req->io_task_work.func = io_req_rw_complete;
@ -594,26 +623,19 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
if (ret >= 0 && req->flags & REQ_F_CUR_POS)
req->file->f_pos = rw->kiocb.ki_pos;
if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) {
if (!__io_complete_rw_common(req, ret)) {
/*
* Safe to call io_end from here as we're inline
* from the submission path.
*/
io_req_io_end(req);
io_req_set_res(req, final_ret,
io_put_kbuf(req, ret, issue_flags));
io_req_rw_cleanup(req, issue_flags);
return IOU_OK;
}
__io_complete_rw_common(req, ret);
/*
* Safe to call io_end from here as we're inline
* from the submission path.
*/
io_req_io_end(req);
io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags));
io_req_rw_cleanup(req, issue_flags);
return IOU_OK;
} else {
io_rw_done(&rw->kiocb, ret);
}
if (req->flags & REQ_F_REISSUE) {
req->flags &= ~REQ_F_REISSUE;
io_resubmit_prep(req);
return -EAGAIN;
}
return IOU_ISSUE_SKIP_COMPLETE;
}
@ -736,8 +758,11 @@ static bool io_rw_should_retry(struct io_kiocb *req)
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
struct kiocb *kiocb = &rw->kiocb;
/* never retry for NOWAIT, we just complete with -EAGAIN */
if (req->flags & REQ_F_NOWAIT)
/*
* Never retry for NOWAIT or a request with metadata, we just complete
* with -EAGAIN.
*/
if (req->flags & (REQ_F_NOWAIT | REQ_F_HAS_METADATA))
return false;
/* Only for buffered IO */
@ -828,6 +853,19 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
kiocb->ki_complete = io_complete_rw;
}
if (req->flags & REQ_F_HAS_METADATA) {
struct io_async_rw *io = req->async_data;
/*
* We have a union of meta fields with wpq used for buffered-io
* in io_async_rw, so fail it here.
*/
if (!(req->file->f_flags & O_DIRECT))
return -EOPNOTSUPP;
kiocb->ki_flags |= IOCB_HAS_METADATA;
kiocb->private = &io->meta;
}
return 0;
}
@ -876,8 +914,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
if (ret == -EOPNOTSUPP && force_nonblock)
ret = -EAGAIN;
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
if (ret == -EAGAIN) {
/* If we can poll, just do that. */
if (io_file_can_poll(req))
return -EAGAIN;
@ -902,6 +939,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
* manually if we need to.
*/
iov_iter_restore(&io->iter, &io->iter_state);
io_meta_restore(io, kiocb);
do {
/*
@ -1087,11 +1125,6 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
else
ret2 = -EINVAL;
if (req->flags & REQ_F_REISSUE) {
req->flags &= ~REQ_F_REISSUE;
ret2 = -EAGAIN;
}
/*
* Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
* retry them without IOCB_NOWAIT.
@ -1127,6 +1160,7 @@ done:
} else {
ret_eagain:
iov_iter_restore(&io->iter, &io->iter_state);
io_meta_restore(io, kiocb);
if (kiocb->ki_flags & IOCB_WRITE)
io_req_end_write(req);
return -EAGAIN;
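Both the read and write -EAGAIN paths above restore the data iterator and, via io_meta_restore(), the PI iterator and seed before the request is retried. A minimal sketch of that save/restore idiom using only the generic iov_iter API follows; issue_fn is a hypothetical stand-in for the actual submission call.

/* purely illustrative: snapshot the iterator, rewind it on -EAGAIN */
static ssize_t issue_with_rewind(struct iov_iter *iter,
				 ssize_t (*issue_fn)(struct iov_iter *))
{
	struct iov_iter_state state;
	ssize_t ret;

	iov_iter_save_state(iter, &state);	/* cheap snapshot before the first attempt */
	ret = issue_fn(iter);
	if (ret == -EAGAIN) {
		/* the attempt may have consumed part of the iterator; rewind it */
		iov_iter_restore(iter, &state);
		ret = issue_fn(iter);
	}
	return ret;
}

io_meta_save_state()/io_meta_restore() apply exactly this pattern to io->meta.iter, plus the integrity seed, which the iov_iter state does not cover.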

View File

@ -2,6 +2,11 @@
#include <linux/pagemap.h>
struct io_meta_state {
u32 seed;
struct iov_iter_state iter_meta;
};
struct io_async_rw {
size_t bytes_done;
struct iov_iter iter;
@ -9,7 +14,14 @@ struct io_async_rw {
struct iovec fast_iov;
struct iovec *free_iovec;
int free_iov_nr;
struct wait_page_queue wpq;
/* wpq is for buffered io, while meta fields are used with direct io */
union {
struct wait_page_queue wpq;
struct {
struct uio_meta meta;
struct io_meta_state meta_state;
};
};
};
int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
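Because wpq and the metadata fields share storage, a request can only ever use one of the two, which is why io_rw_init_file() above rejects metadata on non-O_DIRECT files. The toy program below is a standalone illustration of the aliasing (plain C, nothing io_uring-specific; the member names and sizes are invented).

#include <assert.h>
#include <string.h>

struct demo_state {
	union {
		struct { char waitq[48]; } buffered;			/* stand-in for wpq */
		struct { char meta[24]; char meta_state[24]; } pi;	/* stand-in for the meta fields */
	};
};

int main(void)
{
	struct demo_state s;

	memset(s.buffered.waitq, 0xaa, sizeof(s.buffered.waitq));
	memset(s.pi.meta, 0x55, sizeof(s.pi.meta));
	/* the buffered-io state is gone: both members occupy the same bytes */
	assert(s.buffered.waitq[0] == 0x55);
	return 0;
}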

View File

@ -544,10 +544,9 @@ static int __io_timeout_prep(struct io_kiocb *req,
if (WARN_ON_ONCE(req_has_async_data(req)))
return -EFAULT;
if (io_alloc_async_data(req))
data = io_uring_alloc_async_data_nocache(req);
if (!data)
return -ENOMEM;
data = req->async_data;
data->req = req;
data->flags = flags;

View File

@ -16,26 +16,6 @@
#include "rsrc.h"
#include "uring_cmd.h"
static struct io_uring_cmd_data *io_uring_async_get(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_uring_cmd_data *cache;
cache = io_alloc_cache_get(&ctx->uring_cache);
if (cache) {
cache->op_data = NULL;
req->flags |= REQ_F_ASYNC_DATA;
req->async_data = cache;
return cache;
}
if (!io_alloc_async_data(req)) {
cache = req->async_data;
cache->op_data = NULL;
return cache;
}
return NULL;
}
static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
@ -188,14 +168,22 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2,
}
EXPORT_SYMBOL_GPL(io_uring_cmd_done);
static void io_uring_cmd_init_once(void *obj)
{
struct io_uring_cmd_data *data = obj;
data->op_data = NULL;
}
static int io_uring_cmd_prep_setup(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
struct io_uring_cmd_data *cache;
cache = io_uring_async_get(req);
if (unlikely(!cache))
cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req,
io_uring_cmd_init_once);
if (!cache)
return -ENOMEM;
if (!(req->flags & REQ_F_FORCE_ASYNC)) {

View File

@ -303,10 +303,10 @@ int io_waitid(struct io_kiocb *req, unsigned int issue_flags)
struct io_waitid_async *iwa;
int ret;
if (io_alloc_async_data(req))
iwa = io_uring_alloc_async_data_nocache(req);
if (!iwa)
return -ENOMEM;
iwa = req->async_data;
iwa->req = req;
ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info,