2019-05-27 08:55:01 +02:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Berkeley style UIO structures - Alan Cox 1994.
|
|
|
|
*/
|
2012-10-13 10:46:48 +01:00
|
|
|
#ifndef __LINUX_UIO_H
|
|
|
|
#define __LINUX_UIO_H
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2013-11-27 16:29:46 -08:00
|
|
|
#include <linux/kernel.h>
|
2017-06-29 21:45:10 -04:00
|
|
|
#include <linux/thread_info.h>
|
2021-10-18 10:39:06 -04:00
|
|
|
#include <linux/mm_types.h>
|
2012-10-13 10:46:48 +01:00
|
|
|
#include <uapi/linux/uio.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2013-11-27 16:29:46 -08:00
|
|
|
struct page;
|
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.
The folio_queue structure looks like:
struct folio_queue {
struct folio_batch vec;
u8 orders[PAGEVEC_SIZE];
struct folio_queue *next;
struct folio_queue *prev;
unsigned long marks;
unsigned long marks2;
};
It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.
A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions. Two mark bits are
available per slot. The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.
The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.
With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock. Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we go without the need for locks.
[Questions/thoughts]
(1) To manage this, I need a head pointer, a tail pointer, a tail slot
number (assuming insertion happens at the tail end and the next
pointers point from head to tail). Should I put these into a struct
of their own, say "folio_queue_head" or "rolling_buffer"?
I will end up with two of these in netfs_io_request eventually, one
keeping track of the pagecache I'm dealing with for buffered I/O and
the other to hold a bounce buffer when we need one.
(2) Should I make the slots {folio,off,len} or bio_vec?
(3) This is intended to replace ITER_XARRAY eventually. Using an xarray
in I/O iteration requires the taking of the RCU read lock, doing
copying under the RCU read lock, walking the xarray (which may change
under us), handling retries and dealing with special values.
The advantage of ITER_XARRAY is that when we're dealing with the
pagecache directly, we don't need any allocation - but if we're doing
encrypted comms, there's a good chance we'd be using a bounce buffer
anyway.
This will require afs, erofs, cifs, orangefs and fscache to be
converted to not use this. afs still uses it for dirs and symlinks;
some of erofs usages should be easy to change, but there's one which
won't be so easy; ceph's use via fscache can be fixed by porting ceph
to netfslib; cifs is using xarray as a bounce buffer - that can be
moved to use sheaves instead; and orangefs has a similar problem to
erofs - maybe orangefs could use netfslib?
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-19 00:20:42 +01:00
|
|
|
struct folio_queue;
|
2009-07-29 15:04:19 -07:00
|
|
|
|
2023-01-19 12:47:23 +00:00
|
|
|
typedef unsigned int __bitwise iov_iter_extraction_t;
|
|
|
|
|
2009-07-29 15:04:19 -07:00
|
|
|
/*
 * A single buffer described by a base pointer and a length.  The field names
 * mirror those of struct iovec.
 */
struct kvec {
	void *iov_base; /* and that should *never* hold a userland pointer */
	size_t iov_len;
};
|
|
|
|
|
2018-10-22 13:07:28 +01:00
|
|
|
/*
 * The kinds of iterator.  One of these values is kept in the iter_type
 * field of struct iov_iter and is what iov_iter_type() returns.
 *
 * The enumerators are numbered implicitly from 0 in declaration order.
 */
enum iter_type {
	/* iter types */
	ITER_UBUF,
	ITER_IOVEC,
	ITER_BVEC,
	ITER_KVEC,
	ITER_FOLIOQ,
	ITER_XARRAY,
	ITER_DISCARD,
};
|
|
|
|
|
2022-09-15 20:25:47 -04:00
|
|
|
/* Data direction of an iterator; the values equal WRITE and READ. */
#define ITER_SOURCE	1	// == WRITE
#define ITER_DEST	0	// == READ
|
|
|
|
|
2021-09-10 11:18:36 -06:00
|
|
|
/*
 * Snapshot of the iov_offset, count and nr_segs fields of an iov_iter, as
 * filled in by iov_iter_save_state().
 */
struct iov_iter_state {
	size_t iov_offset;
	size_t count;
	unsigned long nr_segs;
};
|
|
|
|
|
2013-11-27 16:29:46 -08:00
|
|
|
struct iov_iter {
|
2021-04-22 14:50:39 -04:00
|
|
|
u8 iter_type;
|
2021-07-12 12:06:14 +02:00
|
|
|
bool nofault;
|
2021-04-22 14:50:39 -04:00
|
|
|
bool data_source;
|
2023-09-25 13:02:58 +01:00
|
|
|
size_t iov_offset;
|
2023-03-28 14:21:06 -06:00
|
|
|
/*
|
|
|
|
* Hack alert: overlay ubuf_iovec with iovec + count, so
|
|
|
|
* that the members resolve correctly regardless of the type
|
|
|
|
* of iterator used. This means that you can use:
|
|
|
|
*
|
|
|
|
* &iter->__ubuf_iovec or iter->__iov
|
|
|
|
*
|
|
|
|
* interchangably for the user_backed cases, hence simplifying
|
|
|
|
* some of the cases that need to deal with both.
|
|
|
|
*/
|
2014-04-04 23:12:29 -04:00
|
|
|
union {
|
2023-03-28 14:21:06 -06:00
|
|
|
/*
|
|
|
|
* This really should be a const, but we cannot do that without
|
|
|
|
* also modifying any of the zero-filling iter init functions.
|
|
|
|
* Leave it non-const for now, but it should be treated as such.
|
|
|
|
*/
|
|
|
|
struct iovec __ubuf_iovec;
|
|
|
|
struct {
|
|
|
|
union {
|
|
|
|
/* use iter_iov() to get the current vec */
|
|
|
|
const struct iovec *__iov;
|
|
|
|
const struct kvec *kvec;
|
|
|
|
const struct bio_vec *bvec;
|
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.
The folio_queue structure looks like:
struct folio_queue {
struct folio_batch vec;
u8 orders[PAGEVEC_SIZE];
struct folio_queue *next;
struct folio_queue *prev;
unsigned long marks;
unsigned long marks2;
};
It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.
A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions. Two mark bits are
available per slot. The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.
The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.
With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock. Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we without the need for locks.
[Questions/thoughts]
(1) To manage this, I need a head pointer, a tail pointer, a tail slot
number (assuming insertion happens at the tail end and the next
pointers point from head to tail). Should I put these into a struct
of their own, say "folio_queue_head" or "rolling_buffer"?
I will end up with two of these in netfs_io_request eventually, one
keeping track of the pagecache I'm dealing with for buffered I/O and
the other to hold a bounce buffer when we need one.
(2) Should I make the slots {folio,off,len} or bio_vec?
(3) This is intended to replace ITER_XARRAY eventually. Using an xarray
in I/O iteration requires the taking of the RCU read lock, doing
copying under the RCU read lock, walking the xarray (which may change
under us), handling retries and dealing with special values.
The advantage of ITER_XARRAY is that when we're dealing with the
pagecache directly, we don't need any allocation - but if we're doing
encrypted comms, there's a good chance we'd be using a bounce buffer
anyway.
This will require afs, erofs, cifs, orangefs and fscache to be
converted to not use this. afs still uses it for dirs and symlinks;
some of erofs usages should be easy to change, but there's one which
won't be so easy; ceph's use via fscache can be fixed by porting ceph
to netfslib; cifs is using xarray as a bounce buffer - that can be
moved to use sheaves instead; and orangefs has a similar problem to
erofs - maybe orangefs could use netfslib?
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-19 00:20:42 +01:00
|
|
|
const struct folio_queue *folioq;
|
2023-03-28 14:21:06 -06:00
|
|
|
struct xarray *xarray;
|
|
|
|
void __user *ubuf;
|
|
|
|
};
|
|
|
|
size_t count;
|
|
|
|
};
|
2016-09-22 16:33:12 -04:00
|
|
|
};
|
|
|
|
union {
|
|
|
|
unsigned long nr_segs;
|
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.
The folio_queue structure looks like:
struct folio_queue {
struct folio_batch vec;
u8 orders[PAGEVEC_SIZE];
struct folio_queue *next;
struct folio_queue *prev;
unsigned long marks;
unsigned long marks2;
};
It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.
A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions. Two mark bits are
available per slot. The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.
The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.
With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock. Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we without the need for locks.
[Questions/thoughts]
(1) To manage this, I need a head pointer, a tail pointer, a tail slot
number (assuming insertion happens at the tail end and the next
pointers point from head to tail). Should I put these into a struct
of their own, say "folio_queue_head" or "rolling_buffer"?
I will end up with two of these in netfs_io_request eventually, one
keeping track of the pagecache I'm dealing with for buffered I/O and
the other to hold a bounce buffer when we need one.
(2) Should I make the slots {folio,off,len} or bio_vec?
(3) This is intended to replace ITER_XARRAY eventually. Using an xarray
in I/O iteration requires the taking of the RCU read lock, doing
copying under the RCU read lock, walking the xarray (which may change
under us), handling retries and dealing with special values.
The advantage of ITER_XARRAY is that when we're dealing with the
pagecache directly, we don't need any allocation - but if we're doing
encrypted comms, there's a good chance we'd be using a bounce buffer
anyway.
This will require afs, erofs, cifs, orangefs and fscache to be
converted to not use this. afs still uses it for dirs and symlinks;
some of erofs usages should be easy to change, but there's one which
won't be so easy; ceph's use via fscache can be fixed by porting ceph
to netfslib; cifs is using xarray as a bounce buffer - that can be
moved to use sheaves instead; and orangefs has a similar problem to
erofs - maybe orangefs could use netfslib?
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-19 00:20:42 +01:00
|
|
|
u8 folioq_slot;
|
2020-02-10 10:00:21 +00:00
|
|
|
loff_t xarray_start;
|
2014-04-04 23:12:29 -04:00
|
|
|
};
|
2013-11-27 16:29:46 -08:00
|
|
|
};
|
|
|
|
|
2023-03-28 14:21:06 -06:00
|
|
|
static inline const struct iovec *iter_iov(const struct iov_iter *iter)
|
|
|
|
{
|
|
|
|
if (iter->iter_type == ITER_UBUF)
|
|
|
|
return (const struct iovec *) &iter->__ubuf_iovec;
|
|
|
|
return iter->__iov;
|
|
|
|
}
|
|
|
|
|
2023-03-29 09:16:45 -06:00
|
|
|
/*
 * iter_iov_addr(): iov_base of the iovec returned by iter_iov(), advanced by
 * the iterator's iov_offset.
 * iter_iov_len(): iov_len of that iovec minus the iterator's iov_offset.
 *
 * Both macros expand @iter twice, so the argument must have no side effects.
 */
#define iter_iov_addr(iter)	(iter_iov(iter)->iov_base + (iter)->iov_offset)
#define iter_iov_len(iter)	(iter_iov(iter)->iov_len - (iter)->iov_offset)
|
2023-03-29 08:52:15 -06:00
|
|
|
|
2018-10-22 13:07:28 +01:00
|
|
|
static inline enum iter_type iov_iter_type(const struct iov_iter *i)
|
|
|
|
{
|
2021-04-22 14:50:39 -04:00
|
|
|
return i->iter_type;
|
2018-10-22 13:07:28 +01:00
|
|
|
}
|
|
|
|
|
2021-09-10 11:18:36 -06:00
|
|
|
static inline void iov_iter_save_state(struct iov_iter *iter,
|
|
|
|
struct iov_iter_state *state)
|
|
|
|
{
|
|
|
|
state->iov_offset = iter->iov_offset;
|
|
|
|
state->count = iter->count;
|
|
|
|
state->nr_segs = iter->nr_segs;
|
|
|
|
}
|
|
|
|
|
2022-05-22 14:59:25 -04:00
|
|
|
static inline bool iter_is_ubuf(const struct iov_iter *i)
|
|
|
|
{
|
|
|
|
return iov_iter_type(i) == ITER_UBUF;
|
|
|
|
}
|
|
|
|
|
2018-10-22 13:07:28 +01:00
|
|
|
static inline bool iter_is_iovec(const struct iov_iter *i)
|
|
|
|
{
|
|
|
|
return iov_iter_type(i) == ITER_IOVEC;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool iov_iter_is_kvec(const struct iov_iter *i)
|
|
|
|
{
|
|
|
|
return iov_iter_type(i) == ITER_KVEC;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool iov_iter_is_bvec(const struct iov_iter *i)
|
|
|
|
{
|
|
|
|
return iov_iter_type(i) == ITER_BVEC;
|
|
|
|
}
|
|
|
|
|
2018-10-20 00:57:56 +01:00
|
|
|
static inline bool iov_iter_is_discard(const struct iov_iter *i)
|
|
|
|
{
|
|
|
|
return iov_iter_type(i) == ITER_DISCARD;
|
|
|
|
}
|
|
|
|
|
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.
The folio_queue structure looks like:
struct folio_queue {
struct folio_batch vec;
u8 orders[PAGEVEC_SIZE];
struct folio_queue *next;
struct folio_queue *prev;
unsigned long marks;
unsigned long marks2;
};
It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.
A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions. Two mark bits are
available per slot. The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.
The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.
With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock. Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we go without the need for locks.
[Questions/thoughts]
(1) To manage this, I need a head pointer, a tail pointer, a tail slot
number (assuming insertion happens at the tail end and the next
pointers point from head to tail). Should I put these into a struct
of their own, say "folio_queue_head" or "rolling_buffer"?
I will end up with two of these in netfs_io_request eventually, one
keeping track of the pagecache I'm dealing with for buffered I/O and
the other to hold a bounce buffer when we need one.
(2) Should I make the slots {folio,off,len} or bio_vec?
(3) This is intended to replace ITER_XARRAY eventually. Using an xarray
in I/O iteration requires the taking of the RCU read lock, doing
copying under the RCU read lock, walking the xarray (which may change
under us), handling retries and dealing with special values.
The advantage of ITER_XARRAY is that when we're dealing with the
pagecache directly, we don't need any allocation - but if we're doing
encrypted comms, there's a good chance we'd be using a bounce buffer
anyway.
This will require afs, erofs, cifs, orangefs and fscache to be
converted to not use this. afs still uses it for dirs and symlinks;
some of erofs usages should be easy to change, but there's one which
won't be so easy; ceph's use via fscache can be fixed by porting ceph
to netfslib; cifs is using xarray as a bounce buffer - that can be
moved to use sheaves instead; and orangefs has a similar problem to
erofs - maybe orangefs could use netfslib?
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-19 00:20:42 +01:00
|
|
|
static inline bool iov_iter_is_folioq(const struct iov_iter *i)
|
|
|
|
{
|
|
|
|
return iov_iter_type(i) == ITER_FOLIOQ;
|
|
|
|
}
|
|
|
|
|
2020-02-10 10:00:21 +00:00
|
|
|
static inline bool iov_iter_is_xarray(const struct iov_iter *i)
|
|
|
|
{
|
|
|
|
return iov_iter_type(i) == ITER_XARRAY;
|
|
|
|
}
|
|
|
|
|
2018-10-22 13:07:28 +01:00
|
|
|
static inline unsigned char iov_iter_rw(const struct iov_iter *i)
|
|
|
|
{
|
2021-04-22 14:50:39 -04:00
|
|
|
return i->data_source ? WRITE : READ;
|
2018-10-22 13:07:28 +01:00
|
|
|
}
|
|
|
|
|
2022-05-22 14:59:25 -04:00
|
|
|
/* Return true if @i is either an ITER_UBUF or an ITER_IOVEC iterator. */
static inline bool user_backed_iter(const struct iov_iter *i)
{
	return iter_is_ubuf(i) || iter_is_iovec(i);
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
 * Total number of bytes covered by an iovec.
 *
 * @iov:     array of segments to sum
 * @nr_segs: number of entries of @iov to include
 *
 * Returns the sum of the iov_len fields of the first @nr_segs entries, or 0
 * when @nr_segs is 0.
 *
 * NOTE that it is not safe to use this function until all the iovec's
 * segment lengths have been validated.  Because the individual lengths can
 * overflow a size_t when added together.
 */
static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
{
	unsigned long seg;
	size_t ret = 0;

	for (seg = 0; seg < nr_segs; seg++)
		ret += iov[seg].iov_len;
	return ret;
}
|
|
|
|
|
2023-07-09 18:17:33 -04:00
|
|
|
size_t copy_page_from_iter_atomic(struct page *page, size_t offset,
|
2021-04-30 10:26:41 -04:00
|
|
|
size_t bytes, struct iov_iter *i);
|
2013-11-27 16:29:46 -08:00
|
|
|
void iov_iter_advance(struct iov_iter *i, size_t bytes);
|
2017-02-17 18:42:24 -05:00
|
|
|
void iov_iter_revert(struct iov_iter *i, size_t bytes);
|
2021-08-02 14:54:16 +02:00
|
|
|
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
|
2021-07-05 17:26:28 +02:00
|
|
|
size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes);
|
2013-11-27 16:29:46 -08:00
|
|
|
size_t iov_iter_single_seg_count(const struct iov_iter *i);
|
2014-02-03 17:07:03 -05:00
|
|
|
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
|
|
|
|
struct iov_iter *i);
|
2014-04-03 15:05:18 -04:00
|
|
|
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
|
|
|
|
struct iov_iter *i);
|
2017-06-29 21:45:10 -04:00
|
|
|
|
|
|
|
size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
|
|
|
|
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i);
|
|
|
|
size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);
|
|
|
|
|
2021-10-18 10:39:06 -04:00
|
|
|
static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset,
|
|
|
|
size_t bytes, struct iov_iter *i)
|
|
|
|
{
|
|
|
|
return copy_page_to_iter(&folio->page, offset, bytes, i);
|
|
|
|
}
|
2023-07-09 18:17:33 -04:00
|
|
|
|
2024-08-14 16:14:21 +01:00
|
|
|
static inline size_t copy_folio_from_iter(struct folio *folio, size_t offset,
|
|
|
|
size_t bytes, struct iov_iter *i)
|
|
|
|
{
|
|
|
|
return copy_page_from_iter(&folio->page, offset, bytes, i);
|
|
|
|
}
|
|
|
|
|
2023-07-09 18:17:33 -04:00
|
|
|
static inline size_t copy_folio_from_iter_atomic(struct folio *folio,
|
|
|
|
size_t offset, size_t bytes, struct iov_iter *i)
|
|
|
|
{
|
|
|
|
return copy_page_from_iter_atomic(&folio->page, offset, bytes, i);
|
|
|
|
}
|
|
|
|
|
2023-03-22 18:57:03 +00:00
|
|
|
size_t copy_page_to_iter_nofault(struct page *page, unsigned offset,
|
|
|
|
size_t bytes, struct iov_iter *i);
|
2021-10-18 10:39:06 -04:00
|
|
|
|
2017-06-29 21:45:10 -04:00
|
|
|
static __always_inline __must_check
|
|
|
|
size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
|
|
|
|
{
|
2022-06-06 18:42:59 -04:00
|
|
|
if (check_copy_size(addr, bytes, true))
|
2017-06-29 21:45:10 -04:00
|
|
|
return _copy_to_iter(addr, bytes, i);
|
2022-06-06 18:42:59 -04:00
|
|
|
return 0;
|
2017-06-29 21:45:10 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
static __always_inline __must_check
|
|
|
|
size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
|
|
|
|
{
|
2022-06-06 18:42:59 -04:00
|
|
|
if (check_copy_size(addr, bytes, false))
|
2017-06-29 21:45:10 -04:00
|
|
|
return _copy_from_iter(addr, bytes, i);
|
2022-06-06 18:42:59 -04:00
|
|
|
return 0;
|
2017-06-29 21:45:10 -04:00
|
|
|
}
|
|
|
|
|
2024-04-07 02:42:36 -04:00
|
|
|
static __always_inline __must_check
|
|
|
|
bool copy_to_iter_full(const void *addr, size_t bytes, struct iov_iter *i)
|
|
|
|
{
|
|
|
|
size_t copied = copy_to_iter(addr, bytes, i);
|
|
|
|
if (likely(copied == bytes))
|
|
|
|
return true;
|
|
|
|
iov_iter_revert(i, copied);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2017-06-29 21:45:10 -04:00
|
|
|
static __always_inline __must_check
|
|
|
|
bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
|
|
|
|
{
|
2021-04-29 21:16:56 -04:00
|
|
|
size_t copied = copy_from_iter(addr, bytes, i);
|
|
|
|
if (likely(copied == bytes))
|
|
|
|
return true;
|
|
|
|
iov_iter_revert(i, copied);
|
|
|
|
return false;
|
2017-06-29 21:45:10 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
static __always_inline __must_check
|
|
|
|
size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
|
|
|
|
{
|
2022-06-06 18:42:59 -04:00
|
|
|
if (check_copy_size(addr, bytes, false))
|
2017-06-29 21:45:10 -04:00
|
|
|
return _copy_from_iter_nocache(addr, bytes, i);
|
2022-06-06 18:42:59 -04:00
|
|
|
return 0;
|
2017-06-29 21:45:10 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
static __always_inline __must_check
|
|
|
|
bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
|
|
|
|
{
|
2021-04-29 21:16:56 -04:00
|
|
|
size_t copied = copy_from_iter_nocache(addr, bytes, i);
|
|
|
|
if (likely(copied == bytes))
|
|
|
|
return true;
|
|
|
|
iov_iter_revert(i, copied);
|
|
|
|
return false;
|
2017-06-29 21:45:10 -04:00
|
|
|
}
|
|
|
|
|
2017-05-29 12:22:50 -07:00
|
|
|
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
|
|
|
|
/*
|
|
|
|
* Note, users like pmem that depend on the stricter semantics of
|
2021-12-15 09:45:05 +01:00
|
|
|
* _copy_from_iter_flushcache() than _copy_from_iter_nocache() must check for
|
2017-05-29 12:22:50 -07:00
|
|
|
* IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the
|
|
|
|
* destination is flushed from the cache on return.
|
|
|
|
*/
|
Merge branch 'uaccess-work.iov_iter' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull iov_iter hardening from Al Viro:
"This is the iov_iter/uaccess/hardening pile.
For one thing, it trims the inline part of copy_to_user/copy_from_user
to the minimum that *does* need to be inlined - object size checks,
basically. For another, it sanitizes the checks for iov_iter
primitives. There are 4 groups of checks: access_ok(), might_fault(),
object size and KASAN.
- access_ok() had been verified by whoever had set the iov_iter up.
However, that has happened in a function far away, so proving that
there's no path to actual copying bypassing those checks is hard
and proving that iov_iter has not been buggered in the meanwhile is
also not pleasant. So we want those redone in actual
copyin/copyout.
- might_fault() is better off consolidated - we know whether it needs
to be checked as soon as we enter iov_iter primitive and observe
the iov_iter flavour. No need to wait until the copyin/copyout. The
call chains are short enough to make sure we won't miss anything -
in fact, it's more robust that way, since there are cases where we
do e.g. forced fault-in before getting to copyin/copyout. It's not
quite what we need to check (in particular, combination of
iovec-backed and set_fs(KERNEL_DS) is almost certainly a bug, not a
cause to skip checks), but that's for later series. For now let's
keep might_fault().
- KASAN checks belong in copyin/copyout - at the same level where
other iov_iter flavours would've hit them in memcpy().
- object size checks should apply to *all* iov_iter flavours, not
just iovec-backed ones.
There are two groups of primitives - one gets the kernel object
described as pointer + size (copy_to_iter(), etc.) while another gets
it as page + offset + size (copy_page_to_iter(), etc.)
For the first group the checks are best done where we actually have a
chance to find the object size. In other words, those belong in inline
wrappers in uio.h, before calling into iov_iter.c. Same kind as we
have for inlined part of copy_to_user().
For the second group there is no object to look at - offset in page is
just a number, it bears no type information. So we do them in the
common helper called by iov_iter.c primitives of that kind. All it
currently does is checking that we are not trying to access outside of
the compound page; eventually we might want to add some sanity checks
on the page involved.
So the things we need in copyin/copyout part of iov_iter.c do not
quite match anything in uaccess.h (we want no zeroing, we *do* want
access_ok() and KASAN and we want no might_fault() or object size
checks done on that level). OTOH, these needs are simple enough to
provide a couple of helpers (static in iov_iter.c) doing just what we
need..."
* 'uaccess-work.iov_iter' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
iov_iter: saner checks on copyin/copyout
iov_iter: sanity checks for copy to/from page primitives
iov_iter/hardening: move object size checks to inlined part
copy_{to,from}_user(): consolidate object size checks
copy_{from,to}_user(): move kasan checks and might_fault() out-of-line
2017-07-07 20:39:20 -07:00
|
|
|
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
|
2017-05-29 12:22:50 -07:00
|
|
|
#else
|
Merge branch 'uaccess-work.iov_iter' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull iov_iter hardening from Al Viro:
"This is the iov_iter/uaccess/hardening pile.
For one thing, it trims the inline part of copy_to_user/copy_from_user
to the minimum that *does* need to be inlined - object size checks,
basically. For another, it sanitizes the checks for iov_iter
primitives. There are 4 groups of checks: access_ok(), might_fault(),
object size and KASAN.
- access_ok() had been verified by whoever had set the iov_iter up.
However, that has happened in a function far away, so proving that
there's no path to actual copying bypassing those checks is hard
and proving that iov_iter has not been buggered in the meanwhile is
also not pleasant. So we want those redone in actual
copyin/copyout.
- might_fault() is better off consolidated - we know whether it needs
to be checked as soon as we enter iov_iter primitive and observe
the iov_iter flavour. No need to wait until the copyin/copyout. The
call chains are short enough to make sure we won't miss anything -
in fact, it's more robust that way, since there are cases where we
do e.g. forced fault-in before getting to copyin/copyout. It's not
quite what we need to check (in particular, combination of
iovec-backed and set_fs(KERNEL_DS) is almost certainly a bug, not a
cause to skip checks), but that's for later series. For now let's
keep might_fault().
- KASAN checks belong in copyin/copyout - at the same level where
other iov_iter flavours would've hit them in memcpy().
- object size checks should apply to *all* iov_iter flavours, not
just iovec-backed ones.
There are two groups of primitives - one gets the kernel object
described as pointer + size (copy_to_iter(), etc.) while another gets
it as page + offset + size (copy_page_to_iter(), etc.)
For the first group the checks are best done where we actually have a
chance to find the object size. In other words, those belong in inline
wrappers in uio.h, before calling into iov_iter.c. Same kind as we
have for inlined part of copy_to_user().
For the second group there is no object to look at - offset in page is
just a number, it bears no type information. So we do them in the
common helper called by iov_iter.c primitives of that kind. All it
currently does is check that we are not trying to access outside of
the compound page; eventually we might want to add some sanity checks
on the page involved.
So the things we need in copyin/copyout part of iov_iter.c do not
quite match anything in uaccess.h (we want no zeroing, we *do* want
access_ok() and KASAN and we want no might_fault() or object size
checks done on that level). OTOH, these needs are simple enough to
provide a couple of helpers (static in iov_iter.c) doing just what we
need..."
* 'uaccess-work.iov_iter' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
iov_iter: saner checks on copyin/copyout
iov_iter: sanity checks for copy to/from page primitives
iov_iter/hardening: move object size checks to inlined part
copy_{to,from}_user(): consolidate object size checks
copy_{from,to}_user(): move kasan checks and might_fault() out-of-line
2017-07-07 20:39:20 -07:00
|
|
|
#define _copy_from_iter_flushcache _copy_from_iter_nocache
|
|
|
|
#endif
|
|
|
|
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-05 20:40:16 -07:00
|
|
|
#ifdef CONFIG_ARCH_HAS_COPY_MC
|
|
|
|
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
|
2018-05-03 17:06:31 -07:00
|
|
|
#else
|
x86, powerpc: Rename memcpy_mcsafe() to copy_mc_to_{user, kernel}()
In reaction to a proposal to introduce a memcpy_mcsafe_fast()
implementation Linus points out that memcpy_mcsafe() is poorly named
relative to communicating the scope of the interface. Specifically what
addresses are valid to pass as source, destination, and what faults /
exceptions are handled.
Of particular concern is that even though x86 might be able to handle
the semantics of copy_mc_to_user() with its common copy_user_generic()
implementation other archs likely need / want an explicit path for this
case:
On Fri, May 1, 2020 at 11:28 AM Linus Torvalds <torvalds@linux-foundation.org> wrote:
>
> On Thu, Apr 30, 2020 at 6:21 PM Dan Williams <dan.j.williams@intel.com> wrote:
> >
> > However now I see that copy_user_generic() works for the wrong reason.
> > It works because the exception on the source address due to poison
> > looks no different than a write fault on the user address to the
> > caller, it's still just a short copy. So it makes copy_to_user() work
> > for the wrong reason relative to the name.
>
> Right.
>
> And it won't work that way on other architectures. On x86, we have a
> generic function that can take faults on either side, and we use it
> for both cases (and for the "in_user" case too), but that's an
> artifact of the architecture oddity.
>
> In fact, it's probably wrong even on x86 - because it can hide bugs -
> but writing those things is painful enough that everybody prefers
> having just one function.
Replace a single top-level memcpy_mcsafe() with either
copy_mc_to_user(), or copy_mc_to_kernel().
Introduce an x86 copy_mc_fragile() name as the rename for the
low-level x86 implementation formerly named memcpy_mcsafe(). It is used
as the slow / careful backend that is supplanted by a fast
copy_mc_generic() in a follow-on patch.
One side-effect of this reorganization is that separating copy_mc_64.S
to its own file means that perf no longer needs to track dependencies
for its memcpy_64.S benchmarks.
[ bp: Massage a bit. ]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Cc: <stable@vger.kernel.org>
Link: http://lore.kernel.org/r/CAHk-=wjSqtXAqfUJxFtWNwmguFASTgB0dz1dT3V-78Quiezqbg@mail.gmail.com
Link: https://lkml.kernel.org/r/160195561680.2163339.11574962055305783722.stgit@dwillia2-desk3.amr.corp.intel.com
2020-10-05 20:40:16 -07:00
|
|
|
#define _copy_mc_to_iter _copy_to_iter
|
2018-05-03 17:06:31 -07:00
|
|
|
#endif
|
|
|
|
|
2014-08-01 09:27:22 -04:00
|
|
|
/*
 * Defined outside this header.
 * NOTE(review): judging by the name, this zero-fills up to @bytes of the
 * iterator @i and returns how much was done - confirm against the definition.
 */
size_t iov_iter_zero(size_t bytes, struct iov_iter *i);
|
2022-06-10 12:58:27 -07:00
|
|
|
/*
 * Defined outside this header.
 * NOTE(review): presumably reports whether the iterator's segments satisfy
 * the address mask @addr_mask and length mask @len_mask - confirm against
 * the definition.
 */
bool iov_iter_is_aligned(const struct iov_iter *i, unsigned int addr_mask,
			 unsigned int len_mask);
|
2014-03-05 13:50:45 -05:00
|
|
|
unsigned long iov_iter_alignment(const struct iov_iter *i);
|
2016-04-08 19:05:19 -04:00
|
|
|
unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
|
2018-10-20 00:57:56 +01:00
|
|
|
void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov,
|
2014-03-05 19:28:09 -05:00
|
|
|
unsigned long nr_segs, size_t count);
|
2018-10-20 00:57:56 +01:00
|
|
|
void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec,
|
2015-01-23 01:08:07 -05:00
|
|
|
unsigned long nr_segs, size_t count);
|
2018-10-20 00:57:56 +01:00
|
|
|
void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec,
|
2014-11-24 14:46:11 -05:00
|
|
|
unsigned long nr_segs, size_t count);
|
2018-10-20 00:57:56 +01:00
|
|
|
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
|
mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios
Define a data structure, struct folio_queue, to represent a sequence of
folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a
list of folio_queue structures to be used to provide a buffer to
iov_iter-taking functions, such as sendmsg and recvmsg.
The folio_queue structure looks like:
struct folio_queue {
struct folio_batch vec;
u8 orders[PAGEVEC_SIZE];
struct folio_queue *next;
struct folio_queue *prev;
unsigned long marks;
unsigned long marks2;
};
It does not use a list_head so that next and/or prev can be set to NULL at
the ends of the list, allowing iov_iter-handling routines to determine that
they *are* the ends without needing to store a head pointer in the iov_iter
struct.
A folio_batch struct is used to hold the folio pointers which allows the
batch to be passed to batch handling functions. Two mark bits are
available per slot. The intention is to use at least one of them to mark
folios that need putting, but that might not be ultimately necessary.
Accessor functions are used to access the slots to do the masking and an
additional accessor function is used to indicate the size of the array.
The order of each folio is also stored in the structure to avoid the need
for iov_iter_advance() and iov_iter_revert() to have to query each folio to
find its size.
With careful barriering, this can be used as an extending buffer with new
folios inserted and new folio_queue structs added without the need for a
lock. Further, provided we always keep at least one struct in the buffer,
we can also remove consumed folios and consumed structs from the head end
as we go, without the need for locks.
[Questions/thoughts]
(1) To manage this, I need a head pointer, a tail pointer, a tail slot
number (assuming insertion happens at the tail end and the next
pointers point from head to tail). Should I put these into a struct
of their own, say "folio_queue_head" or "rolling_buffer"?
I will end up with two of these in netfs_io_request eventually, one
keeping track of the pagecache I'm dealing with for buffered I/O and
the other to hold a bounce buffer when we need one.
(2) Should I make the slots {folio,off,len} or bio_vec?
(3) This is intended to replace ITER_XARRAY eventually. Using an xarray
in I/O iteration requires the taking of the RCU read lock, doing
copying under the RCU read lock, walking the xarray (which may change
under us), handling retries and dealing with special values.
The advantage of ITER_XARRAY is that when we're dealing with the
pagecache directly, we don't need any allocation - but if we're doing
encrypted comms, there's a good chance we'd be using a bounce buffer
anyway.
This will require afs, erofs, cifs, orangefs and fscache to be
converted to not use this. afs still uses it for dirs and symlinks;
some of erofs usages should be easy to change, but there's one which
won't be so easy; ceph's use via fscache can be fixed by porting ceph
to netfslib; cifs is using xarray as a bounce buffer - that can be
moved to use sheaves instead; and orangefs has a similar problem to
erofs - maybe orangefs could use netfslib?
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: Steve French <sfrench@samba.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Gao Xiang <xiang@kernel.org>
cc: Mike Marshall <hubcap@omnibond.com>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
cc: linux-afs@lists.infradead.org
cc: linux-cifs@vger.kernel.org
cc: ceph-devel@vger.kernel.org
cc: linux-erofs@lists.ozlabs.org
cc: devel@lists.orangefs.org
Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2
Signed-off-by: Christian Brauner <brauner@kernel.org>
2024-06-19 00:20:42 +01:00
|
|
|
void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
|
|
|
|
const struct folio_queue *folioq,
|
|
|
|
unsigned int first_slot, unsigned int offset, size_t count);
|
2020-02-10 10:00:21 +00:00
|
|
|
void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
|
|
|
|
loff_t start, size_t count);
|
2022-06-10 13:05:12 -04:00
|
|
|
/*
 * Defined outside this header.
 * NOTE(review): presumably fills @pages with up to @maxpages pages covering
 * at most @maxsize bytes of the iterator and reports the offset into the
 * first page through @start - confirm against the definition.
 */
ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
			    size_t maxsize, unsigned int maxpages,
			    size_t *start);
|
2022-06-10 13:05:12 -04:00
|
|
|
ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages,
|
2014-03-21 04:58:33 -04:00
|
|
|
size_t maxsize, size_t *start);
|
2014-03-19 01:16:16 -04:00
|
|
|
int iov_iter_npages(const struct iov_iter *i, int maxpages);
|
2021-09-10 11:18:36 -06:00
|
|
|
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state);
|
2013-11-27 16:29:46 -08:00
|
|
|
|
2015-01-31 20:08:47 -05:00
|
|
|
const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);
|
|
|
|
|
2016-10-10 13:57:37 -04:00
|
|
|
static inline size_t iov_iter_count(const struct iov_iter *i)
|
2013-11-27 16:29:46 -08:00
|
|
|
{
|
|
|
|
return i->count;
|
|
|
|
}
|
|
|
|
|
2014-06-23 08:44:40 +01:00
|
|
|
/*
|
|
|
|
* Cap the iov_iter by given limit; note that the second argument is
|
|
|
|
* *not* the new size - it's upper limit for such. Passing it a value
|
|
|
|
* greater than the amount of data in iov_iter is fine - it'll just do
|
|
|
|
* nothing in that case.
|
|
|
|
*/
|
|
|
|
static inline void iov_iter_truncate(struct iov_iter *i, u64 count)
|
2014-03-22 06:51:37 -04:00
|
|
|
{
|
2014-06-23 08:44:40 +01:00
|
|
|
/*
|
|
|
|
* count doesn't have to fit in size_t - comparison extends both
|
|
|
|
* operands to u64 here and any value that would be truncated by
|
|
|
|
* conversion in assignement is by definition greater than all
|
|
|
|
* values of size_t, including old i->count.
|
|
|
|
*/
|
2021-09-10 11:19:58 -06:00
|
|
|
if (i->count > count)
|
2014-03-22 06:51:37 -04:00
|
|
|
i->count = count;
|
|
|
|
}
|
|
|
|
|
2014-04-04 12:15:19 -04:00
|
|
|
/*
|
|
|
|
* reexpand a previously truncated iterator; count must be no more than how much
|
|
|
|
* we had shrunk it.
|
|
|
|
*/
|
|
|
|
static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
|
|
|
|
{
|
|
|
|
i->count = count;
|
|
|
|
}
|
2021-02-03 14:29:52 -05:00
|
|
|
|
2022-02-02 14:20:31 -08:00
|
|
|
/*
 * Return iov_iter_npages(@i, @maxpages) computed as if the iterator held at
 * most @max_bytes of data.  If the iterator is longer than that it is
 * truncated for the duration of the call and re-expanded by the same amount
 * afterwards.
 */
static inline int
iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes)
{
	const size_t total = iov_iter_count(i);
	size_t excess = 0;
	int npages;

	/* Temporarily hide everything beyond the byte limit. */
	if (total > max_bytes) {
		excess = total - max_bytes;
		iov_iter_truncate(i, max_bytes);
	}

	npages = iov_iter_npages(i, maxpages);

	/* Give back whatever was trimmed off above. */
	if (excess)
		iov_iter_reexpand(i, iov_iter_count(i) + excess);

	return npages;
}
|
|
|
|
|
2020-09-25 06:51:40 +02:00
|
|
|
/*
 * User iovec import helpers; all three are defined outside this header.
 * NOTE(review): presumably iovec_from_user() copies a user iovec array into
 * kernel memory (using @fast_iov when it is big enough), while import_iovec()
 * and __import_iovec() additionally set up @i over the result, with @compat
 * selecting the compat iovec layout - confirm against the definitions.
 */
struct iovec *iovec_from_user(const struct iovec __user *uvector,
			      unsigned long nr_segs, unsigned long fast_segs,
			      struct iovec *fast_iov, bool compat);
ssize_t import_iovec(int type, const struct iovec __user *uvec,
		     unsigned int nr_segs, unsigned int fast_segs,
		     struct iovec **iovp, struct iov_iter *i);
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
		       unsigned int nr_segs, unsigned int fast_segs,
		       struct iovec **iovp, struct iov_iter *i, bool compat);
|
2023-01-05 11:07:30 -08:00
|
|
|
int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i);
|
saner iov_iter initialization primitives
iovec-backed iov_iter instances are assumed to satisfy several properties:
* no more than UIO_MAXIOV elements in iovec array
* total size of all ranges is no more than MAX_RW_COUNT
* all ranges pass access_ok().
The problem is, invariants of data structures should be established in the
primitives creating those data structures, not in the code using those
primitives. And iov_iter_init() violates that principle. For a while we
managed to get away with that, but once the use of iov_iter started to
spread, it didn't take long for shit to hit the fan - missed check in
sys_sendto() had introduced a roothole.
We _do_ have primitives for importing and validating iovecs (both native and
compat ones) and those primitives are almost always followed by shoving the
resulting iovec into iov_iter. Life would be considerably simpler (and safer)
if we combined those primitives with initializing iov_iter.
That gives us two new primitives - import_iovec() and compat_import_iovec().
Calling conventions:
iovec = iov_array;
err = import_iovec(direction, uvec, nr_segs,
ARRAY_SIZE(iov_array), &iovec,
&iter);
imports user vector into kernel space (into iov_array if it fits, allocated
if it doesn't fit or if iovec was NULL), validates it and sets iter up to
refer to it. On success 0 is returned and allocated kernel copy (or NULL
if the array had fit into caller-supplied one) is returned via iovec.
On failure all allocations are undone and -E... is returned. If the total
size of ranges exceeds MAX_RW_COUNT, the excess is silently truncated.
compat_import_iovec() expects uvec to be a pointer to user array of compat_iovec;
otherwise it's identical to import_iovec().
Finally, import_single_range() sets iov_iter backed by single-element iovec
covering a user-supplied range -
err = import_single_range(direction, address, size, iovec, &iter);
does validation and sets iter up. Again, size in excess of MAX_RW_COUNT gets
silently truncated.
Next commits will be switching the things up to use of those and reducing
the amount of iov_iter_init() instances.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-03-21 17:45:43 -04:00
|
|
|
|
2022-05-22 14:59:25 -04:00
|
|
|
static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
|
|
|
|
void __user *buf, size_t count)
|
|
|
|
{
|
|
|
|
WARN_ON(direction & ~(READ | WRITE));
|
|
|
|
*i = (struct iov_iter) {
|
|
|
|
.iter_type = ITER_UBUF,
|
|
|
|
.data_source = direction,
|
|
|
|
.ubuf = buf,
|
2023-03-28 14:29:03 -06:00
|
|
|
.count = count,
|
|
|
|
.nr_segs = 1
|
2022-05-22 14:59:25 -04:00
|
|
|
};
|
|
|
|
}
|
2023-01-19 12:47:23 +00:00
|
|
|
/* Flags for iov_iter_get/extract_pages*() */
|
|
|
|
/* Allow P2PDMA on the extracted pages */
|
|
|
|
#define ITER_ALLOW_P2PDMA ((__force iov_iter_extraction_t)0x01)
|
|
|
|
|
2022-10-28 21:50:30 +01:00
|
|
|
ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages,
|
|
|
|
size_t maxsize, unsigned int maxpages,
|
|
|
|
iov_iter_extraction_t extraction_flags,
|
|
|
|
size_t *offset0);
|
|
|
|
|
|
|
|
/**
 * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained
 * @iter: The iterator
 *
 * Tell the caller whether the extraction function will pin the pages it
 * extracts from @iter, or whether retaining them is left to the caller.
 *
 * Return: %true if the pages will have a pin placed in them that the caller
 * must unpin.  This must be done for DMA/async DIO to force fork() to
 * forcibly copy a page for the child (the parent must retain the original
 * page).  %false if no measures are taken and it is up to the caller to
 * retain the pages.  The result is simply user_backed_iter(@iter).
 */
static inline bool iov_iter_extract_will_pin(const struct iov_iter *iter)
{
	const bool pins_pages = user_backed_iter(iter);

	return pins_pages;
}
|
2022-05-22 14:59:25 -04:00
|
|
|
|
2023-06-06 14:08:50 +01:00
|
|
|
struct sg_table;
|
|
|
|
ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t len,
|
|
|
|
struct sg_table *sgtable, unsigned int sg_max,
|
|
|
|
iov_iter_extraction_t extraction_flags);
|
|
|
|
|
2009-07-29 15:04:19 -07:00
|
|
|
#endif
|