mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-17 02:36:21 +00:00
ceph: add new mount option to enable sparse reads
Add a new mount option that has the client issue sparse reads instead of normal ones. The callers now preallocate a sparse extent buffer that the libceph receive code can populate and hand back after the operation completes. After a successful sparse read, we can't use the req->r_result value to determine the amount of data "read", so instead we set the received length to be from the end of the last extent in the buffer. Any interstitial holes will have been filled by the receive code. [ xiubli: fix a double free on req reported by Ilya ] Signed-off-by: Jeff Layton <jlayton@kernel.org> Reviewed-by: Xiubo Li <xiubli@redhat.com> Reviewed-and-tested-by: Luís Henriques <lhenriques@suse.de> Reviewed-by: Milind Changire <mchangir@redhat.com> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
This commit is contained in:
parent
f628d79997
commit
03bc06c7b0
@ -245,8 +245,10 @@ static void finish_netfs_read(struct ceph_osd_request *req)
|
|||||||
struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode);
|
struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode);
|
||||||
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
|
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
|
||||||
struct netfs_io_subrequest *subreq = req->r_priv;
|
struct netfs_io_subrequest *subreq = req->r_priv;
|
||||||
|
struct ceph_osd_req_op *op = &req->r_ops[0];
|
||||||
int num_pages;
|
int num_pages;
|
||||||
int err = req->r_result;
|
int err = req->r_result;
|
||||||
|
bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ);
|
||||||
|
|
||||||
ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
|
ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
|
||||||
req->r_end_latency, osd_data->length, err);
|
req->r_end_latency, osd_data->length, err);
|
||||||
@ -255,7 +257,9 @@ static void finish_netfs_read(struct ceph_osd_request *req)
|
|||||||
subreq->len, i_size_read(req->r_inode));
|
subreq->len, i_size_read(req->r_inode));
|
||||||
|
|
||||||
/* no object means success but no data */
|
/* no object means success but no data */
|
||||||
if (err == -ENOENT)
|
if (sparse && err >= 0)
|
||||||
|
err = ceph_sparse_ext_map_end(op);
|
||||||
|
else if (err == -ENOENT)
|
||||||
err = 0;
|
err = 0;
|
||||||
else if (err == -EBLOCKLISTED)
|
else if (err == -EBLOCKLISTED)
|
||||||
fsc->blocklisted = true;
|
fsc->blocklisted = true;
|
||||||
@ -338,6 +342,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
|
|||||||
size_t page_off;
|
size_t page_off;
|
||||||
int err = 0;
|
int err = 0;
|
||||||
u64 len = subreq->len;
|
u64 len = subreq->len;
|
||||||
|
bool sparse = ceph_test_mount_opt(fsc, SPARSEREAD);
|
||||||
|
|
||||||
if (ceph_inode_is_shutdown(inode)) {
|
if (ceph_inode_is_shutdown(inode)) {
|
||||||
err = -EIO;
|
err = -EIO;
|
||||||
@ -348,7 +353,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
|
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
|
||||||
0, 1, CEPH_OSD_OP_READ,
|
0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
|
||||||
CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
|
CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
|
||||||
NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
|
NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
|
||||||
if (IS_ERR(req)) {
|
if (IS_ERR(req)) {
|
||||||
@ -357,6 +362,12 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
|
|||||||
goto out;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (sparse) {
|
||||||
|
err = ceph_alloc_sparse_ext_map(&req->r_ops[0]);
|
||||||
|
if (err)
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
|
dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
|
||||||
iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
|
iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
|
||||||
err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
|
err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
|
||||||
|
@ -936,6 +936,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
|
|||||||
u64 off = iocb->ki_pos;
|
u64 off = iocb->ki_pos;
|
||||||
u64 len = iov_iter_count(to);
|
u64 len = iov_iter_count(to);
|
||||||
u64 i_size = i_size_read(inode);
|
u64 i_size = i_size_read(inode);
|
||||||
|
bool sparse = ceph_test_mount_opt(fsc, SPARSEREAD);
|
||||||
|
|
||||||
dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
|
dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
|
||||||
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
|
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
|
||||||
@ -962,10 +963,12 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
|
|||||||
bool more;
|
bool more;
|
||||||
int idx;
|
int idx;
|
||||||
size_t left;
|
size_t left;
|
||||||
|
struct ceph_osd_req_op *op;
|
||||||
|
|
||||||
req = ceph_osdc_new_request(osdc, &ci->i_layout,
|
req = ceph_osdc_new_request(osdc, &ci->i_layout,
|
||||||
ci->i_vino, off, &len, 0, 1,
|
ci->i_vino, off, &len, 0, 1,
|
||||||
CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
|
sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
|
||||||
|
CEPH_OSD_FLAG_READ,
|
||||||
NULL, ci->i_truncate_seq,
|
NULL, ci->i_truncate_seq,
|
||||||
ci->i_truncate_size, false);
|
ci->i_truncate_size, false);
|
||||||
if (IS_ERR(req)) {
|
if (IS_ERR(req)) {
|
||||||
@ -986,6 +989,16 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
|
|||||||
|
|
||||||
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
|
osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
|
||||||
false, false);
|
false, false);
|
||||||
|
|
||||||
|
op = &req->r_ops[0];
|
||||||
|
if (sparse) {
|
||||||
|
ret = ceph_alloc_sparse_ext_map(op);
|
||||||
|
if (ret) {
|
||||||
|
ceph_osdc_put_request(req);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
ceph_osdc_start_request(osdc, req);
|
ceph_osdc_start_request(osdc, req);
|
||||||
ret = ceph_osdc_wait_request(osdc, req);
|
ret = ceph_osdc_wait_request(osdc, req);
|
||||||
|
|
||||||
@ -994,19 +1007,24 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
|
|||||||
req->r_end_latency,
|
req->r_end_latency,
|
||||||
len, ret);
|
len, ret);
|
||||||
|
|
||||||
ceph_osdc_put_request(req);
|
|
||||||
|
|
||||||
i_size = i_size_read(inode);
|
i_size = i_size_read(inode);
|
||||||
dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
|
dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
|
||||||
off, len, ret, i_size, (more ? " MORE" : ""));
|
off, len, ret, i_size, (more ? " MORE" : ""));
|
||||||
|
|
||||||
if (ret == -ENOENT)
|
/* Fix it to go to end of extent map */
|
||||||
|
if (sparse && ret >= 0)
|
||||||
|
ret = ceph_sparse_ext_map_end(op);
|
||||||
|
else if (ret == -ENOENT)
|
||||||
ret = 0;
|
ret = 0;
|
||||||
|
|
||||||
|
ceph_osdc_put_request(req);
|
||||||
|
|
||||||
if (ret >= 0 && ret < len && (off + ret < i_size)) {
|
if (ret >= 0 && ret < len && (off + ret < i_size)) {
|
||||||
int zlen = min(len - ret, i_size - off - ret);
|
int zlen = min(len - ret, i_size - off - ret);
|
||||||
int zoff = page_off + ret;
|
int zoff = page_off + ret;
|
||||||
|
|
||||||
dout("sync_read zero gap %llu~%llu\n",
|
dout("sync_read zero gap %llu~%llu\n",
|
||||||
off + ret, off + ret + zlen);
|
off + ret, off + ret + zlen);
|
||||||
ceph_zero_page_vector_range(zoff, zlen, pages);
|
ceph_zero_page_vector_range(zoff, zlen, pages);
|
||||||
ret += zlen;
|
ret += zlen;
|
||||||
}
|
}
|
||||||
@ -1125,8 +1143,10 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
|
|||||||
struct inode *inode = req->r_inode;
|
struct inode *inode = req->r_inode;
|
||||||
struct ceph_aio_request *aio_req = req->r_priv;
|
struct ceph_aio_request *aio_req = req->r_priv;
|
||||||
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
|
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
|
||||||
|
struct ceph_osd_req_op *op = &req->r_ops[0];
|
||||||
struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
|
struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
|
||||||
unsigned int len = osd_data->bvec_pos.iter.bi_size;
|
unsigned int len = osd_data->bvec_pos.iter.bi_size;
|
||||||
|
bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ);
|
||||||
|
|
||||||
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
|
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
|
||||||
BUG_ON(!osd_data->num_bvecs);
|
BUG_ON(!osd_data->num_bvecs);
|
||||||
@ -1147,6 +1167,8 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
|
|||||||
}
|
}
|
||||||
rc = -ENOMEM;
|
rc = -ENOMEM;
|
||||||
} else if (!aio_req->write) {
|
} else if (!aio_req->write) {
|
||||||
|
if (sparse && rc >= 0)
|
||||||
|
rc = ceph_sparse_ext_map_end(op);
|
||||||
if (rc == -ENOENT)
|
if (rc == -ENOENT)
|
||||||
rc = 0;
|
rc = 0;
|
||||||
if (rc >= 0 && len > rc) {
|
if (rc >= 0 && len > rc) {
|
||||||
@ -1283,6 +1305,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
|
|||||||
loff_t pos = iocb->ki_pos;
|
loff_t pos = iocb->ki_pos;
|
||||||
bool write = iov_iter_rw(iter) == WRITE;
|
bool write = iov_iter_rw(iter) == WRITE;
|
||||||
bool should_dirty = !write && user_backed_iter(iter);
|
bool should_dirty = !write && user_backed_iter(iter);
|
||||||
|
bool sparse = ceph_test_mount_opt(fsc, SPARSEREAD);
|
||||||
|
|
||||||
if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
|
if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
|
||||||
return -EROFS;
|
return -EROFS;
|
||||||
@ -1310,6 +1333,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
|
|||||||
while (iov_iter_count(iter) > 0) {
|
while (iov_iter_count(iter) > 0) {
|
||||||
u64 size = iov_iter_count(iter);
|
u64 size = iov_iter_count(iter);
|
||||||
ssize_t len;
|
ssize_t len;
|
||||||
|
struct ceph_osd_req_op *op;
|
||||||
|
int readop = sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ;
|
||||||
|
|
||||||
if (write)
|
if (write)
|
||||||
size = min_t(u64, size, fsc->mount_options->wsize);
|
size = min_t(u64, size, fsc->mount_options->wsize);
|
||||||
@ -1320,8 +1345,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
|
|||||||
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
|
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
|
||||||
vino, pos, &size, 0,
|
vino, pos, &size, 0,
|
||||||
1,
|
1,
|
||||||
write ? CEPH_OSD_OP_WRITE :
|
write ? CEPH_OSD_OP_WRITE : readop,
|
||||||
CEPH_OSD_OP_READ,
|
|
||||||
flags, snapc,
|
flags, snapc,
|
||||||
ci->i_truncate_seq,
|
ci->i_truncate_seq,
|
||||||
ci->i_truncate_size,
|
ci->i_truncate_size,
|
||||||
@ -1372,6 +1396,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
|
|||||||
}
|
}
|
||||||
|
|
||||||
osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
|
osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
|
||||||
|
op = &req->r_ops[0];
|
||||||
|
if (sparse) {
|
||||||
|
ret = ceph_alloc_sparse_ext_map(op);
|
||||||
|
if (ret) {
|
||||||
|
ceph_osdc_put_request(req);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (aio_req) {
|
if (aio_req) {
|
||||||
aio_req->total_len += len;
|
aio_req->total_len += len;
|
||||||
@ -1399,8 +1431,11 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
|
|||||||
|
|
||||||
size = i_size_read(inode);
|
size = i_size_read(inode);
|
||||||
if (!write) {
|
if (!write) {
|
||||||
if (ret == -ENOENT)
|
if (sparse && ret >= 0)
|
||||||
|
ret = ceph_sparse_ext_map_end(op);
|
||||||
|
else if (ret == -ENOENT)
|
||||||
ret = 0;
|
ret = 0;
|
||||||
|
|
||||||
if (ret >= 0 && ret < len && pos + ret < size) {
|
if (ret >= 0 && ret < len && pos + ret < size) {
|
||||||
struct iov_iter i;
|
struct iov_iter i;
|
||||||
int zlen = min_t(size_t, len - ret,
|
int zlen = min_t(size_t, len - ret,
|
||||||
|
@ -165,6 +165,7 @@ enum {
|
|||||||
Opt_copyfrom,
|
Opt_copyfrom,
|
||||||
Opt_wsync,
|
Opt_wsync,
|
||||||
Opt_pagecache,
|
Opt_pagecache,
|
||||||
|
Opt_sparseread,
|
||||||
};
|
};
|
||||||
|
|
||||||
enum ceph_recover_session_mode {
|
enum ceph_recover_session_mode {
|
||||||
@ -207,6 +208,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = {
|
|||||||
fsparam_u32 ("wsize", Opt_wsize),
|
fsparam_u32 ("wsize", Opt_wsize),
|
||||||
fsparam_flag_no ("wsync", Opt_wsync),
|
fsparam_flag_no ("wsync", Opt_wsync),
|
||||||
fsparam_flag_no ("pagecache", Opt_pagecache),
|
fsparam_flag_no ("pagecache", Opt_pagecache),
|
||||||
|
fsparam_flag_no ("sparseread", Opt_sparseread),
|
||||||
{}
|
{}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -576,6 +578,12 @@ static int ceph_parse_mount_param(struct fs_context *fc,
|
|||||||
else
|
else
|
||||||
fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE;
|
fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE;
|
||||||
break;
|
break;
|
||||||
|
case Opt_sparseread:
|
||||||
|
if (result.negated)
|
||||||
|
fsopt->flags &= ~CEPH_MOUNT_OPT_SPARSEREAD;
|
||||||
|
else
|
||||||
|
fsopt->flags |= CEPH_MOUNT_OPT_SPARSEREAD;
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
BUG();
|
BUG();
|
||||||
}
|
}
|
||||||
@ -710,9 +718,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
|
|||||||
|
|
||||||
if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS))
|
if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS))
|
||||||
seq_puts(m, ",wsync");
|
seq_puts(m, ",wsync");
|
||||||
|
|
||||||
if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
|
if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
|
||||||
seq_puts(m, ",nopagecache");
|
seq_puts(m, ",nopagecache");
|
||||||
|
if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD)
|
||||||
|
seq_puts(m, ",sparseread");
|
||||||
|
|
||||||
if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
|
if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
|
||||||
seq_printf(m, ",wsize=%u", fsopt->wsize);
|
seq_printf(m, ",wsize=%u", fsopt->wsize);
|
||||||
@ -1296,6 +1305,11 @@ static int ceph_reconfigure_fc(struct fs_context *fc)
|
|||||||
else
|
else
|
||||||
ceph_clear_mount_opt(fsc, ASYNC_DIROPS);
|
ceph_clear_mount_opt(fsc, ASYNC_DIROPS);
|
||||||
|
|
||||||
|
if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD)
|
||||||
|
ceph_set_mount_opt(fsc, SPARSEREAD);
|
||||||
|
else
|
||||||
|
ceph_clear_mount_opt(fsc, SPARSEREAD);
|
||||||
|
|
||||||
if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) {
|
if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) {
|
||||||
kfree(fsc->mount_options->mon_addr);
|
kfree(fsc->mount_options->mon_addr);
|
||||||
fsc->mount_options->mon_addr = fsopt->mon_addr;
|
fsc->mount_options->mon_addr = fsopt->mon_addr;
|
||||||
|
@ -42,6 +42,7 @@
|
|||||||
#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */
|
#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */
|
||||||
#define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */
|
#define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */
|
||||||
#define CEPH_MOUNT_OPT_NOPAGECACHE (1<<16) /* bypass pagecache altogether */
|
#define CEPH_MOUNT_OPT_NOPAGECACHE (1<<16) /* bypass pagecache altogether */
|
||||||
|
#define CEPH_MOUNT_OPT_SPARSEREAD (1<<17) /* always do sparse reads */
|
||||||
|
|
||||||
#define CEPH_MOUNT_OPT_DEFAULT \
|
#define CEPH_MOUNT_OPT_DEFAULT \
|
||||||
(CEPH_MOUNT_OPT_DCACHE | \
|
(CEPH_MOUNT_OPT_DCACHE | \
|
||||||
|
Loading…
x
Reference in New Issue
Block a user