diff --git a/Documentation/filesystems/ceph.rst b/Documentation/filesystems/ceph.rst index 76ce938e7024..085f309ece60 100644 --- a/Documentation/filesystems/ceph.rst +++ b/Documentation/filesystems/ceph.rst @@ -57,6 +57,16 @@ a snapshot on any subdirectory (and its nested contents) in the system. Snapshot creation and deletion are as simple as 'mkdir .snap/foo' and 'rmdir .snap/foo'. +Snapshot names have two limitations: + +* They can not start with an underscore ('_'), as these names are reserved + for internal usage by the MDS. +* They can not exceed 240 characters in size. This is because the MDS makes + use of long snapshot names internally, which follow the format: + `__`. Since filenames in general can't have + more than 255 characters, and `` takes 13 characters, the long + snapshot names can take as much as 255 - 1 - 1 - 13 = 240. + Ceph also provides some recursive accounting on directories for nested files and bytes. That is, a 'getfattr -d foo' on any directory in the system will reveal the total number of nested regular files and diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2328cc05be36..3de11f077144 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -7199,7 +7199,6 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) static ssize_t do_rbd_remove(const char *buf, size_t count) { struct rbd_device *rbd_dev = NULL; - struct list_head *tmp; int dev_id; char opt_buf[6]; bool force = false; @@ -7226,8 +7225,7 @@ static ssize_t do_rbd_remove(const char *buf, size_t count) ret = -ENOENT; spin_lock(&rbd_dev_list_lock); - list_for_each(tmp, &rbd_dev_list) { - rbd_dev = list_entry(tmp, struct rbd_device, node); + list_for_each_entry(rbd_dev, &rbd_dev_list, node) { if (rbd_dev->dev_id == dev_id) { ret = 0; break; diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 50c635dc7f71..1f77ca04c426 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -12,3 +12,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ ceph-$(CONFIG_CEPH_FSCACHE) += cache.o ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o +ceph-$(CONFIG_FS_ENCRYPTION) += crypto.o diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index c91b293267d7..c53a1d220622 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -140,7 +140,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, newattrs.ia_ctime = current_time(inode); newattrs.ia_mode = new_mode; newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - ret = __ceph_setattr(inode, &newattrs); + ret = __ceph_setattr(inode, &newattrs, NULL); if (ret) goto out_free; } @@ -151,7 +151,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, newattrs.ia_ctime = old_ctime; newattrs.ia_mode = old_mode; newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - __ceph_setattr(inode, &newattrs); + __ceph_setattr(inode, &newattrs, NULL); } goto out_free; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 59cbfb80edbd..f4863078f7fe 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -18,6 +18,7 @@ #include "mds_client.h" #include "cache.h" #include "metric.h" +#include "crypto.h" #include #include @@ -242,11 +243,13 @@ static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq) static void finish_netfs_read(struct ceph_osd_request *req) { - struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode); + struct inode *inode = req->r_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); struct netfs_io_subrequest *subreq = req->r_priv; - int num_pages; + struct ceph_osd_req_op *op = &req->r_ops[0]; int err = req->r_result; + bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ); ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, osd_data->length, err); @@ -260,14 +263,29 @@ static void finish_netfs_read(struct ceph_osd_request *req) else if (err == -EBLOCKLISTED) fsc->blocklisted = true; - if (err >= 0 && err < subreq->len) - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (err >= 0) { + if (sparse && err > 0) + err = ceph_sparse_ext_map_end(op); + if (err < subreq->len) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (IS_ENCRYPTED(inode) && err > 0) { + err = ceph_fscrypt_decrypt_extents(inode, + osd_data->pages, subreq->start, + op->extent.sparse_ext, + op->extent.sparse_ext_cnt); + if (err > subreq->len) + err = subreq->len; + } + } + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { + ceph_put_page_vector(osd_data->pages, + calc_pages_for(osd_data->alignment, + osd_data->length), false); + } netfs_subreq_terminated(subreq, err, false); - - num_pages = calc_pages_for(osd_data->alignment, osd_data->length); - ceph_put_page_vector(osd_data->pages, num_pages, false); iput(req->r_inode); + ceph_dec_osd_stopping_blocker(fsc->mdsc); } static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) @@ -334,10 +352,10 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) struct ceph_osd_request *req = NULL; struct ceph_vino vino = ceph_vino(inode); struct iov_iter iter; - struct page **pages; - size_t page_off; int err = 0; u64 len = subreq->len; + bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); + u64 off = subreq->start; if (ceph_inode_is_shutdown(inode)) { err = -EIO; @@ -347,8 +365,10 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq)) return; - req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len, - 0, 1, CEPH_OSD_OP_READ, + ceph_fscrypt_adjust_off_and_len(inode, &off, &len); + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, + off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { @@ -357,20 +377,48 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) goto out; } - dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len); - iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); - err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off); - if (err < 0) { - dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err); - goto out; + if (sparse) { + err = ceph_alloc_sparse_ext_map(&req->r_ops[0]); + if (err) + goto out; } - /* should always give us a page-aligned read */ - WARN_ON_ONCE(page_off); - len = err; - err = 0; + dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len); - osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); + iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len); + + /* + * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for + * encrypted inodes. We'd need infrastructure that handles an iov_iter + * instead of page arrays, and we don't have that as of yet. Once the + * dust settles on the write helpers and encrypt/decrypt routines for + * netfs, we should be able to rework this. + */ + if (IS_ENCRYPTED(inode)) { + struct page **pages; + size_t page_off; + + err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off); + if (err < 0) { + dout("%s: iov_ter_get_pages_alloc returned %d\n", + __func__, err); + goto out; + } + + /* should always give us a page-aligned read */ + WARN_ON_ONCE(page_off); + len = err; + err = 0; + + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, + false); + } else { + osd_req_op_extent_osd_iter(req, 0, &iter); + } + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + err = -EIO; + goto out; + } req->r_callback = finish_netfs_read; req->r_priv = subreq; req->r_inode = inode; @@ -571,10 +619,12 @@ static u64 get_writepages_data_length(struct inode *inode, struct page *page, u64 start) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_snap_context *snapc = page_snap_context(page); + struct ceph_snap_context *snapc; struct ceph_cap_snap *capsnap = NULL; u64 end = i_size_read(inode); + u64 ret; + snapc = page_snap_context(ceph_fscrypt_pagecache_page(page)); if (snapc != ci->i_head_snapc) { bool found = false; spin_lock(&ci->i_ceph_lock); @@ -589,9 +639,12 @@ static u64 get_writepages_data_length(struct inode *inode, spin_unlock(&ci->i_ceph_lock); WARN_ON(!found); } - if (end > page_offset(page) + thp_size(page)) - end = page_offset(page) + thp_size(page); - return end > start ? end - start : 0; + if (end > ceph_fscrypt_page_offset(page) + thp_size(page)) + end = ceph_fscrypt_page_offset(page) + thp_size(page); + ret = end > start ? end - start : 0; + if (ret && fscrypt_is_bounce_page(page)) + ret = round_up(ret, CEPH_FSCRYPT_BLOCK_SIZE); + return ret; } /* @@ -610,10 +663,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) loff_t page_off = page_offset(page); int err; loff_t len = thp_size(page); + loff_t wlen; struct ceph_writeback_ctl ceph_wbc; struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_osd_request *req; bool caching = ceph_is_cache_enabled(inode); + struct page *bounce_page = NULL; dout("writepage %p idx %lu\n", page, page->index); @@ -649,31 +704,51 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (ceph_wbc.i_size < page_off + len) len = ceph_wbc.i_size - page_off; + wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len; dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n", - inode, page, page->index, page_off, len, snapc, snapc->seq); + inode, page, page->index, page_off, wlen, snapc, snapc->seq); if (atomic_long_inc_return(&fsc->writeback_count) > CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) fsc->write_congested = true; - req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1, - CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, - ceph_wbc.truncate_seq, ceph_wbc.truncate_size, - true); + req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), + page_off, &wlen, 0, 1, CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE, snapc, + ceph_wbc.truncate_seq, + ceph_wbc.truncate_size, true); if (IS_ERR(req)) { redirty_page_for_writepage(wbc, page); return PTR_ERR(req); } + if (wlen < len) + len = wlen; + set_page_writeback(page); if (caching) ceph_set_page_fscache(page); ceph_fscache_write_to_cache(inode, page_off, len, caching); + if (IS_ENCRYPTED(inode)) { + bounce_page = fscrypt_encrypt_pagecache_blocks(page, + CEPH_FSCRYPT_BLOCK_SIZE, 0, + GFP_NOFS); + if (IS_ERR(bounce_page)) { + redirty_page_for_writepage(wbc, page); + end_page_writeback(page); + ceph_osdc_put_request(req); + return PTR_ERR(bounce_page); + } + } + /* it may be a short write due to an object boundary */ WARN_ON_ONCE(len > thp_size(page)); - osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); - dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len); + osd_req_op_extent_osd_data_pages(req, 0, + bounce_page ? &bounce_page : &page, wlen, 0, + false, false); + dout("writepage %llu~%llu (%llu bytes, %sencrypted)\n", + page_off, len, wlen, IS_ENCRYPTED(inode) ? "" : "not "); req->r_mtime = inode->i_mtime; ceph_osdc_start_request(osdc, req); @@ -681,7 +756,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, err); - + fscrypt_free_bounce_page(bounce_page); ceph_osdc_put_request(req); if (err == 0) err = len; @@ -800,6 +875,11 @@ static void writepages_finish(struct ceph_osd_request *req) total_pages += num_pages; for (j = 0; j < num_pages; j++) { page = osd_data->pages[j]; + if (fscrypt_is_bounce_page(page)) { + page = fscrypt_pagecache_page(page); + fscrypt_free_bounce_page(osd_data->pages[j]); + osd_data->pages[j] = page; + } BUG_ON(!page); WARN_ON(!PageUptodate(page)); @@ -835,6 +915,7 @@ static void writepages_finish(struct ceph_osd_request *req) else kfree(osd_data->pages); ceph_osdc_put_request(req); + ceph_dec_osd_stopping_blocker(fsc->mdsc); } /* @@ -1070,9 +1151,28 @@ static int ceph_writepages_start(struct address_space *mapping, fsc->mount_options->congestion_kb)) fsc->write_congested = true; - pages[locked_pages++] = page; - fbatch.folios[i] = NULL; + if (IS_ENCRYPTED(inode)) { + pages[locked_pages] = + fscrypt_encrypt_pagecache_blocks(page, + PAGE_SIZE, 0, + locked_pages ? GFP_NOWAIT : GFP_NOFS); + if (IS_ERR(pages[locked_pages])) { + if (PTR_ERR(pages[locked_pages]) == -EINVAL) + pr_err("%s: inode->i_blkbits=%hhu\n", + __func__, inode->i_blkbits); + /* better not fail on first page! */ + BUG_ON(locked_pages == 0); + pages[locked_pages] = NULL; + redirty_page_for_writepage(wbc, page); + unlock_page(page); + break; + } + ++locked_pages; + } else { + pages[locked_pages++] = page; + } + fbatch.folios[i] = NULL; len += thp_size(page); } @@ -1100,7 +1200,7 @@ static int ceph_writepages_start(struct address_space *mapping, } new_request: - offset = page_offset(pages[0]); + offset = ceph_fscrypt_page_offset(pages[0]); len = wsize; req = ceph_osdc_new_request(&fsc->client->osdc, @@ -1121,9 +1221,13 @@ static int ceph_writepages_start(struct address_space *mapping, ceph_wbc.truncate_size, true); BUG_ON(IS_ERR(req)); } - BUG_ON(len < page_offset(pages[locked_pages - 1]) + - thp_size(page) - offset); + BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - 1]) + + thp_size(pages[locked_pages - 1]) - offset); + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + rc = -EIO; + goto release_folios; + } req->r_callback = writepages_finish; req->r_inode = inode; @@ -1132,7 +1236,9 @@ static int ceph_writepages_start(struct address_space *mapping, data_pages = pages; op_idx = 0; for (i = 0; i < locked_pages; i++) { - u64 cur_offset = page_offset(pages[i]); + struct page *page = ceph_fscrypt_pagecache_page(pages[i]); + + u64 cur_offset = page_offset(page); /* * Discontinuity in page range? Ceph can handle that by just passing * multiple extents in the write op. @@ -1161,9 +1267,9 @@ static int ceph_writepages_start(struct address_space *mapping, op_idx++; } - set_page_writeback(pages[i]); + set_page_writeback(page); if (caching) - ceph_set_page_fscache(pages[i]); + ceph_set_page_fscache(page); len += thp_size(page); } ceph_fscache_write_to_cache(inode, offset, len, caching); @@ -1179,8 +1285,16 @@ static int ceph_writepages_start(struct address_space *mapping, offset); len = max(len, min_len); } + if (IS_ENCRYPTED(inode)) + len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE); + dout("writepages got pages at %llu~%llu\n", offset, len); + if (IS_ENCRYPTED(inode) && + ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) + pr_warn("%s: bad encrypted write offset=%lld len=%llu\n", + __func__, offset, len); + osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, 0, from_pool, false); osd_req_op_extent_update(req, op_idx, len); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 09cd6d334604..14215ec646f7 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -14,6 +14,7 @@ #include "super.h" #include "mds_client.h" #include "cache.h" +#include "crypto.h" #include #include @@ -1216,15 +1217,11 @@ struct cap_msg_args { umode_t mode; bool inline_data; bool wake; + bool encrypted; + u32 fscrypt_auth_len; + u8 fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context }; -/* - * cap struct size + flock buffer size + inline version + inline data size + - * osd_epoch_barrier + oldest_flush_tid - */ -#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \ - 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4) - /* Marshal up the cap msg to the MDS */ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) { @@ -1240,7 +1237,7 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) arg->size, arg->max_size, arg->xattr_version, arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0); - msg->hdr.version = cpu_to_le16(10); + msg->hdr.version = cpu_to_le16(12); msg->hdr.tid = cpu_to_le64(arg->flush_tid); fc = msg->front.iov_base; @@ -1257,7 +1254,13 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) fc->ino = cpu_to_le64(arg->ino); fc->snap_follows = cpu_to_le64(arg->follows); - fc->size = cpu_to_le64(arg->size); +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (arg->encrypted) + fc->size = cpu_to_le64(round_up(arg->size, + CEPH_FSCRYPT_BLOCK_SIZE)); + else +#endif + fc->size = cpu_to_le64(arg->size); fc->max_size = cpu_to_le64(arg->max_size); ceph_encode_timespec64(&fc->mtime, &arg->mtime); ceph_encode_timespec64(&fc->atime, &arg->atime); @@ -1311,6 +1314,27 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) /* Advisory flags (version 10) */ ceph_encode_32(&p, arg->flags); + + /* dirstats (version 11) - these are r/o on the client */ + ceph_encode_64(&p, 0); + ceph_encode_64(&p, 0); + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + /* + * fscrypt_auth and fscrypt_file (version 12) + * + * fscrypt_auth holds the crypto context (if any). fscrypt_file + * tracks the real i_size as an __le64 field (and we use a rounded-up + * i_size in the traditional size field). + */ + ceph_encode_32(&p, arg->fscrypt_auth_len); + ceph_encode_copy(&p, arg->fscrypt_auth, arg->fscrypt_auth_len); + ceph_encode_32(&p, sizeof(__le64)); + ceph_encode_64(&p, arg->size); +#else /* CONFIG_FS_ENCRYPTION */ + ceph_encode_32(&p, 0); + ceph_encode_32(&p, 0); +#endif /* CONFIG_FS_ENCRYPTION */ } /* @@ -1378,7 +1402,6 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, arg->follows = flushing ? ci->i_head_snapc->seq : 0; arg->flush_tid = flush_tid; arg->oldest_flush_tid = oldest_flush_tid; - arg->size = i_size_read(inode); ci->i_reported_size = arg->size; arg->max_size = ci->i_wanted_max_size; @@ -1432,8 +1455,39 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, } } arg->flags = flags; + arg->encrypted = IS_ENCRYPTED(inode); +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (ci->fscrypt_auth_len && + WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) { + /* Don't set this if it's too big */ + arg->fscrypt_auth_len = 0; + } else { + arg->fscrypt_auth_len = ci->fscrypt_auth_len; + memcpy(arg->fscrypt_auth, ci->fscrypt_auth, + min_t(size_t, ci->fscrypt_auth_len, + sizeof(arg->fscrypt_auth))); + } +#endif /* CONFIG_FS_ENCRYPTION */ } +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4 + 8) + +static inline int cap_msg_size(struct cap_msg_args *arg) +{ + return CAP_MSG_FIXED_FIELDS + arg->fscrypt_auth_len; +} +#else +#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4) + +static inline int cap_msg_size(struct cap_msg_args *arg) +{ + return CAP_MSG_FIXED_FIELDS; +} +#endif /* CONFIG_FS_ENCRYPTION */ + /* * Send a cap msg on the given inode. * @@ -1444,7 +1498,8 @@ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) struct ceph_msg *msg; struct inode *inode = &ci->netfs.inode; - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS, + false); if (!msg) { pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n", ceph_vinop(inode), ceph_cap_string(arg->dirty), @@ -1470,10 +1525,6 @@ static inline int __send_flush_snap(struct inode *inode, struct cap_msg_args arg; struct ceph_msg *msg; - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); - if (!msg) - return -ENOMEM; - arg.session = session; arg.ino = ceph_vino(inode).ino; arg.cid = 0; @@ -1510,6 +1561,15 @@ static inline int __send_flush_snap(struct inode *inode, arg.inline_data = capsnap->inline_data; arg.flags = 0; arg.wake = false; + arg.encrypted = IS_ENCRYPTED(inode); + + /* No fscrypt_auth changes from a capsnap.*/ + arg.fscrypt_auth_len = 0; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(&arg), + GFP_NOFS, false); + if (!msg) + return -ENOMEM; encode_cap_msg(msg, &arg); ceph_con_send(&arg.session->s_con, msg); @@ -2900,10 +2960,9 @@ int ceph_try_get_caps(struct inode *inode, int need, int want, * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. */ -int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got) +int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, + int want, loff_t endoff, int *got) { - struct ceph_file_info *fi = filp->private_data; - struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); int ret, _got, flags; @@ -2912,7 +2971,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got if (ret < 0) return ret; - if ((fi->fmode & CEPH_FILE_MODE_WR) && + if (fi && (fi->fmode & CEPH_FILE_MODE_WR) && fi->filp_gen != READ_ONCE(fsc->filp_gen)) return -EBADF; @@ -2965,7 +3024,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got continue; } - if ((fi->fmode & CEPH_FILE_MODE_WR) && + if (fi && (fi->fmode & CEPH_FILE_MODE_WR) && fi->filp_gen != READ_ONCE(fsc->filp_gen)) { if (ret >= 0 && _got) ceph_put_cap_refs(ci, _got); @@ -3028,6 +3087,15 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got return 0; } +int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, + int *got) +{ + struct ceph_file_info *fi = filp->private_data; + struct inode *inode = file_inode(filp); + + return __ceph_get_caps(inode, fi, need, want, endoff, got); +} + /* * Take cap refs. Caller must already know we hold at least one ref * on the caps in question or we don't know this is safe. @@ -3323,6 +3391,9 @@ struct cap_extra_info { /* currently issued */ int issued; struct timespec64 btime; + u8 *fscrypt_auth; + u32 fscrypt_auth_len; + u64 fscrypt_file_size; }; /* @@ -3355,6 +3426,14 @@ static void handle_cap_grant(struct inode *inode, bool deleted_inode = false; bool fill_inline = false; + /* + * If there is at least one crypto block then we'll trust + * fscrypt_file_size. If the real length of the file is 0, then + * ignore it (it has probably been truncated down to 0 by the MDS). + */ + if (IS_ENCRYPTED(inode) && size) + size = extra_info->fscrypt_file_size; + dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", inode, cap, session->s_mds, seq, ceph_cap_string(newcaps)); dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, @@ -3421,6 +3500,14 @@ static void handle_cap_grant(struct inode *inode, dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, from_kuid(&init_user_ns, inode->i_uid), from_kgid(&init_user_ns, inode->i_gid)); +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len || + memcmp(ci->fscrypt_auth, extra_info->fscrypt_auth, + ci->fscrypt_auth_len)) + pr_warn_ratelimited("%s: cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n", + __func__, ci->fscrypt_auth_len, + extra_info->fscrypt_auth_len); +#endif } if ((newcaps & CEPH_CAP_LINK_SHARED) && @@ -3837,7 +3924,8 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, */ static bool handle_cap_trunc(struct inode *inode, struct ceph_mds_caps *trunc, - struct ceph_mds_session *session) + struct ceph_mds_session *session, + struct cap_extra_info *extra_info) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; @@ -3854,8 +3942,16 @@ static bool handle_cap_trunc(struct inode *inode, issued |= implemented | dirty; - dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n", - inode, mds, seq, truncate_size, truncate_seq); + /* + * If there is at least one crypto block then we'll trust + * fscrypt_file_size. If the real length of the file is 0, then + * ignore it (it has probably been truncated down to 0 by the MDS). + */ + if (IS_ENCRYPTED(inode) && size) + size = extra_info->fscrypt_file_size; + + dout("%s inode %p mds%d seq %d to %lld truncate seq %d\n", + __func__, inode, mds, seq, truncate_size, truncate_seq); queue_trunc = ceph_fill_file_size(inode, issued, truncate_seq, truncate_size, size); return queue_trunc; @@ -4075,6 +4171,52 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, *target_cap = cap; } +#ifdef CONFIG_FS_ENCRYPTION +static int parse_fscrypt_fields(void **p, void *end, + struct cap_extra_info *extra) +{ + u32 len; + + ceph_decode_32_safe(p, end, extra->fscrypt_auth_len, bad); + if (extra->fscrypt_auth_len) { + ceph_decode_need(p, end, extra->fscrypt_auth_len, bad); + extra->fscrypt_auth = kmalloc(extra->fscrypt_auth_len, + GFP_KERNEL); + if (!extra->fscrypt_auth) + return -ENOMEM; + ceph_decode_copy_safe(p, end, extra->fscrypt_auth, + extra->fscrypt_auth_len, bad); + } + + ceph_decode_32_safe(p, end, len, bad); + if (len >= sizeof(u64)) { + ceph_decode_64_safe(p, end, extra->fscrypt_file_size, bad); + len -= sizeof(u64); + } + ceph_decode_skip_n(p, end, len, bad); + return 0; +bad: + return -EIO; +} +#else +static int parse_fscrypt_fields(void **p, void *end, + struct cap_extra_info *extra) +{ + u32 len; + + /* Don't care about these fields unless we're encryption-capable */ + ceph_decode_32_safe(p, end, len, bad); + if (len) + ceph_decode_skip_n(p, end, len, bad); + ceph_decode_32_safe(p, end, len, bad); + if (len) + ceph_decode_skip_n(p, end, len, bad); + return 0; +bad: + return -EIO; +} +#endif + /* * Handle a caps message from the MDS. * @@ -4105,6 +4247,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, dout("handle_caps from mds%d\n", session->s_mds); + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; + /* decode */ end = msg->front.iov_base + msg->front.iov_len; if (msg->front.iov_len < sizeof(*h)) @@ -4195,13 +4340,17 @@ void ceph_handle_caps(struct ceph_mds_session *session, ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad); } + if (msg_version >= 12) { + if (parse_fscrypt_fields(&p, end, &extra_info)) + goto bad; + } + /* lookup ino */ inode = ceph_find_inode(mdsc->fsc->sb, vino); dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, vino.snap, inode); mutex_lock(&session->s_mutex); - inc_session_sequence(session); dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); @@ -4292,7 +4441,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, break; case CEPH_CAP_OP_TRUNC: - queue_trunc = handle_cap_trunc(inode, h, session); + queue_trunc = handle_cap_trunc(inode, h, session, + &extra_info); spin_unlock(&ci->i_ceph_lock); if (queue_trunc) ceph_queue_vmtruncate(inode); @@ -4309,12 +4459,15 @@ void ceph_handle_caps(struct ceph_mds_session *session, done_unlocked: iput(inode); out: + ceph_dec_mds_stopping_blocker(mdsc); + ceph_put_string(extra_info.pool_ns); /* Defer closing the sessions after s_mutex lock being released */ if (close_sessions) ceph_mdsc_close_sessions(mdsc); + kfree(extra_info.fscrypt_auth); return; flush_cap_releases: @@ -4611,6 +4764,18 @@ int ceph_encode_inode_release(void **p, struct inode *inode, return ret; } +/** + * ceph_encode_dentry_release - encode a dentry release into an outgoing request + * @p: outgoing request buffer + * @dentry: dentry to release + * @dir: dir to release it from + * @mds: mds that we're speaking to + * @drop: caps being dropped + * @unless: unless we have these caps + * + * Encode a dentry release into an outgoing request buffer. Returns 1 if the + * thing was released, or a negative error code otherwise. + */ int ceph_encode_dentry_release(void **p, struct dentry *dentry, struct inode *dir, int mds, int drop, int unless) @@ -4643,13 +4808,25 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, if (ret && di->lease_session && di->lease_session->s_mds == mds) { dout("encode_dentry_release %p mds%d seq %d\n", dentry, mds, (int)di->lease_seq); - rel->dname_len = cpu_to_le32(dentry->d_name.len); - memcpy(*p, dentry->d_name.name, dentry->d_name.len); - *p += dentry->d_name.len; rel->dname_seq = cpu_to_le32(di->lease_seq); __ceph_mdsc_drop_dentry_lease(dentry); + spin_unlock(&dentry->d_lock); + if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(dir)) { + int ret2 = ceph_encode_encrypted_fname(dir, dentry, *p); + + if (ret2 < 0) + return ret2; + + rel->dname_len = cpu_to_le32(ret2); + *p += ret2; + } else { + rel->dname_len = cpu_to_le32(dentry->d_name.len); + memcpy(*p, dentry->d_name.name, dentry->d_name.len); + *p += dentry->d_name.len; + } + } else { + spin_unlock(&dentry->d_lock); } - spin_unlock(&dentry->d_lock); return ret; } diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c new file mode 100644 index 000000000000..e4d5cd56a80b --- /dev/null +++ b/fs/ceph/crypto.c @@ -0,0 +1,673 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The base64 encode/decode code was copied from fscrypt: + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * Written by Uday Savagaonkar, 2014. + * Modified by Jaegeuk Kim, 2015. + */ +#include +#include +#include +#include + +#include "super.h" +#include "mds_client.h" +#include "crypto.h" + +/* + * The base64url encoding used by fscrypt includes the '_' character, which may + * cause problems in snapshot names (which can not start with '_'). Thus, we + * used the base64 encoding defined for IMAP mailbox names (RFC 3501) instead, + * which replaces '-' and '_' by '+' and ','. + */ +static const char base64_table[65] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; + +int ceph_base64_encode(const u8 *src, int srclen, char *dst) +{ + u32 ac = 0; + int bits = 0; + int i; + char *cp = dst; + + for (i = 0; i < srclen; i++) { + ac = (ac << 8) | src[i]; + bits += 8; + do { + bits -= 6; + *cp++ = base64_table[(ac >> bits) & 0x3f]; + } while (bits >= 6); + } + if (bits) + *cp++ = base64_table[(ac << (6 - bits)) & 0x3f]; + return cp - dst; +} + +int ceph_base64_decode(const char *src, int srclen, u8 *dst) +{ + u32 ac = 0; + int bits = 0; + int i; + u8 *bp = dst; + + for (i = 0; i < srclen; i++) { + const char *p = strchr(base64_table, src[i]); + + if (p == NULL || src[i] == 0) + return -1; + ac = (ac << 6) | (p - base64_table); + bits += 6; + if (bits >= 8) { + bits -= 8; + *bp++ = (u8)(ac >> bits); + } + } + if (ac & ((1 << bits) - 1)) + return -1; + return bp - dst; +} + +static int ceph_crypt_get_context(struct inode *inode, void *ctx, size_t len) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fscrypt_auth *cfa = (struct ceph_fscrypt_auth *)ci->fscrypt_auth; + u32 ctxlen; + + /* Non existent or too short? */ + if (!cfa || (ci->fscrypt_auth_len < (offsetof(struct ceph_fscrypt_auth, cfa_blob) + 1))) + return -ENOBUFS; + + /* Some format we don't recognize? */ + if (le32_to_cpu(cfa->cfa_version) != CEPH_FSCRYPT_AUTH_VERSION) + return -ENOBUFS; + + ctxlen = le32_to_cpu(cfa->cfa_blob_len); + if (len < ctxlen) + return -ERANGE; + + memcpy(ctx, cfa->cfa_blob, ctxlen); + return ctxlen; +} + +static int ceph_crypt_set_context(struct inode *inode, const void *ctx, + size_t len, void *fs_data) +{ + int ret; + struct iattr attr = { }; + struct ceph_iattr cia = { }; + struct ceph_fscrypt_auth *cfa; + + WARN_ON_ONCE(fs_data); + + if (len > FSCRYPT_SET_CONTEXT_MAX_SIZE) + return -EINVAL; + + cfa = kzalloc(sizeof(*cfa), GFP_KERNEL); + if (!cfa) + return -ENOMEM; + + cfa->cfa_version = cpu_to_le32(CEPH_FSCRYPT_AUTH_VERSION); + cfa->cfa_blob_len = cpu_to_le32(len); + memcpy(cfa->cfa_blob, ctx, len); + + cia.fscrypt_auth = cfa; + + ret = __ceph_setattr(inode, &attr, &cia); + if (ret == 0) + inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED); + kfree(cia.fscrypt_auth); + return ret; +} + +static bool ceph_crypt_empty_dir(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + return ci->i_rsubdirs + ci->i_rfiles == 1; +} + +static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb) +{ + return ceph_sb_to_client(sb)->fsc_dummy_enc_policy.policy; +} + +static struct fscrypt_operations ceph_fscrypt_ops = { + .get_context = ceph_crypt_get_context, + .set_context = ceph_crypt_set_context, + .get_dummy_policy = ceph_get_dummy_policy, + .empty_dir = ceph_crypt_empty_dir, +}; + +void ceph_fscrypt_set_ops(struct super_block *sb) +{ + fscrypt_set_ops(sb, &ceph_fscrypt_ops); +} + +void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc) +{ + fscrypt_free_dummy_policy(&fsc->fsc_dummy_enc_policy); +} + +int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode, + struct ceph_acl_sec_ctx *as) +{ + int ret, ctxsize; + bool encrypted = false; + struct ceph_inode_info *ci = ceph_inode(inode); + + ret = fscrypt_prepare_new_inode(dir, inode, &encrypted); + if (ret) + return ret; + if (!encrypted) + return 0; + + as->fscrypt_auth = kzalloc(sizeof(*as->fscrypt_auth), GFP_KERNEL); + if (!as->fscrypt_auth) + return -ENOMEM; + + ctxsize = fscrypt_context_for_new_inode(as->fscrypt_auth->cfa_blob, + inode); + if (ctxsize < 0) + return ctxsize; + + as->fscrypt_auth->cfa_version = cpu_to_le32(CEPH_FSCRYPT_AUTH_VERSION); + as->fscrypt_auth->cfa_blob_len = cpu_to_le32(ctxsize); + + WARN_ON_ONCE(ci->fscrypt_auth); + kfree(ci->fscrypt_auth); + ci->fscrypt_auth_len = ceph_fscrypt_auth_len(as->fscrypt_auth); + ci->fscrypt_auth = kmemdup(as->fscrypt_auth, ci->fscrypt_auth_len, + GFP_KERNEL); + if (!ci->fscrypt_auth) + return -ENOMEM; + + inode->i_flags |= S_ENCRYPTED; + + return 0; +} + +void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as) +{ + swap(req->r_fscrypt_auth, as->fscrypt_auth); +} + +/* + * User-created snapshots can't start with '_'. Snapshots that start with this + * character are special (hint: there aren't real snapshots) and use the + * following format: + * + * __ + * + * where: + * - - the real snapshot name that may need to be decrypted, + * - - the inode number (in decimal) for the actual snapshot + * + * This function parses these snapshot names and returns the inode + * . 'name_len' will also bet set with the + * length. + */ +static struct inode *parse_longname(const struct inode *parent, + const char *name, int *name_len) +{ + struct inode *dir = NULL; + struct ceph_vino vino = { .snap = CEPH_NOSNAP }; + char *inode_number; + char *name_end; + int orig_len = *name_len; + int ret = -EIO; + + /* Skip initial '_' */ + name++; + name_end = strrchr(name, '_'); + if (!name_end) { + dout("Failed to parse long snapshot name: %s\n", name); + return ERR_PTR(-EIO); + } + *name_len = (name_end - name); + if (*name_len <= 0) { + pr_err("Failed to parse long snapshot name\n"); + return ERR_PTR(-EIO); + } + + /* Get the inode number */ + inode_number = kmemdup_nul(name_end + 1, + orig_len - *name_len - 2, + GFP_KERNEL); + if (!inode_number) + return ERR_PTR(-ENOMEM); + ret = kstrtou64(inode_number, 10, &vino.ino); + if (ret) { + dout("Failed to parse inode number: %s\n", name); + dir = ERR_PTR(ret); + goto out; + } + + /* And finally the inode */ + dir = ceph_find_inode(parent->i_sb, vino); + if (!dir) { + /* This can happen if we're not mounting cephfs on the root */ + dir = ceph_get_inode(parent->i_sb, vino, NULL); + if (!dir) + dir = ERR_PTR(-ENOENT); + } + if (IS_ERR(dir)) + dout("Can't find inode %s (%s)\n", inode_number, name); + +out: + kfree(inode_number); + return dir; +} + +int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name, + char *buf) +{ + struct inode *dir = parent; + struct qstr iname; + u32 len; + int name_len; + int elen; + int ret; + u8 *cryptbuf = NULL; + + iname.name = d_name->name; + name_len = d_name->len; + + /* Handle the special case of snapshot names that start with '_' */ + if ((ceph_snap(dir) == CEPH_SNAPDIR) && (name_len > 0) && + (iname.name[0] == '_')) { + dir = parse_longname(parent, iname.name, &name_len); + if (IS_ERR(dir)) + return PTR_ERR(dir); + iname.name++; /* skip initial '_' */ + } + iname.len = name_len; + + if (!fscrypt_has_encryption_key(dir)) { + memcpy(buf, d_name->name, d_name->len); + elen = d_name->len; + goto out; + } + + /* + * Convert cleartext d_name to ciphertext. If result is longer than + * CEPH_NOHASH_NAME_MAX, sha256 the remaining bytes + * + * See: fscrypt_setup_filename + */ + if (!fscrypt_fname_encrypted_size(dir, iname.len, NAME_MAX, &len)) { + elen = -ENAMETOOLONG; + goto out; + } + + /* Allocate a buffer appropriate to hold the result */ + cryptbuf = kmalloc(len > CEPH_NOHASH_NAME_MAX ? NAME_MAX : len, + GFP_KERNEL); + if (!cryptbuf) { + elen = -ENOMEM; + goto out; + } + + ret = fscrypt_fname_encrypt(dir, &iname, cryptbuf, len); + if (ret) { + elen = ret; + goto out; + } + + /* hash the end if the name is long enough */ + if (len > CEPH_NOHASH_NAME_MAX) { + u8 hash[SHA256_DIGEST_SIZE]; + u8 *extra = cryptbuf + CEPH_NOHASH_NAME_MAX; + + /* + * hash the extra bytes and overwrite crypttext beyond that + * point with it + */ + sha256(extra, len - CEPH_NOHASH_NAME_MAX, hash); + memcpy(extra, hash, SHA256_DIGEST_SIZE); + len = CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE; + } + + /* base64 encode the encrypted name */ + elen = ceph_base64_encode(cryptbuf, len, buf); + dout("base64-encoded ciphertext name = %.*s\n", elen, buf); + + /* To understand the 240 limit, see CEPH_NOHASH_NAME_MAX comments */ + WARN_ON(elen > 240); + if ((elen > 0) && (dir != parent)) { + char tmp_buf[NAME_MAX]; + + elen = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld", + elen, buf, dir->i_ino); + memcpy(buf, tmp_buf, elen); + } + +out: + kfree(cryptbuf); + if (dir != parent) { + if ((dir->i_state & I_NEW)) + discard_new_inode(dir); + else + iput(dir); + } + return elen; +} + +int ceph_encode_encrypted_fname(struct inode *parent, struct dentry *dentry, + char *buf) +{ + WARN_ON_ONCE(!fscrypt_has_encryption_key(parent)); + + return ceph_encode_encrypted_dname(parent, &dentry->d_name, buf); +} + +/** + * ceph_fname_to_usr - convert a filename for userland presentation + * @fname: ceph_fname to be converted + * @tname: temporary name buffer to use for conversion (may be NULL) + * @oname: where converted name should be placed + * @is_nokey: set to true if key wasn't available during conversion (may be NULL) + * + * Given a filename (usually from the MDS), format it for presentation to + * userland. If @parent is not encrypted, just pass it back as-is. + * + * Otherwise, base64 decode the string, and then ask fscrypt to format it + * for userland presentation. + * + * Returns 0 on success or negative error code on error. + */ +int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname, + struct fscrypt_str *oname, bool *is_nokey) +{ + struct inode *dir = fname->dir; + struct fscrypt_str _tname = FSTR_INIT(NULL, 0); + struct fscrypt_str iname; + char *name = fname->name; + int name_len = fname->name_len; + int ret; + + /* Sanity check that the resulting name will fit in the buffer */ + if (fname->name_len > NAME_MAX || fname->ctext_len > NAME_MAX) + return -EIO; + + /* Handle the special case of snapshot names that start with '_' */ + if ((ceph_snap(dir) == CEPH_SNAPDIR) && (name_len > 0) && + (name[0] == '_')) { + dir = parse_longname(dir, name, &name_len); + if (IS_ERR(dir)) + return PTR_ERR(dir); + name++; /* skip initial '_' */ + } + + if (!IS_ENCRYPTED(dir)) { + oname->name = fname->name; + oname->len = fname->name_len; + ret = 0; + goto out_inode; + } + + ret = ceph_fscrypt_prepare_readdir(dir); + if (ret) + goto out_inode; + + /* + * Use the raw dentry name as sent by the MDS instead of + * generating a nokey name via fscrypt. + */ + if (!fscrypt_has_encryption_key(dir)) { + if (fname->no_copy) + oname->name = fname->name; + else + memcpy(oname->name, fname->name, fname->name_len); + oname->len = fname->name_len; + if (is_nokey) + *is_nokey = true; + ret = 0; + goto out_inode; + } + + if (fname->ctext_len == 0) { + int declen; + + if (!tname) { + ret = fscrypt_fname_alloc_buffer(NAME_MAX, &_tname); + if (ret) + goto out_inode; + tname = &_tname; + } + + declen = ceph_base64_decode(name, name_len, tname->name); + if (declen <= 0) { + ret = -EIO; + goto out; + } + iname.name = tname->name; + iname.len = declen; + } else { + iname.name = fname->ctext; + iname.len = fname->ctext_len; + } + + ret = fscrypt_fname_disk_to_usr(dir, 0, 0, &iname, oname); + if (!ret && (dir != fname->dir)) { + char tmp_buf[CEPH_BASE64_CHARS(NAME_MAX)]; + + name_len = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld", + oname->len, oname->name, dir->i_ino); + memcpy(oname->name, tmp_buf, name_len); + oname->len = name_len; + } + +out: + fscrypt_fname_free_buffer(&_tname); +out_inode: + if ((dir != fname->dir) && !IS_ERR(dir)) { + if ((dir->i_state & I_NEW)) + discard_new_inode(dir); + else + iput(dir); + } + return ret; +} + +/** + * ceph_fscrypt_prepare_readdir - simple __fscrypt_prepare_readdir() wrapper + * @dir: directory inode for readdir prep + * + * Simple wrapper around __fscrypt_prepare_readdir() that will mark directory as + * non-complete if this call results in having the directory unlocked. + * + * Returns: + * 1 - if directory was locked and key is now loaded (i.e. dir is unlocked) + * 0 - if directory is still locked + * < 0 - if __fscrypt_prepare_readdir() fails + */ +int ceph_fscrypt_prepare_readdir(struct inode *dir) +{ + bool had_key = fscrypt_has_encryption_key(dir); + int err; + + if (!IS_ENCRYPTED(dir)) + return 0; + + err = __fscrypt_prepare_readdir(dir); + if (err) + return err; + if (!had_key && fscrypt_has_encryption_key(dir)) { + /* directory just got unlocked, mark it as not complete */ + ceph_dir_clear_complete(dir); + return 1; + } + return 0; +} + +int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num) +{ + dout("%s: len %u offs %u blk %llu\n", __func__, len, offs, lblk_num); + return fscrypt_decrypt_block_inplace(inode, page, len, offs, lblk_num); +} + +int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num, + gfp_t gfp_flags) +{ + dout("%s: len %u offs %u blk %llu\n", __func__, len, offs, lblk_num); + return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num, + gfp_flags); +} + +/** + * ceph_fscrypt_decrypt_pages - decrypt an array of pages + * @inode: pointer to inode associated with these pages + * @page: pointer to page array + * @off: offset into the file that the read data starts + * @len: max length to decrypt + * + * Decrypt an array of fscrypt'ed pages and return the amount of + * data decrypted. Any data in the page prior to the start of the + * first complete block in the read is ignored. Any incomplete + * crypto blocks at the end of the array are ignored (and should + * probably be zeroed by the caller). + * + * Returns the length of the decrypted data or a negative errno. + */ +int ceph_fscrypt_decrypt_pages(struct inode *inode, struct page **page, + u64 off, int len) +{ + int i, num_blocks; + u64 baseblk = off >> CEPH_FSCRYPT_BLOCK_SHIFT; + int ret = 0; + + /* + * We can't deal with partial blocks on an encrypted file, so mask off + * the last bit. + */ + num_blocks = ceph_fscrypt_blocks(off, len & CEPH_FSCRYPT_BLOCK_MASK); + + /* Decrypt each block */ + for (i = 0; i < num_blocks; ++i) { + int blkoff = i << CEPH_FSCRYPT_BLOCK_SHIFT; + int pgidx = blkoff >> PAGE_SHIFT; + unsigned int pgoffs = offset_in_page(blkoff); + int fret; + + fret = ceph_fscrypt_decrypt_block_inplace(inode, page[pgidx], + CEPH_FSCRYPT_BLOCK_SIZE, pgoffs, + baseblk + i); + if (fret < 0) { + if (ret == 0) + ret = fret; + break; + } + ret += CEPH_FSCRYPT_BLOCK_SIZE; + } + return ret; +} + +/** + * ceph_fscrypt_decrypt_extents: decrypt received extents in given buffer + * @inode: inode associated with pages being decrypted + * @page: pointer to page array + * @off: offset into the file that the data in page[0] starts + * @map: pointer to extent array + * @ext_cnt: length of extent array + * + * Given an extent map and a page array, decrypt the received data in-place, + * skipping holes. Returns the offset into buffer of end of last decrypted + * block. + */ +int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page, + u64 off, struct ceph_sparse_extent *map, + u32 ext_cnt) +{ + int i, ret = 0; + struct ceph_inode_info *ci = ceph_inode(inode); + u64 objno, objoff; + u32 xlen; + + /* Nothing to do for empty array */ + if (ext_cnt == 0) { + dout("%s: empty array, ret 0\n", __func__); + return 0; + } + + ceph_calc_file_object_mapping(&ci->i_layout, off, map[0].len, + &objno, &objoff, &xlen); + + for (i = 0; i < ext_cnt; ++i) { + struct ceph_sparse_extent *ext = &map[i]; + int pgsoff = ext->off - objoff; + int pgidx = pgsoff >> PAGE_SHIFT; + int fret; + + if ((ext->off | ext->len) & ~CEPH_FSCRYPT_BLOCK_MASK) { + pr_warn("%s: bad encrypted sparse extent idx %d off %llx len %llx\n", + __func__, i, ext->off, ext->len); + return -EIO; + } + fret = ceph_fscrypt_decrypt_pages(inode, &page[pgidx], + off + pgsoff, ext->len); + dout("%s: [%d] 0x%llx~0x%llx fret %d\n", __func__, i, + ext->off, ext->len, fret); + if (fret < 0) { + if (ret == 0) + ret = fret; + break; + } + ret = pgsoff + fret; + } + dout("%s: ret %d\n", __func__, ret); + return ret; +} + +/** + * ceph_fscrypt_encrypt_pages - encrypt an array of pages + * @inode: pointer to inode associated with these pages + * @page: pointer to page array + * @off: offset into the file that the data starts + * @len: max length to encrypt + * @gfp: gfp flags to use for allocation + * + * Decrypt an array of cleartext pages and return the amount of + * data encrypted. Any data in the page prior to the start of the + * first complete block in the read is ignored. Any incomplete + * crypto blocks at the end of the array are ignored. + * + * Returns the length of the encrypted data or a negative errno. + */ +int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off, + int len, gfp_t gfp) +{ + int i, num_blocks; + u64 baseblk = off >> CEPH_FSCRYPT_BLOCK_SHIFT; + int ret = 0; + + /* + * We can't deal with partial blocks on an encrypted file, so mask off + * the last bit. + */ + num_blocks = ceph_fscrypt_blocks(off, len & CEPH_FSCRYPT_BLOCK_MASK); + + /* Encrypt each block */ + for (i = 0; i < num_blocks; ++i) { + int blkoff = i << CEPH_FSCRYPT_BLOCK_SHIFT; + int pgidx = blkoff >> PAGE_SHIFT; + unsigned int pgoffs = offset_in_page(blkoff); + int fret; + + fret = ceph_fscrypt_encrypt_block_inplace(inode, page[pgidx], + CEPH_FSCRYPT_BLOCK_SIZE, pgoffs, + baseblk + i, gfp); + if (fret < 0) { + if (ret == 0) + ret = fret; + break; + } + ret += CEPH_FSCRYPT_BLOCK_SIZE; + } + return ret; +} diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h new file mode 100644 index 000000000000..47e0c319fc68 --- /dev/null +++ b/fs/ceph/crypto.h @@ -0,0 +1,288 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Ceph fscrypt functionality + */ + +#ifndef _CEPH_CRYPTO_H +#define _CEPH_CRYPTO_H + +#include +#include + +#define CEPH_FSCRYPT_BLOCK_SHIFT 12 +#define CEPH_FSCRYPT_BLOCK_SIZE (_AC(1, UL) << CEPH_FSCRYPT_BLOCK_SHIFT) +#define CEPH_FSCRYPT_BLOCK_MASK (~(CEPH_FSCRYPT_BLOCK_SIZE-1)) + +struct ceph_fs_client; +struct ceph_acl_sec_ctx; +struct ceph_mds_request; + +struct ceph_fname { + struct inode *dir; + char *name; // b64 encoded, possibly hashed + unsigned char *ctext; // binary crypttext (if any) + u32 name_len; // length of name buffer + u32 ctext_len; // length of crypttext + bool no_copy; +}; + +/* + * Header for the crypted file when truncating the size, this + * will be sent to MDS, and the MDS will update the encrypted + * last block and then truncate the size. + */ +struct ceph_fscrypt_truncate_size_header { + __u8 ver; + __u8 compat; + + /* + * It will be sizeof(assert_ver + file_offset + block_size) + * if the last block is empty when it's located in a file + * hole. Or the data_len will plus CEPH_FSCRYPT_BLOCK_SIZE. + */ + __le32 data_len; + + __le64 change_attr; + __le64 file_offset; + __le32 block_size; +} __packed; + +struct ceph_fscrypt_auth { + __le32 cfa_version; + __le32 cfa_blob_len; + u8 cfa_blob[FSCRYPT_SET_CONTEXT_MAX_SIZE]; +} __packed; + +#define CEPH_FSCRYPT_AUTH_VERSION 1 +static inline u32 ceph_fscrypt_auth_len(struct ceph_fscrypt_auth *fa) +{ + u32 ctxsize = le32_to_cpu(fa->cfa_blob_len); + + return offsetof(struct ceph_fscrypt_auth, cfa_blob) + ctxsize; +} + +#ifdef CONFIG_FS_ENCRYPTION +/* + * We want to encrypt filenames when creating them, but the encrypted + * versions of those names may have illegal characters in them. To mitigate + * that, we base64 encode them, but that gives us a result that can exceed + * NAME_MAX. + * + * Follow a similar scheme to fscrypt itself, and cap the filename to a + * smaller size. If the ciphertext name is longer than the value below, then + * sha256 hash the remaining bytes. + * + * For the fscrypt_nokey_name struct the dirhash[2] member is useless in ceph + * so the corresponding struct will be: + * + * struct fscrypt_ceph_nokey_name { + * u8 bytes[157]; + * u8 sha256[SHA256_DIGEST_SIZE]; + * }; // 180 bytes => 240 bytes base64-encoded, which is <= NAME_MAX (255) + * + * (240 bytes is the maximum size allowed for snapshot names to take into + * account the format: '__'.) + * + * Note that for long names that end up having their tail portion hashed, we + * must also store the full encrypted name (in the dentry's alternate_name + * field). + */ +#define CEPH_NOHASH_NAME_MAX (180 - SHA256_DIGEST_SIZE) + +#define CEPH_BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) + +int ceph_base64_encode(const u8 *src, int srclen, char *dst); +int ceph_base64_decode(const char *src, int srclen, u8 *dst); + +void ceph_fscrypt_set_ops(struct super_block *sb); + +void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc); + +int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode, + struct ceph_acl_sec_ctx *as); +void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as); +int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name, + char *buf); +int ceph_encode_encrypted_fname(struct inode *parent, struct dentry *dentry, + char *buf); + +static inline int ceph_fname_alloc_buffer(struct inode *parent, + struct fscrypt_str *fname) +{ + if (!IS_ENCRYPTED(parent)) + return 0; + return fscrypt_fname_alloc_buffer(NAME_MAX, fname); +} + +static inline void ceph_fname_free_buffer(struct inode *parent, + struct fscrypt_str *fname) +{ + if (IS_ENCRYPTED(parent)) + fscrypt_fname_free_buffer(fname); +} + +int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname, + struct fscrypt_str *oname, bool *is_nokey); +int ceph_fscrypt_prepare_readdir(struct inode *dir); + +static inline unsigned int ceph_fscrypt_blocks(u64 off, u64 len) +{ + /* crypto blocks cannot span more than one page */ + BUILD_BUG_ON(CEPH_FSCRYPT_BLOCK_SHIFT > PAGE_SHIFT); + + return ((off+len+CEPH_FSCRYPT_BLOCK_SIZE-1) >> CEPH_FSCRYPT_BLOCK_SHIFT) - + (off >> CEPH_FSCRYPT_BLOCK_SHIFT); +} + +/* + * If we have an encrypted inode then we must adjust the offset and + * range of the on-the-wire read to cover an entire encryption block. + * The copy will be done using the original offset and length, after + * we've decrypted the result. + */ +static inline void ceph_fscrypt_adjust_off_and_len(struct inode *inode, + u64 *off, u64 *len) +{ + if (IS_ENCRYPTED(inode)) { + *len = ceph_fscrypt_blocks(*off, *len) * CEPH_FSCRYPT_BLOCK_SIZE; + *off &= CEPH_FSCRYPT_BLOCK_MASK; + } +} + +int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num); +int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num, + gfp_t gfp_flags); +int ceph_fscrypt_decrypt_pages(struct inode *inode, struct page **page, + u64 off, int len); +int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page, + u64 off, struct ceph_sparse_extent *map, + u32 ext_cnt); +int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off, + int len, gfp_t gfp); + +static inline struct page *ceph_fscrypt_pagecache_page(struct page *page) +{ + return fscrypt_is_bounce_page(page) ? fscrypt_pagecache_page(page) : page; +} + +#else /* CONFIG_FS_ENCRYPTION */ + +static inline void ceph_fscrypt_set_ops(struct super_block *sb) +{ +} + +static inline void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc) +{ +} + +static inline int ceph_fscrypt_prepare_context(struct inode *dir, + struct inode *inode, + struct ceph_acl_sec_ctx *as) +{ + if (IS_ENCRYPTED(dir)) + return -EOPNOTSUPP; + return 0; +} + +static inline void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as_ctx) +{ +} + +static inline int ceph_encode_encrypted_dname(struct inode *parent, + struct qstr *d_name, char *buf) +{ + memcpy(buf, d_name->name, d_name->len); + return d_name->len; +} + +static inline int ceph_encode_encrypted_fname(struct inode *parent, + struct dentry *dentry, char *buf) +{ + return -EOPNOTSUPP; +} + +static inline int ceph_fname_alloc_buffer(struct inode *parent, + struct fscrypt_str *fname) +{ + return 0; +} + +static inline void ceph_fname_free_buffer(struct inode *parent, + struct fscrypt_str *fname) +{ +} + +static inline int ceph_fname_to_usr(const struct ceph_fname *fname, + struct fscrypt_str *tname, + struct fscrypt_str *oname, bool *is_nokey) +{ + oname->name = fname->name; + oname->len = fname->name_len; + return 0; +} + +static inline int ceph_fscrypt_prepare_readdir(struct inode *dir) +{ + return 0; +} + +static inline void ceph_fscrypt_adjust_off_and_len(struct inode *inode, + u64 *off, u64 *len) +{ +} + +static inline int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num) +{ + return 0; +} + +static inline int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode, + struct page *page, unsigned int len, + unsigned int offs, u64 lblk_num, + gfp_t gfp_flags) +{ + return 0; +} + +static inline int ceph_fscrypt_decrypt_pages(struct inode *inode, + struct page **page, u64 off, + int len) +{ + return 0; +} + +static inline int ceph_fscrypt_decrypt_extents(struct inode *inode, + struct page **page, u64 off, + struct ceph_sparse_extent *map, + u32 ext_cnt) +{ + return 0; +} + +static inline int ceph_fscrypt_encrypt_pages(struct inode *inode, + struct page **page, u64 off, + int len, gfp_t gfp) +{ + return 0; +} + +static inline struct page *ceph_fscrypt_pagecache_page(struct page *page) +{ + return page; +} +#endif /* CONFIG_FS_ENCRYPTION */ + +static inline loff_t ceph_fscrypt_page_offset(struct page *page) +{ + return page_offset(ceph_fscrypt_pagecache_page(page)); +} + +#endif /* _CEPH_CRYPTO_H */ diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index bdcffb04513f..854cbdd66661 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -9,6 +9,7 @@ #include "super.h" #include "mds_client.h" +#include "crypto.h" /* * Directory operations: readdir, lookup, create, link, unlink, @@ -241,7 +242,9 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, di = ceph_dentry(dentry); if (d_unhashed(dentry) || d_really_is_negative(dentry) || - di->lease_shared_gen != shared_gen) { + di->lease_shared_gen != shared_gen || + ((dentry->d_flags & DCACHE_NOKEY_NAME) && + fscrypt_has_encryption_key(dir))) { spin_unlock(&dentry->d_lock); dput(dentry); err = -EAGAIN; @@ -340,6 +343,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ctx->pos = 2; } + err = ceph_fscrypt_prepare_readdir(inode); + if (err < 0) + return err; + spin_lock(&ci->i_ceph_lock); /* request Fx cap. if have Fx, we don't need to release Fs cap * for later create/unlink. */ @@ -389,6 +396,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); + err = ceph_alloc_readdir_reply_buffer(req, inode); if (err) { ceph_mdsc_put_request(req); @@ -402,11 +410,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) req->r_inode_drop = CEPH_CAP_FILE_EXCL; } if (dfi->last_name) { - req->r_path2 = kstrdup(dfi->last_name, GFP_KERNEL); + struct qstr d_name = { .name = dfi->last_name, + .len = strlen(dfi->last_name) }; + + req->r_path2 = kzalloc(NAME_MAX + 1, GFP_KERNEL); if (!req->r_path2) { ceph_mdsc_put_request(req); return -ENOMEM; } + + err = ceph_encode_encrypted_dname(inode, &d_name, + req->r_path2); + if (err < 0) { + ceph_mdsc_put_request(req); + return err; + } } else if (is_hash_order(ctx->pos)) { req->r_args.readdir.offset_hash = cpu_to_le32(fpos_hash(ctx->pos)); @@ -511,15 +529,20 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) for (; i < rinfo->dir_nr; i++) { struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; - BUG_ON(rde->offset < ctx->pos); + if (rde->offset < ctx->pos) { + pr_warn("%s: rde->offset 0x%llx ctx->pos 0x%llx\n", + __func__, rde->offset, ctx->pos); + return -EIO; + } + + if (WARN_ON_ONCE(!rde->inode.in)) + return -EIO; ctx->pos = rde->offset; dout("readdir (%d/%d) -> %llx '%.*s' %p\n", i, rinfo->dir_nr, ctx->pos, rde->name_len, rde->name, &rde->inode.in); - BUG_ON(!rde->inode.in); - if (!dir_emit(ctx, rde->name, rde->name_len, ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)), le32_to_cpu(rde->inode.in->mode) >> 12)) { @@ -532,6 +555,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) dout("filldir stopping us...\n"); return 0; } + + /* Reset the lengths to their original allocated vals */ ctx->pos++; } @@ -586,7 +611,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) dfi->dir_ordered_count); spin_unlock(&ci->i_ceph_lock); } - dout("readdir %p file %p done.\n", inode, file); return 0; } @@ -760,6 +784,18 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); + if (IS_ENCRYPTED(dir)) { + bool had_key = fscrypt_has_encryption_key(dir); + + err = fscrypt_prepare_lookup_partial(dir, dentry); + if (err < 0) + return ERR_PTR(err); + + /* mark directory as incomplete if it has been unlocked */ + if (!had_key && fscrypt_has_encryption_key(dir)) + ceph_dir_clear_complete(dir); + } + /* can we conclude ENOENT locally? */ if (d_really_is_negative(dentry)) { struct ceph_inode_info *ci = ceph_inode(dir); @@ -865,13 +901,6 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir, goto out; } - err = ceph_pre_init_acls(dir, &mode, &as_ctx); - if (err < 0) - goto out; - err = ceph_security_init_secctx(dentry, mode, &as_ctx); - if (err < 0) - goto out; - dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n", dir, dentry, mode, rdev); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS); @@ -879,6 +908,17 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir, err = PTR_ERR(req); goto out; } + + req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); + if (IS_ERR(req->r_new_inode)) { + err = PTR_ERR(req->r_new_inode); + req->r_new_inode = NULL; + goto out_req; + } + + if (S_ISREG(mode) && IS_ENCRYPTED(dir)) + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); + req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_parent = dir; @@ -889,13 +929,13 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir, req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; - if (as_ctx.pagelist) { - req->r_pagelist = as_ctx.pagelist; - as_ctx.pagelist = NULL; - } + + ceph_as_ctx_to_req(req, &as_ctx); + err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); +out_req: ceph_mdsc_put_request(req); out: if (!err) @@ -912,12 +952,50 @@ static int ceph_create(struct mnt_idmap *idmap, struct inode *dir, return ceph_mknod(idmap, dir, dentry, mode, 0); } +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +static int prep_encrypted_symlink_target(struct ceph_mds_request *req, + const char *dest) +{ + int err; + int len = strlen(dest); + struct fscrypt_str osd_link = FSTR_INIT(NULL, 0); + + err = fscrypt_prepare_symlink(req->r_parent, dest, len, PATH_MAX, + &osd_link); + if (err) + goto out; + + err = fscrypt_encrypt_symlink(req->r_new_inode, dest, len, &osd_link); + if (err) + goto out; + + req->r_path2 = kmalloc(CEPH_BASE64_CHARS(osd_link.len) + 1, GFP_KERNEL); + if (!req->r_path2) { + err = -ENOMEM; + goto out; + } + + len = ceph_base64_encode(osd_link.name, osd_link.len, req->r_path2); + req->r_path2[len] = '\0'; +out: + fscrypt_fname_free_buffer(&osd_link); + return err; +} +#else +static int prep_encrypted_symlink_target(struct ceph_mds_request *req, + const char *dest) +{ + return -EOPNOTSUPP; +} +#endif + static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *dest) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; struct ceph_acl_sec_ctx as_ctx = {}; + umode_t mode = S_IFLNK | 0777; int err; if (ceph_snap(dir) != CEPH_NOSNAP) @@ -932,38 +1010,48 @@ static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir, goto out; } - err = ceph_security_init_secctx(dentry, S_IFLNK | 0777, &as_ctx); - if (err < 0) - goto out; - dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; } - req->r_path2 = kstrdup(dest, GFP_KERNEL); - if (!req->r_path2) { - err = -ENOMEM; - ceph_mdsc_put_request(req); - goto out; + + req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); + if (IS_ERR(req->r_new_inode)) { + err = PTR_ERR(req->r_new_inode); + req->r_new_inode = NULL; + goto out_req; } + req->r_parent = dir; ihold(dir); + if (IS_ENCRYPTED(req->r_new_inode)) { + err = prep_encrypted_symlink_target(req, dest); + if (err) + goto out_req; + } else { + req->r_path2 = kstrdup(dest, GFP_KERNEL); + if (!req->r_path2) { + err = -ENOMEM; + goto out_req; + } + } + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; - if (as_ctx.pagelist) { - req->r_pagelist = as_ctx.pagelist; - as_ctx.pagelist = NULL; - } + + ceph_as_ctx_to_req(req, &as_ctx); + err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); +out_req: ceph_mdsc_put_request(req); out: if (err) @@ -1003,14 +1091,12 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, err = -EDQUOT; goto out; } + if ((op == CEPH_MDS_OP_MKSNAP) && IS_ENCRYPTED(dir) && + !fscrypt_has_encryption_key(dir)) { + err = -ENOKEY; + goto out; + } - mode |= S_IFDIR; - err = ceph_pre_init_acls(dir, &mode, &as_ctx); - if (err < 0) - goto out; - err = ceph_security_init_secctx(dentry, mode, &as_ctx); - if (err < 0) - goto out; req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { @@ -1018,6 +1104,14 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, goto out; } + mode |= S_IFDIR; + req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); + if (IS_ERR(req->r_new_inode)) { + err = PTR_ERR(req->r_new_inode); + req->r_new_inode = NULL; + goto out_req; + } + req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_parent = dir; @@ -1027,15 +1121,15 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; - if (as_ctx.pagelist) { - req->r_pagelist = as_ctx.pagelist; - as_ctx.pagelist = NULL; - } + + ceph_as_ctx_to_req(req, &as_ctx); + err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_target && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); +out_req: ceph_mdsc_put_request(req); out: if (!err) @@ -1063,6 +1157,10 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; + err = fscrypt_prepare_link(old_dentry, dir, dentry); + if (err) + return err; + dout("link in dir %p %llx.%llx old_dentry %p:'%pd' dentry %p:'%pd'\n", dir, ceph_vinop(dir), old_dentry, old_dentry, dentry, dentry); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS); @@ -1310,6 +1408,11 @@ static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (err) return err; + err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry, + flags); + if (err) + return err; + dout("rename dir %p dentry %p to dir %p dentry %p\n", old_dir, old_dentry, new_dir, new_dentry); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); @@ -1765,6 +1868,10 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) struct inode *dir, *inode; struct ceph_mds_client *mdsc; + valid = fscrypt_d_revalidate(dentry, flags); + if (valid <= 0) + return valid; + if (flags & LOOKUP_RCU) { parent = READ_ONCE(dentry->d_parent); dir = d_inode_rcu(parent); @@ -1777,8 +1884,9 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) inode = d_inode(dentry); } - dout("d_revalidate %p '%pd' inode %p offset 0x%llx\n", dentry, - dentry, inode, ceph_dentry(dentry)->offset); + dout("d_revalidate %p '%pd' inode %p offset 0x%llx nokey %d\n", dentry, + dentry, inode, ceph_dentry(dentry)->offset, + !!(dentry->d_flags & DCACHE_NOKEY_NAME)); mdsc = ceph_sb_to_client(dir->i_sb)->mdsc; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index f780e4e0d062..8559990a59a5 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -7,6 +7,7 @@ #include "super.h" #include "mds_client.h" +#include "crypto.h" /* * Basic fh @@ -535,7 +536,9 @@ static int ceph_get_name(struct dentry *parent, char *name, { struct ceph_mds_client *mdsc; struct ceph_mds_request *req; + struct inode *dir = d_inode(parent); struct inode *inode = d_inode(child); + struct ceph_mds_reply_info_parsed *rinfo; int err; if (ceph_snap(inode) != CEPH_NOSNAP) @@ -547,30 +550,47 @@ static int ceph_get_name(struct dentry *parent, char *name, if (IS_ERR(req)) return PTR_ERR(req); - inode_lock(d_inode(parent)); - + inode_lock(dir); req->r_inode = inode; ihold(inode); req->r_ino2 = ceph_vino(d_inode(parent)); - req->r_parent = d_inode(parent); - ihold(req->r_parent); + req->r_parent = dir; + ihold(dir); set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); + inode_unlock(dir); - inode_unlock(d_inode(parent)); + if (err) + goto out; - if (!err) { - struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; + rinfo = &req->r_reply_info; + if (!IS_ENCRYPTED(dir)) { memcpy(name, rinfo->dname, rinfo->dname_len); name[rinfo->dname_len] = 0; - dout("get_name %p ino %llx.%llx name %s\n", - child, ceph_vinop(inode), name); } else { - dout("get_name %p ino %llx.%llx err %d\n", - child, ceph_vinop(inode), err); - } + struct fscrypt_str oname = FSTR_INIT(NULL, 0); + struct ceph_fname fname = { .dir = dir, + .name = rinfo->dname, + .ctext = rinfo->altname, + .name_len = rinfo->dname_len, + .ctext_len = rinfo->altname_len }; + err = ceph_fname_alloc_buffer(dir, &oname); + if (err < 0) + goto out; + + err = ceph_fname_to_usr(&fname, NULL, &oname, NULL); + if (!err) { + memcpy(name, oname.name, oname.len); + name[oname.len] = 0; + } + ceph_fname_free_buffer(dir, &oname); + } +out: + dout("get_name %p ino %llx.%llx err %d %s%s\n", + child, ceph_vinop(inode), err, + err ? "" : "name ", err ? "" : name); ceph_mdsc_put_request(req); return err; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 63efe5389783..b1da02f5dbe3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -366,8 +366,13 @@ int ceph_open(struct inode *inode, struct file *file) /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */ flags = file->f_flags & ~(O_CREAT|O_EXCL); - if (S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode)) { flags = O_DIRECTORY; /* mds likes to know */ + } else if (S_ISREG(inode->i_mode)) { + err = fscrypt_file_open(inode, file); + if (err) + return err; + } dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode, ceph_vinop(inode), file, flags, file->f_flags); @@ -604,7 +609,8 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc, ceph_mdsc_release_dir_caps(req); } -static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, +static int ceph_finish_async_create(struct inode *dir, struct inode *inode, + struct dentry *dentry, struct file *file, umode_t mode, struct ceph_mds_request *req, struct ceph_acl_sec_ctx *as_ctx, @@ -616,7 +622,6 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, struct ceph_mds_reply_info_in iinfo = { .in = &in }; struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_dentry_info *di = ceph_dentry(dentry); - struct inode *inode; struct timespec64 now; struct ceph_string *pool_ns; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); @@ -625,10 +630,6 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, ktime_get_real_ts64(&now); - inode = ceph_get_inode(dentry->d_sb, vino); - if (IS_ERR(inode)) - return PTR_ERR(inode); - iinfo.inline_version = CEPH_INLINE_NONE; iinfo.change_attr = 1; ceph_encode_timespec64(&iinfo.btime, &now); @@ -686,8 +687,7 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, ceph_dir_clear_complete(dir); if (!d_unhashed(dentry)) d_drop(dentry); - if (inode->i_state & I_NEW) - discard_new_inode(inode); + discard_new_inode(inode); } else { struct dentry *dn; @@ -733,6 +733,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; + struct inode *new_inode = NULL; struct dentry *dn; struct ceph_acl_sec_ctx as_ctx = {}; bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS); @@ -755,15 +756,16 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, */ flags &= ~O_TRUNC; +retry: if (flags & O_CREAT) { if (ceph_quota_is_max_files_exceeded(dir)) return -EDQUOT; - err = ceph_pre_init_acls(dir, &mode, &as_ctx); - if (err < 0) - return err; - err = ceph_security_init_secctx(dentry, mode, &as_ctx); - if (err < 0) + + new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); + if (IS_ERR(new_inode)) { + err = PTR_ERR(new_inode); goto out_ctx; + } /* Async create can't handle more than a page of xattrs */ if (as_ctx.pagelist && !list_is_singular(&as_ctx.pagelist->head)) @@ -772,7 +774,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, /* If it's not being looked up, it's negative */ return -ENOENT; } -retry: + /* do the open */ req = prepare_open_request(dir->i_sb, flags, mode); if (IS_ERR(req)) { @@ -787,6 +789,12 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, req->r_args.open.mask = cpu_to_le32(mask); req->r_parent = dir; ihold(dir); + if (IS_ENCRYPTED(dir)) { + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); + err = fscrypt_prepare_lookup_partial(dir, dentry); + if (err < 0) + goto out_req; + } if (flags & O_CREAT) { struct ceph_file_layout lo; @@ -794,32 +802,47 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; - if (as_ctx.pagelist) { - req->r_pagelist = as_ctx.pagelist; - as_ctx.pagelist = NULL; - } - if (try_async && - (req->r_dir_caps = - try_prep_async_create(dir, dentry, &lo, - &req->r_deleg_ino))) { + + ceph_as_ctx_to_req(req, &as_ctx); + + if (try_async && (req->r_dir_caps = + try_prep_async_create(dir, dentry, &lo, + &req->r_deleg_ino))) { + struct ceph_vino vino = { .ino = req->r_deleg_ino, + .snap = CEPH_NOSNAP }; struct ceph_dentry_info *di = ceph_dentry(dentry); set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL); req->r_callback = ceph_async_create_cb; + /* Hash inode before RPC */ + new_inode = ceph_get_inode(dir->i_sb, vino, new_inode); + if (IS_ERR(new_inode)) { + err = PTR_ERR(new_inode); + new_inode = NULL; + goto out_req; + } + WARN_ON_ONCE(!(new_inode->i_state & I_NEW)); + spin_lock(&dentry->d_lock); di->flags |= CEPH_DENTRY_ASYNC_CREATE; spin_unlock(&dentry->d_lock); err = ceph_mdsc_submit_request(mdsc, dir, req); if (!err) { - err = ceph_finish_async_create(dir, dentry, - file, mode, req, - &as_ctx, &lo); + err = ceph_finish_async_create(dir, new_inode, + dentry, file, + mode, req, + &as_ctx, &lo); + new_inode = NULL; } else if (err == -EJUKEBOX) { restore_deleg_ino(dir, req->r_deleg_ino); ceph_mdsc_put_request(req); + discard_new_inode(new_inode); + ceph_release_acl_sec_ctx(&as_ctx); + memset(&as_ctx, 0, sizeof(as_ctx)); + new_inode = NULL; try_async = false; ceph_put_string(rcu_dereference_raw(lo.pool_ns)); goto retry; @@ -830,6 +853,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + req->r_new_inode = new_inode; + new_inode = NULL; err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req); if (err == -ENOENT) { dentry = ceph_handle_snapdir(req, dentry); @@ -858,6 +883,13 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, dout("atomic_open finish_no_open on dn %p\n", dn); err = finish_no_open(file, dn); } else { + if (IS_ENCRYPTED(dir) && + !fscrypt_has_permitted_context(dir, d_inode(dentry))) { + pr_warn("Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n", + ceph_vinop(dir), ceph_vinop(d_inode(dentry))); + goto out_req; + } + dout("atomic_open finish_open on dn %p\n", dn); if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { struct inode *newino = d_inode(dentry); @@ -870,6 +902,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } out_req: ceph_mdsc_put_request(req); + iput(new_inode); out_ctx: ceph_release_acl_sec_ctx(&as_ctx); dout("atomic_open result=%d\n", err); @@ -924,21 +957,24 @@ enum { * If we get a short result from the OSD, check against i_size; we need to * only return a short read to the caller if we hit EOF. */ -static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, - int *retry_op) +ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, + struct iov_iter *to, int *retry_op, + u64 *last_objver) { - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_client *osdc = &fsc->client->osdc; ssize_t ret; - u64 off = iocb->ki_pos; + u64 off = *ki_pos; u64 len = iov_iter_count(to); u64 i_size = i_size_read(inode); + bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); + u64 objver = 0; - dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, - (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + dout("sync_read on inode %p %llx~%llx\n", inode, *ki_pos, len); + + if (ceph_inode_is_shutdown(inode)) + return -EIO; if (!len) return 0; @@ -962,10 +998,21 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, bool more; int idx; size_t left; + struct ceph_osd_req_op *op; + u64 read_off = off; + u64 read_len = len; + + /* determine new offset/length if encrypted */ + ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len); + + dout("sync_read orig %llu~%llu reading %llu~%llu", + off, len, read_off, read_len); req = ceph_osdc_new_request(osdc, &ci->i_layout, - ci->i_vino, off, &len, 0, 1, - CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, + ci->i_vino, read_off, &read_len, 0, 1, + sparse ? CEPH_OSD_OP_SPARSE_READ : + CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { @@ -973,10 +1020,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, break; } + /* adjust len downward if the request truncated the len */ + if (off + len > read_off + read_len) + len = read_off + read_len - off; more = len < iov_iter_count(to); - num_pages = calc_pages_for(off, len); - page_off = off & ~PAGE_MASK; + num_pages = calc_pages_for(read_off, read_len); + page_off = offset_in_page(off); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) { ceph_osdc_put_request(req); @@ -984,29 +1034,75 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, break; } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off, + osd_req_op_extent_osd_data_pages(req, 0, pages, read_len, + offset_in_page(read_off), false, false); + + op = &req->r_ops[0]; + if (sparse) { + ret = ceph_alloc_sparse_ext_map(op); + if (ret) { + ceph_osdc_put_request(req); + break; + } + } + ceph_osdc_start_request(osdc, req); ret = ceph_osdc_wait_request(osdc, req); ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, - len, ret); + read_len, ret); - ceph_osdc_put_request(req); + if (ret > 0) + objver = req->r_version; i_size = i_size_read(inode); dout("sync_read %llu~%llu got %zd i_size %llu%s\n", off, len, ret, i_size, (more ? " MORE" : "")); - if (ret == -ENOENT) + /* Fix it to go to end of extent map */ + if (sparse && ret >= 0) + ret = ceph_sparse_ext_map_end(op); + else if (ret == -ENOENT) ret = 0; + + if (ret > 0 && IS_ENCRYPTED(inode)) { + int fret; + + fret = ceph_fscrypt_decrypt_extents(inode, pages, + read_off, op->extent.sparse_ext, + op->extent.sparse_ext_cnt); + if (fret < 0) { + ret = fret; + ceph_osdc_put_request(req); + break; + } + + /* account for any partial block at the beginning */ + fret -= (off - read_off); + + /* + * Short read after big offset adjustment? + * Nothing is usable, just call it a zero + * len read. + */ + fret = max(fret, 0); + + /* account for partial block at the end */ + ret = min_t(ssize_t, fret, len); + } + + ceph_osdc_put_request(req); + + /* Short read but not EOF? Zero out the remainder. */ if (ret >= 0 && ret < len && (off + ret < i_size)) { int zlen = min(len - ret, i_size - off - ret); int zoff = page_off + ret; + dout("sync_read zero gap %llu~%llu\n", - off + ret, off + ret + zlen); + off + ret, off + ret + zlen); ceph_zero_page_vector_range(zoff, zlen, pages); ret += zlen; } @@ -1014,15 +1110,16 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, idx = 0; left = ret > 0 ? ret : 0; while (left > 0) { - size_t len, copied; - page_off = off & ~PAGE_MASK; - len = min_t(size_t, left, PAGE_SIZE - page_off); + size_t plen, copied; + + plen = min_t(size_t, left, PAGE_SIZE - page_off); SetPageUptodate(pages[idx]); copied = copy_page_to_iter(pages[idx++], - page_off, len, to); + page_off, plen, to); off += copied; left -= copied; - if (copied < len) { + page_off = 0; + if (copied < plen) { ret = -EFAULT; break; } @@ -1039,21 +1136,37 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, break; } - if (off > iocb->ki_pos) { - if (off >= i_size) { - *retry_op = CHECK_EOF; - ret = i_size - iocb->ki_pos; - iocb->ki_pos = i_size; - } else { - ret = off - iocb->ki_pos; - iocb->ki_pos = off; + if (ret > 0) { + if (off > *ki_pos) { + if (off >= i_size) { + *retry_op = CHECK_EOF; + ret = i_size - *ki_pos; + *ki_pos = i_size; + } else { + ret = off - *ki_pos; + *ki_pos = off; + } } - } + if (last_objver) + *last_objver = objver; + } dout("sync_read result %zd retry_op %d\n", ret, *retry_op); return ret; } +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, + int *retry_op) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + + dout("sync_read on file %p %llx~%zx %s\n", file, iocb->ki_pos, + iov_iter_count(to), (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); + + return __ceph_sync_read(inode, &iocb->ki_pos, to, retry_op, NULL); +} + struct ceph_aio_request { struct kiocb *iocb; size_t total_len; @@ -1125,8 +1238,10 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) struct inode *inode = req->r_inode; struct ceph_aio_request *aio_req = req->r_priv; struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); + struct ceph_osd_req_op *op = &req->r_ops[0]; struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric; unsigned int len = osd_data->bvec_pos.iter.bi_size; + bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ); BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); BUG_ON(!osd_data->num_bvecs); @@ -1147,6 +1262,8 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) } rc = -ENOMEM; } else if (!aio_req->write) { + if (sparse && rc >= 0) + rc = ceph_sparse_ext_map_end(op); if (rc == -ENOENT) rc = 0; if (rc >= 0 && len > rc) { @@ -1283,6 +1400,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, loff_t pos = iocb->ki_pos; bool write = iov_iter_rw(iter) == WRITE; bool should_dirty = !write && user_backed_iter(iter); + bool sparse = ceph_test_mount_opt(fsc, SPARSEREAD); if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; @@ -1310,6 +1428,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, while (iov_iter_count(iter) > 0) { u64 size = iov_iter_count(iter); ssize_t len; + struct ceph_osd_req_op *op; + int readop = sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ; if (write) size = min_t(u64, size, fsc->mount_options->wsize); @@ -1320,8 +1440,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, pos, &size, 0, 1, - write ? CEPH_OSD_OP_WRITE : - CEPH_OSD_OP_READ, + write ? CEPH_OSD_OP_WRITE : readop, flags, snapc, ci->i_truncate_seq, ci->i_truncate_size, @@ -1372,6 +1491,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, } osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); + op = &req->r_ops[0]; + if (sparse) { + ret = ceph_alloc_sparse_ext_map(op); + if (ret) { + ceph_osdc_put_request(req); + break; + } + } if (aio_req) { aio_req->total_len += len; @@ -1399,8 +1526,11 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, size = i_size_read(inode); if (!write) { - if (ret == -ENOENT) + if (sparse && ret >= 0) + ret = ceph_sparse_ext_map_end(op); + else if (ret == -ENOENT) ret = 0; + if (ret >= 0 && ret < len && pos + ret < size) { struct iov_iter i; int zlen = min_t(size_t, len - ret, @@ -1481,13 +1611,12 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_vino vino; + struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_osd_request *req; struct page **pages; u64 len; int num_pages; int written = 0; - int flags; int ret; bool check_caps = false; struct timespec64 mtime = current_time(inode); @@ -1505,79 +1634,350 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, return ret; ceph_fscache_invalidate(inode, false); - ret = invalidate_inode_pages2_range(inode->i_mapping, - pos >> PAGE_SHIFT, - (pos + count - 1) >> PAGE_SHIFT); - if (ret < 0) - dout("invalidate_inode_pages2_range returned %d\n", ret); - - flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; while ((len = iov_iter_count(from)) > 0) { size_t left; int n; + u64 write_pos = pos; + u64 write_len = len; + u64 objnum, objoff; + u32 xlen; + u64 assert_ver = 0; + bool rmw; + bool first, last; + struct iov_iter saved_iter = *from; + size_t off; - vino = ceph_vino(inode); - req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, 0, 1, - CEPH_OSD_OP_WRITE, flags, snapc, - ci->i_truncate_seq, - ci->i_truncate_size, - false); - if (IS_ERR(req)) { - ret = PTR_ERR(req); - break; - } + ceph_fscrypt_adjust_off_and_len(inode, &write_pos, &write_len); + + /* clamp the length to the end of first object */ + ceph_calc_file_object_mapping(&ci->i_layout, write_pos, + write_len, &objnum, &objoff, + &xlen); + write_len = xlen; + + /* adjust len downward if it goes beyond current object */ + if (pos + len > write_pos + write_len) + len = write_pos + write_len - pos; /* - * write from beginning of first page, - * regardless of io alignment + * If we had to adjust the length or position to align with a + * crypto block, then we must do a read/modify/write cycle. We + * use a version assertion to redrive the thing if something + * changes in between. */ - num_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + first = pos != write_pos; + last = (pos + len) != (write_pos + write_len); + rmw = first || last; + dout("sync_write ino %llx %lld~%llu adjusted %lld~%llu -- %srmw\n", + ci->i_vino.ino, pos, len, write_pos, write_len, + rmw ? "" : "no "); + + /* + * The data is emplaced into the page as it would be if it were + * in an array of pagecache pages. + */ + num_pages = calc_pages_for(write_pos, write_len); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) { ret = PTR_ERR(pages); - goto out; + break; + } + + /* Do we need to preload the pages? */ + if (rmw) { + u64 first_pos = write_pos; + u64 last_pos = (write_pos + write_len) - CEPH_FSCRYPT_BLOCK_SIZE; + u64 read_len = CEPH_FSCRYPT_BLOCK_SIZE; + struct ceph_osd_req_op *op; + + /* We should only need to do this for encrypted inodes */ + WARN_ON_ONCE(!IS_ENCRYPTED(inode)); + + /* No need to do two reads if first and last blocks are same */ + if (first && last_pos == first_pos) + last = false; + + /* + * Allocate a read request for one or two extents, + * depending on how the request was aligned. + */ + req = ceph_osdc_new_request(osdc, &ci->i_layout, + ci->i_vino, first ? first_pos : last_pos, + &read_len, 0, (first && last) ? 2 : 1, + CEPH_OSD_OP_SPARSE_READ, CEPH_OSD_FLAG_READ, + NULL, ci->i_truncate_seq, + ci->i_truncate_size, false); + if (IS_ERR(req)) { + ceph_release_page_vector(pages, num_pages); + ret = PTR_ERR(req); + break; + } + + /* Something is misaligned! */ + if (read_len != CEPH_FSCRYPT_BLOCK_SIZE) { + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + ret = -EIO; + break; + } + + /* Add extent for first block? */ + op = &req->r_ops[0]; + + if (first) { + osd_req_op_extent_osd_data_pages(req, 0, pages, + CEPH_FSCRYPT_BLOCK_SIZE, + offset_in_page(first_pos), + false, false); + /* We only expect a single extent here */ + ret = __ceph_alloc_sparse_ext_map(op, 1); + if (ret) { + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + } + + /* Add extent for last block */ + if (last) { + /* Init the other extent if first extent has been used */ + if (first) { + op = &req->r_ops[1]; + osd_req_op_extent_init(req, 1, + CEPH_OSD_OP_SPARSE_READ, + last_pos, CEPH_FSCRYPT_BLOCK_SIZE, + ci->i_truncate_size, + ci->i_truncate_seq); + } + + ret = __ceph_alloc_sparse_ext_map(op, 1); + if (ret) { + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + + osd_req_op_extent_osd_data_pages(req, first ? 1 : 0, + &pages[num_pages - 1], + CEPH_FSCRYPT_BLOCK_SIZE, + offset_in_page(last_pos), + false, false); + } + + ceph_osdc_start_request(osdc, req); + ret = ceph_osdc_wait_request(osdc, req); + + /* FIXME: length field is wrong if there are 2 extents */ + ceph_update_read_metrics(&fsc->mdsc->metric, + req->r_start_latency, + req->r_end_latency, + read_len, ret); + + /* Ok if object is not already present */ + if (ret == -ENOENT) { + /* + * If there is no object, then we can't assert + * on its version. Set it to 0, and we'll use an + * exclusive create instead. + */ + ceph_osdc_put_request(req); + ret = 0; + + /* + * zero out the soon-to-be uncopied parts of the + * first and last pages. + */ + if (first) + zero_user_segment(pages[0], 0, + offset_in_page(first_pos)); + if (last) + zero_user_segment(pages[num_pages - 1], + offset_in_page(last_pos), + PAGE_SIZE); + } else { + if (ret < 0) { + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + + op = &req->r_ops[0]; + if (op->extent.sparse_ext_cnt == 0) { + if (first) + zero_user_segment(pages[0], 0, + offset_in_page(first_pos)); + else + zero_user_segment(pages[num_pages - 1], + offset_in_page(last_pos), + PAGE_SIZE); + } else if (op->extent.sparse_ext_cnt != 1 || + ceph_sparse_ext_map_end(op) != + CEPH_FSCRYPT_BLOCK_SIZE) { + ret = -EIO; + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + + if (first && last) { + op = &req->r_ops[1]; + if (op->extent.sparse_ext_cnt == 0) { + zero_user_segment(pages[num_pages - 1], + offset_in_page(last_pos), + PAGE_SIZE); + } else if (op->extent.sparse_ext_cnt != 1 || + ceph_sparse_ext_map_end(op) != + CEPH_FSCRYPT_BLOCK_SIZE) { + ret = -EIO; + ceph_osdc_put_request(req); + ceph_release_page_vector(pages, num_pages); + break; + } + } + + /* Grab assert version. It must be non-zero. */ + assert_ver = req->r_version; + WARN_ON_ONCE(ret > 0 && assert_ver == 0); + + ceph_osdc_put_request(req); + if (first) { + ret = ceph_fscrypt_decrypt_block_inplace(inode, + pages[0], CEPH_FSCRYPT_BLOCK_SIZE, + offset_in_page(first_pos), + first_pos >> CEPH_FSCRYPT_BLOCK_SHIFT); + if (ret < 0) { + ceph_release_page_vector(pages, num_pages); + break; + } + } + if (last) { + ret = ceph_fscrypt_decrypt_block_inplace(inode, + pages[num_pages - 1], + CEPH_FSCRYPT_BLOCK_SIZE, + offset_in_page(last_pos), + last_pos >> CEPH_FSCRYPT_BLOCK_SHIFT); + if (ret < 0) { + ceph_release_page_vector(pages, num_pages); + break; + } + } + } } left = len; + off = offset_in_page(pos); for (n = 0; n < num_pages; n++) { - size_t plen = min_t(size_t, left, PAGE_SIZE); - ret = copy_page_from_iter(pages[n], 0, plen, from); + size_t plen = min_t(size_t, left, PAGE_SIZE - off); + + /* copy the data */ + ret = copy_page_from_iter(pages[n], off, plen, from); if (ret != plen) { ret = -EFAULT; break; } + off = 0; left -= ret; } - if (ret < 0) { + dout("sync_write write failed with %d\n", ret); ceph_release_page_vector(pages, num_pages); - goto out; + break; } + if (IS_ENCRYPTED(inode)) { + ret = ceph_fscrypt_encrypt_pages(inode, pages, + write_pos, write_len, + GFP_KERNEL); + if (ret < 0) { + dout("encryption failed with %d\n", ret); + ceph_release_page_vector(pages, num_pages); + break; + } + } + + req = ceph_osdc_new_request(osdc, &ci->i_layout, + ci->i_vino, write_pos, &write_len, + rmw ? 1 : 0, rmw ? 2 : 1, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE, + snapc, ci->i_truncate_seq, + ci->i_truncate_size, false); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + ceph_release_page_vector(pages, num_pages); + break; + } + + dout("sync_write write op %lld~%llu\n", write_pos, write_len); + osd_req_op_extent_osd_data_pages(req, rmw ? 1 : 0, pages, write_len, + offset_in_page(write_pos), false, + true); req->r_inode = inode; - - osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, - false, true); - req->r_mtime = mtime; - ceph_osdc_start_request(&fsc->client->osdc, req); - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + + /* Set up the assertion */ + if (rmw) { + /* + * Set up the assertion. If we don't have a version + * number, then the object doesn't exist yet. Use an + * exclusive create instead of a version assertion in + * that case. + */ + if (assert_ver) { + osd_req_op_init(req, 0, CEPH_OSD_OP_ASSERT_VER, 0); + req->r_ops[0].assert_ver.ver = assert_ver; + } else { + osd_req_op_init(req, 0, CEPH_OSD_OP_CREATE, + CEPH_OSD_OP_FLAG_EXCL); + } + } + + ceph_osdc_start_request(osdc, req); + ret = ceph_osdc_wait_request(osdc, req); ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, len, ret); -out: ceph_osdc_put_request(req); if (ret != 0) { + dout("sync_write osd write returned %d\n", ret); + /* Version changed! Must re-do the rmw cycle */ + if ((assert_ver && (ret == -ERANGE || ret == -EOVERFLOW)) || + (!assert_ver && ret == -EEXIST)) { + /* We should only ever see this on a rmw */ + WARN_ON_ONCE(!rmw); + + /* The version should never go backward */ + WARN_ON_ONCE(ret == -EOVERFLOW); + + *from = saved_iter; + + /* FIXME: limit number of times we loop? */ + continue; + } ceph_set_error_write(ci); break; } ceph_clear_error_write(ci); + + /* + * We successfully wrote to a range of the file. Declare + * that region of the pagecache invalid. + */ + ret = invalidate_inode_pages2_range( + inode->i_mapping, + pos >> PAGE_SHIFT, + (pos + len - 1) >> PAGE_SHIFT); + if (ret < 0) { + dout("invalidate_inode_pages2_range returned %d\n", + ret); + ret = 0; + } pos += len; written += len; + dout("sync_write written %d\n", written); if (pos > i_size_read(inode)) { check_caps = ceph_inode_set_size(inode, pos); if (check_caps) @@ -1591,6 +1991,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, ret = written; iocb->ki_pos = pos; } + dout("sync_write returning %d\n", ret); return ret; } @@ -1648,7 +2049,9 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) ceph_cap_string(got)); if (!ceph_has_inline_data(ci)) { - if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { + if (!retry_op && + (iocb->ki_flags & IOCB_DIRECT) && + !IS_ENCRYPTED(inode)) { ret = ceph_direct_read_write(iocb, to, NULL, NULL); if (ret >= 0 && ret < len) @@ -1934,7 +2337,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) /* we might need to revert back to that point */ data = *from; - if (iocb->ki_flags & IOCB_DIRECT) + if ((iocb->ki_flags & IOCB_DIRECT) && !IS_ENCRYPTED(inode)) written = ceph_direct_read_write(iocb, &data, snapc, &prealloc_cf); else @@ -2165,6 +2568,9 @@ static long ceph_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; + if (IS_ENCRYPTED(inode)) + return -EOPNOTSUPP; + prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return -ENOMEM; @@ -2486,6 +2892,10 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, return -EOPNOTSUPP; } + /* Every encrypted inode gets its own key, so we can't offload them */ + if (IS_ENCRYPTED(src_inode) || IS_ENCRYPTED(dst_inode)) + return -EOPNOTSUPP; + if (len < src_ci->i_layout.object_size) return -EOPNOTSUPP; /* no remote copy will be done */ diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index fd05d68e2990..800ab7920513 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -14,10 +14,12 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" #include "cache.h" +#include "crypto.h" #include /* @@ -33,6 +35,7 @@ */ static const struct inode_operations ceph_symlink_iops; +static const struct inode_operations ceph_encrypted_symlink_iops; static void ceph_inode_work(struct work_struct *work); @@ -52,17 +55,99 @@ static int ceph_set_ino_cb(struct inode *inode, void *data) return 0; } -struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) +/** + * ceph_new_inode - allocate a new inode in advance of an expected create + * @dir: parent directory for new inode + * @dentry: dentry that may eventually point to new inode + * @mode: mode of new inode + * @as_ctx: pointer to inherited security context + * + * Allocate a new inode in advance of an operation to create a new inode. + * This allocates the inode and sets up the acl_sec_ctx with appropriate + * info for the new inode. + * + * Returns a pointer to the new inode or an ERR_PTR. + */ +struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, + umode_t *mode, struct ceph_acl_sec_ctx *as_ctx) +{ + int err; + struct inode *inode; + + inode = new_inode(dir->i_sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (!S_ISLNK(*mode)) { + err = ceph_pre_init_acls(dir, mode, as_ctx); + if (err < 0) + goto out_err; + } + + inode->i_state = 0; + inode->i_mode = *mode; + + err = ceph_security_init_secctx(dentry, *mode, as_ctx); + if (err < 0) + goto out_err; + + /* + * We'll skip setting fscrypt context for snapshots, leaving that for + * the handle_reply(). + */ + if (ceph_snap(dir) != CEPH_SNAPDIR) { + err = ceph_fscrypt_prepare_context(dir, inode, as_ctx); + if (err) + goto out_err; + } + + return inode; +out_err: + iput(inode); + return ERR_PTR(err); +} + +void ceph_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as_ctx) +{ + if (as_ctx->pagelist) { + req->r_pagelist = as_ctx->pagelist; + as_ctx->pagelist = NULL; + } + ceph_fscrypt_as_ctx_to_req(req, as_ctx); +} + +/** + * ceph_get_inode - find or create/hash a new inode + * @sb: superblock to search and allocate in + * @vino: vino to search for + * @newino: optional new inode to insert if one isn't found (may be NULL) + * + * Search for or insert a new inode into the hash for the given vino, and + * return a reference to it. If new is non-NULL, its reference is consumed. + */ +struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino, + struct inode *newino) { struct inode *inode; if (ceph_vino_is_reserved(vino)) return ERR_PTR(-EREMOTEIO); - inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare, - ceph_set_ino_cb, &vino); - if (!inode) + if (newino) { + inode = inode_insert5(newino, (unsigned long)vino.ino, + ceph_ino_compare, ceph_set_ino_cb, &vino); + if (inode != newino) + iput(newino); + } else { + inode = iget5_locked(sb, (unsigned long)vino.ino, + ceph_ino_compare, ceph_set_ino_cb, &vino); + } + + if (!inode) { + dout("No inode found for %llx.%llx\n", vino.ino, vino.snap); return ERR_PTR(-ENOMEM); + } dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode), ceph_vinop(inode), inode, !!(inode->i_state & I_NEW)); @@ -78,8 +163,9 @@ struct inode *ceph_get_snapdir(struct inode *parent) .ino = ceph_ino(parent), .snap = CEPH_SNAPDIR, }; - struct inode *inode = ceph_get_inode(parent->i_sb, vino); + struct inode *inode = ceph_get_inode(parent->i_sb, vino, NULL); struct ceph_inode_info *ci = ceph_inode(inode); + int ret = -ENOTDIR; if (IS_ERR(inode)) return inode; @@ -105,6 +191,24 @@ struct inode *ceph_get_snapdir(struct inode *parent) ci->i_rbytes = 0; ci->i_btime = ceph_inode(parent)->i_btime; +#ifdef CONFIG_FS_ENCRYPTION + /* if encrypted, just borrow fscrypt_auth from parent */ + if (IS_ENCRYPTED(parent)) { + struct ceph_inode_info *pci = ceph_inode(parent); + + ci->fscrypt_auth = kmemdup(pci->fscrypt_auth, + pci->fscrypt_auth_len, + GFP_KERNEL); + if (ci->fscrypt_auth) { + inode->i_flags |= S_ENCRYPTED; + ci->fscrypt_auth_len = pci->fscrypt_auth_len; + } else { + dout("Failed to alloc snapdir fscrypt_auth\n"); + ret = -ENOMEM; + goto err; + } + } +#endif if (inode->i_state & I_NEW) { inode->i_op = &ceph_snapdir_iops; inode->i_fop = &ceph_snapdir_fops; @@ -118,7 +222,7 @@ struct inode *ceph_get_snapdir(struct inode *parent) discard_new_inode(inode); else iput(inode); - return ERR_PTR(-ENOTDIR); + return ERR_PTR(ret); } const struct inode_operations ceph_file_iops = { @@ -517,6 +621,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_truncate_seq = 0; ci->i_truncate_size = 0; ci->i_truncate_pending = 0; + ci->i_truncate_pagecache_size = 0; ci->i_max_size = 0; ci->i_reported_size = 0; @@ -547,6 +652,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_WORK(&ci->i_work, ceph_inode_work); ci->i_work_mask = 0; memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); +#ifdef CONFIG_FS_ENCRYPTION + ci->fscrypt_auth = NULL; + ci->fscrypt_auth_len = 0; +#endif return &ci->netfs.inode; } @@ -555,6 +664,10 @@ void ceph_free_inode(struct inode *inode) struct ceph_inode_info *ci = ceph_inode(inode); kfree(ci->i_symlink); +#ifdef CONFIG_FS_ENCRYPTION + kfree(ci->fscrypt_auth); +#endif + fscrypt_free_inode(inode); kmem_cache_free(ceph_inode_cachep, ci); } @@ -575,6 +688,7 @@ void ceph_evict_inode(struct inode *inode) clear_inode(inode); ceph_fscache_unregister_inode_cookie(ci); + fscrypt_put_encryption_info(inode); __ceph_remove_caps(ci); @@ -650,7 +764,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, ceph_fscache_update(inode); ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { - dout("truncate_seq %u -> %u\n", + dout("%s truncate_seq %u -> %u\n", __func__, ci->i_truncate_seq, truncate_seq); ci->i_truncate_seq = truncate_seq; @@ -674,11 +788,26 @@ int ceph_fill_file_size(struct inode *inode, int issued, } } } - if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 && - ci->i_truncate_size != truncate_size) { - dout("truncate_size %lld -> %llu\n", ci->i_truncate_size, - truncate_size); + + /* + * It's possible that the new sizes of the two consecutive + * size truncations will be in the same fscrypt last block, + * and we need to truncate the corresponding page caches + * anyway. + */ + if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0) { + dout("%s truncate_size %lld -> %llu, encrypted %d\n", __func__, + ci->i_truncate_size, truncate_size, !!IS_ENCRYPTED(inode)); + ci->i_truncate_size = truncate_size; + + if (IS_ENCRYPTED(inode)) { + dout("%s truncate_pagecache_size %lld -> %llu\n", + __func__, ci->i_truncate_pagecache_size, size); + ci->i_truncate_pagecache_size = size; + } else { + ci->i_truncate_pagecache_size = truncate_size; + } } return queue_trunc; } @@ -752,6 +881,34 @@ void ceph_fill_file_time(struct inode *inode, int issued, inode, time_warp_seq, ci->i_time_warp_seq); } +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +static int decode_encrypted_symlink(const char *encsym, int enclen, u8 **decsym) +{ + int declen; + u8 *sym; + + sym = kmalloc(enclen + 1, GFP_NOFS); + if (!sym) + return -ENOMEM; + + declen = ceph_base64_decode(encsym, enclen, sym); + if (declen < 0) { + pr_err("%s: can't decode symlink (%d). Content: %.*s\n", + __func__, declen, enclen, encsym); + kfree(sym); + return -EIO; + } + sym[declen + 1] = '\0'; + *decsym = sym; + return declen; +} +#else +static int decode_encrypted_symlink(const char *encsym, int symlen, u8 **decsym) +{ + return -EOPNOTSUPP; +} +#endif + /* * Populate an inode based on info from mds. May be called on new or * existing inodes. @@ -857,15 +1014,20 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, issued |= __ceph_caps_dirty(ci); new_issued = ~issued & info_caps; - /* directories have fl_stripe_unit set to zero */ - if (le32_to_cpu(info->layout.fl_stripe_unit)) - inode->i_blkbits = - fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; - else - inode->i_blkbits = CEPH_BLOCK_SHIFT; - __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); +#ifdef CONFIG_FS_ENCRYPTION + if (iinfo->fscrypt_auth_len && + ((inode->i_state & I_NEW) || (ci->fscrypt_auth_len == 0))) { + kfree(ci->fscrypt_auth); + ci->fscrypt_auth_len = iinfo->fscrypt_auth_len; + ci->fscrypt_auth = iinfo->fscrypt_auth; + iinfo->fscrypt_auth = NULL; + iinfo->fscrypt_auth_len = 0; + inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED); + } +#endif + if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && (issued & CEPH_CAP_AUTH_EXCL) == 0) { inode->i_mode = mode; @@ -878,6 +1040,15 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime); } + /* directories have fl_stripe_unit set to zero */ + if (IS_ENCRYPTED(inode)) + inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT; + else if (le32_to_cpu(info->layout.fl_stripe_unit)) + inode->i_blkbits = + fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + else + inode->i_blkbits = CEPH_BLOCK_SHIFT; + if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) && (issued & CEPH_CAP_LINK_EXCL) == 0) set_nlink(inode, le32_to_cpu(info->nlink)); @@ -899,6 +1070,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, if (new_version || (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + u64 size = le64_to_cpu(info->size); s64 old_pool = ci->i_layout.pool_id; struct ceph_string *old_ns; @@ -912,10 +1084,22 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, pool_ns = old_ns; + if (IS_ENCRYPTED(inode) && size && + iinfo->fscrypt_file_len == sizeof(__le64)) { + u64 fsize = __le64_to_cpu(*(__le64 *)iinfo->fscrypt_file); + + if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) { + size = fsize; + } else { + pr_warn("fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n", + info->size, size); + } + } + queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(info->truncate_seq), le64_to_cpu(info->truncate_size), - le64_to_cpu(info->size)); + size); /* only update max_size on auth cap */ if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && ci->i_max_size != le64_to_cpu(info->max_size)) { @@ -975,26 +1159,42 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, inode->i_fop = &ceph_file_fops; break; case S_IFLNK: - inode->i_op = &ceph_symlink_iops; if (!ci->i_symlink) { u32 symlen = iinfo->symlink_len; char *sym; spin_unlock(&ci->i_ceph_lock); - if (symlen != i_size_read(inode)) { - pr_err("%s %llx.%llx BAD symlink " - "size %lld\n", __func__, - ceph_vinop(inode), - i_size_read(inode)); + if (IS_ENCRYPTED(inode)) { + if (symlen != i_size_read(inode)) + pr_err("%s %llx.%llx BAD symlink size %lld\n", + __func__, ceph_vinop(inode), + i_size_read(inode)); + + err = decode_encrypted_symlink(iinfo->symlink, + symlen, (u8 **)&sym); + if (err < 0) { + pr_err("%s decoding encrypted symlink failed: %d\n", + __func__, err); + goto out; + } + symlen = err; i_size_write(inode, symlen); inode->i_blocks = calc_inode_blocks(symlen); - } + } else { + if (symlen != i_size_read(inode)) { + pr_err("%s %llx.%llx BAD symlink size %lld\n", + __func__, ceph_vinop(inode), + i_size_read(inode)); + i_size_write(inode, symlen); + inode->i_blocks = calc_inode_blocks(symlen); + } - err = -ENOMEM; - sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); - if (!sym) - goto out; + err = -ENOMEM; + sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); + if (!sym) + goto out; + } spin_lock(&ci->i_ceph_lock); if (!ci->i_symlink) @@ -1002,7 +1202,17 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, else kfree(sym); /* lost a race */ } - inode->i_link = ci->i_symlink; + + if (IS_ENCRYPTED(inode)) { + /* + * Encrypted symlinks need to be decrypted before we can + * cache their targets in i_link. Don't touch it here. + */ + inode->i_op = &ceph_encrypted_symlink_iops; + } else { + inode->i_link = ci->i_symlink; + inode->i_op = &ceph_symlink_iops; + } break; case S_IFDIR: inode->i_op = &ceph_dir_iops; @@ -1310,8 +1520,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { + bool is_nokey = false; struct qstr dname; struct dentry *dn, *parent; + struct fscrypt_str oname = FSTR_INIT(NULL, 0); + struct ceph_fname fname = { .dir = dir, + .name = rinfo->dname, + .ctext = rinfo->altname, + .name_len = rinfo->dname_len, + .ctext_len = rinfo->altname_len }; BUG_ON(!rinfo->head->is_target); BUG_ON(req->r_dentry); @@ -1319,8 +1536,20 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) parent = d_find_any_alias(dir); BUG_ON(!parent); - dname.name = rinfo->dname; - dname.len = rinfo->dname_len; + err = ceph_fname_alloc_buffer(dir, &oname); + if (err < 0) { + dput(parent); + goto done; + } + + err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey); + if (err < 0) { + dput(parent); + ceph_fname_free_buffer(dir, &oname); + goto done; + } + dname.name = oname.name; + dname.len = oname.len; dname.hash = full_name_hash(parent, dname.name, dname.len); tvino.ino = le64_to_cpu(rinfo->targeti.in->ino); tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid); @@ -1335,9 +1564,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) dname.len, dname.name, dn); if (!dn) { dput(parent); + ceph_fname_free_buffer(dir, &oname); err = -ENOMEM; goto done; } + if (is_nokey) { + spin_lock(&dn->d_lock); + dn->d_flags |= DCACHE_NOKEY_NAME; + spin_unlock(&dn->d_lock); + } err = 0; } else if (d_really_is_positive(dn) && (ceph_ino(d_inode(dn)) != tvino.ino || @@ -1349,6 +1584,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) dput(dn); goto retry_lookup; } + ceph_fname_free_buffer(dir, &oname); req->r_dentry = dn; dput(parent); @@ -1552,7 +1788,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, vino.ino = le64_to_cpu(rde->inode.in->ino); vino.snap = le64_to_cpu(rde->inode.in->snapid); - in = ceph_get_inode(req->r_dentry->d_sb, vino); + in = ceph_get_inode(req->r_dentry->d_sb, vino, NULL); if (IS_ERR(in)) { err = PTR_ERR(in); dout("new_inode badness got %d\n", err); @@ -1630,7 +1866,8 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { struct dentry *parent = req->r_dentry; - struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); + struct inode *inode = d_inode(parent); + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct qstr dname; struct dentry *dn; @@ -1704,9 +1941,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, tvino.snap = le64_to_cpu(rde->inode.in->snapid); if (rinfo->hash_order) { - u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, - rde->name, rde->name_len); - hash = ceph_frag_value(hash); + u32 hash = ceph_frag_value(rde->raw_hash); if (hash != last_hash) fpos_offset = 2; last_hash = hash; @@ -1729,6 +1964,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, err = -ENOMEM; goto out; } + if (rde->is_nokey) { + spin_lock(&dn->d_lock); + dn->d_flags |= DCACHE_NOKEY_NAME; + spin_unlock(&dn->d_lock); + } } else if (d_really_is_positive(dn) && (ceph_ino(d_inode(dn)) != tvino.ino || ceph_snap(d_inode(dn)) != tvino.snap)) { @@ -1754,7 +1994,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, if (d_really_is_positive(dn)) { in = d_inode(dn); } else { - in = ceph_get_inode(parent->d_sb, tvino); + in = ceph_get_inode(parent->d_sb, tvino, NULL); if (IS_ERR(in)) { dout("new_inode badness\n"); d_drop(dn); @@ -1927,7 +2167,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) retry: spin_lock(&ci->i_ceph_lock); if (ci->i_truncate_pending == 0) { - dout("__do_pending_vmtruncate %p none pending\n", inode); + dout("%s %p none pending\n", __func__, inode); spin_unlock(&ci->i_ceph_lock); mutex_unlock(&ci->i_truncate_mutex); return; @@ -1939,8 +2179,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) */ if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { spin_unlock(&ci->i_ceph_lock); - dout("__do_pending_vmtruncate %p flushing snaps first\n", - inode); + dout("%s %p flushing snaps first\n", __func__, inode); filemap_write_and_wait_range(&inode->i_data, 0, inode->i_sb->s_maxbytes); goto retry; @@ -1949,9 +2188,9 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) /* there should be no reader or writer */ WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref); - to = ci->i_truncate_size; + to = ci->i_truncate_pagecache_size; wrbuffer_refs = ci->i_wrbuffer_ref; - dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode, + dout("%s %p (%d) to %lld\n", __func__, inode, ci->i_truncate_pending, to); spin_unlock(&ci->i_ceph_lock); @@ -1959,7 +2198,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode) truncate_pagecache(inode, to); spin_lock(&ci->i_ceph_lock); - if (to == ci->i_truncate_size) { + if (to == ci->i_truncate_pagecache_size) { ci->i_truncate_pending = 0; finish = 1; } @@ -2000,6 +2239,32 @@ static void ceph_inode_work(struct work_struct *work) iput(inode); } +static const char *ceph_encrypted_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (!dentry) + return ERR_PTR(-ECHILD); + + return fscrypt_get_symlink(inode, ci->i_symlink, i_size_read(inode), + done); +} + +static int ceph_encrypted_symlink_getattr(struct mnt_idmap *idmap, + const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + int ret; + + ret = ceph_getattr(idmap, path, stat, request_mask, query_flags); + if (ret) + return ret; + return fscrypt_symlink_getattr(path, stat); +} + /* * symlinks */ @@ -2010,20 +2275,173 @@ static const struct inode_operations ceph_symlink_iops = { .listxattr = ceph_listxattr, }; -int __ceph_setattr(struct inode *inode, struct iattr *attr) +static const struct inode_operations ceph_encrypted_symlink_iops = { + .get_link = ceph_encrypted_get_link, + .setattr = ceph_setattr, + .getattr = ceph_encrypted_symlink_getattr, + .listxattr = ceph_listxattr, +}; + +/* + * Transfer the encrypted last block to the MDS and the MDS + * will help update it when truncating a smaller size. + * + * We don't support a PAGE_SIZE that is smaller than the + * CEPH_FSCRYPT_BLOCK_SIZE. + */ +static int fill_fscrypt_truncate(struct inode *inode, + struct ceph_mds_request *req, + struct iattr *attr) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE; + loff_t pos, orig_pos = round_down(attr->ia_size, + CEPH_FSCRYPT_BLOCK_SIZE); + u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT; + struct ceph_pagelist *pagelist = NULL; + struct kvec iov = {0}; + struct iov_iter iter; + struct page *page = NULL; + struct ceph_fscrypt_truncate_size_header header; + int retry_op = 0; + int len = CEPH_FSCRYPT_BLOCK_SIZE; + loff_t i_size = i_size_read(inode); + int got, ret, issued; + u64 objver; + + ret = __ceph_get_caps(inode, NULL, CEPH_CAP_FILE_RD, 0, -1, &got); + if (ret < 0) + return ret; + + issued = __ceph_caps_issued(ci, NULL); + + dout("%s size %lld -> %lld got cap refs on %s, issued %s\n", __func__, + i_size, attr->ia_size, ceph_cap_string(got), + ceph_cap_string(issued)); + + /* Try to writeback the dirty pagecaches */ + if (issued & (CEPH_CAP_FILE_BUFFER)) { + loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1; + + ret = filemap_write_and_wait_range(inode->i_mapping, + orig_pos, lend); + if (ret < 0) + goto out; + } + + page = __page_cache_alloc(GFP_KERNEL); + if (page == NULL) { + ret = -ENOMEM; + goto out; + } + + pagelist = ceph_pagelist_alloc(GFP_KERNEL); + if (!pagelist) { + ret = -ENOMEM; + goto out; + } + + iov.iov_base = kmap_local_page(page); + iov.iov_len = len; + iov_iter_kvec(&iter, READ, &iov, 1, len); + + pos = orig_pos; + ret = __ceph_sync_read(inode, &pos, &iter, &retry_op, &objver); + if (ret < 0) + goto out; + + /* Insert the header first */ + header.ver = 1; + header.compat = 1; + header.change_attr = cpu_to_le64(inode_peek_iversion_raw(inode)); + + /* + * Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE, + * because in MDS it may need this to do the truncate. + */ + header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE); + + /* + * If we hit a hole here, we should just skip filling + * the fscrypt for the request, because once the fscrypt + * is enabled, the file will be split into many blocks + * with the size of CEPH_FSCRYPT_BLOCK_SIZE, if there + * has a hole, the hole size should be multiple of block + * size. + * + * If the Rados object doesn't exist, it will be set to 0. + */ + if (!objver) { + dout("%s hit hole, ppos %lld < size %lld\n", __func__, + pos, i_size); + + header.data_len = cpu_to_le32(8 + 8 + 4); + header.file_offset = 0; + ret = 0; + } else { + header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE); + header.file_offset = cpu_to_le64(orig_pos); + + dout("%s encrypt block boff/bsize %d/%lu\n", __func__, + boff, CEPH_FSCRYPT_BLOCK_SIZE); + + /* truncate and zero out the extra contents for the last block */ + memset(iov.iov_base + boff, 0, PAGE_SIZE - boff); + + /* encrypt the last block */ + ret = ceph_fscrypt_encrypt_block_inplace(inode, page, + CEPH_FSCRYPT_BLOCK_SIZE, + 0, block, + GFP_KERNEL); + if (ret) + goto out; + } + + /* Insert the header */ + ret = ceph_pagelist_append(pagelist, &header, sizeof(header)); + if (ret) + goto out; + + if (header.block_size) { + /* Append the last block contents to pagelist */ + ret = ceph_pagelist_append(pagelist, iov.iov_base, + CEPH_FSCRYPT_BLOCK_SIZE); + if (ret) + goto out; + } + req->r_pagelist = pagelist; +out: + dout("%s %p size dropping cap refs on %s\n", __func__, + inode, ceph_cap_string(got)); + ceph_put_cap_refs(ci, got); + if (iov.iov_base) + kunmap_local(iov.iov_base); + if (page) + __free_pages(page, 0); + if (ret && pagelist) + ceph_pagelist_release(pagelist); + return ret; +} + +int __ceph_setattr(struct inode *inode, struct iattr *attr, + struct ceph_iattr *cia) { struct ceph_inode_info *ci = ceph_inode(inode); unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_cap_flush *prealloc_cf; + loff_t isize = i_size_read(inode); int issued; int release = 0, dirtied = 0; int mask = 0; int err = 0; int inode_dirty_flags = 0; bool lock_snap_rwsem = false; + bool fill_fscrypt; + int truncate_retry = 20; /* The RMW will take around 50ms */ +retry: prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return -ENOMEM; @@ -2035,6 +2453,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) return PTR_ERR(req); } + fill_fscrypt = false; spin_lock(&ci->i_ceph_lock); issued = __ceph_caps_issued(ci, NULL); @@ -2050,6 +2469,43 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + if (cia && cia->fscrypt_auth) { + u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth); + + if (len > sizeof(*cia->fscrypt_auth)) { + err = -EINVAL; + spin_unlock(&ci->i_ceph_lock); + goto out; + } + + dout("setattr %llx:%llx fscrypt_auth len %u to %u)\n", + ceph_vinop(inode), ci->fscrypt_auth_len, len); + + /* It should never be re-set once set */ + WARN_ON_ONCE(ci->fscrypt_auth); + + if (issued & CEPH_CAP_AUTH_EXCL) { + dirtied |= CEPH_CAP_AUTH_EXCL; + kfree(ci->fscrypt_auth); + ci->fscrypt_auth = (u8 *)cia->fscrypt_auth; + ci->fscrypt_auth_len = len; + } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 || + ci->fscrypt_auth_len != len || + memcmp(ci->fscrypt_auth, cia->fscrypt_auth, len)) { + req->r_fscrypt_auth = cia->fscrypt_auth; + mask |= CEPH_SETATTR_FSCRYPT_AUTH; + release |= CEPH_CAP_AUTH_SHARED; + } + cia->fscrypt_auth = NULL; + } +#else + if (cia && cia->fscrypt_auth) { + err = -EINVAL; + spin_unlock(&ci->i_ceph_lock); + goto out; + } +#endif /* CONFIG_FS_ENCRYPTION */ if (ia_valid & ATTR_UID) { dout("setattr %p uid %d -> %d\n", inode, @@ -2119,10 +2575,27 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } } if (ia_valid & ATTR_SIZE) { - loff_t isize = i_size_read(inode); - dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size); - if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) { + /* + * Only when the new size is smaller and not aligned to + * CEPH_FSCRYPT_BLOCK_SIZE will the RMW is needed. + */ + if (IS_ENCRYPTED(inode) && attr->ia_size < isize && + (attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE)) { + mask |= CEPH_SETATTR_SIZE; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); + mask |= CEPH_SETATTR_FSCRYPT_FILE; + req->r_args.setattr.size = + cpu_to_le64(round_up(attr->ia_size, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_args.setattr.old_size = + cpu_to_le64(round_up(isize, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_fscrypt_file = attr->ia_size; + fill_fscrypt = true; + } else if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) { if (attr->ia_size > isize) { i_size_write(inode, attr->ia_size); inode->i_blocks = calc_inode_blocks(attr->ia_size); @@ -2132,11 +2605,24 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || attr->ia_size != isize) { - req->r_args.setattr.size = cpu_to_le64(attr->ia_size); - req->r_args.setattr.old_size = cpu_to_le64(isize); mask |= CEPH_SETATTR_SIZE; release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; + if (IS_ENCRYPTED(inode) && attr->ia_size) { + set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); + mask |= CEPH_SETATTR_FSCRYPT_FILE; + req->r_args.setattr.size = + cpu_to_le64(round_up(attr->ia_size, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_args.setattr.old_size = + cpu_to_le64(round_up(isize, + CEPH_FSCRYPT_BLOCK_SIZE)); + req->r_fscrypt_file = attr->ia_size; + } else { + req->r_args.setattr.size = cpu_to_le64(attr->ia_size); + req->r_args.setattr.old_size = cpu_to_le64(isize); + req->r_fscrypt_file = 0; + } } } if (ia_valid & ATTR_MTIME) { @@ -2199,8 +2685,10 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) release &= issued; spin_unlock(&ci->i_ceph_lock); - if (lock_snap_rwsem) + if (lock_snap_rwsem) { up_read(&mdsc->snap_rwsem); + lock_snap_rwsem = false; + } if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); @@ -2212,8 +2700,29 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; req->r_stamp = attr->ia_ctime; + if (fill_fscrypt) { + err = fill_fscrypt_truncate(inode, req, attr); + if (err) + goto out; + } + + /* + * The truncate request will return -EAGAIN when the + * last block has been updated just before the MDS + * successfully gets the xlock for the FILE lock. To + * avoid corrupting the file contents we need to retry + * it. + */ err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err == -EAGAIN && truncate_retry--) { + dout("setattr %p result=%d (%s locally, %d remote), retry it!\n", + inode, err, ceph_cap_string(dirtied), mask); + ceph_mdsc_put_request(req); + ceph_free_cap_flush(prealloc_cf); + goto retry; + } } +out: dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, ceph_cap_string(dirtied), mask); @@ -2242,6 +2751,10 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (ceph_inode_is_shutdown(inode)) return -ESTALE; + err = fscrypt_prepare_setattr(dentry, attr); + if (err) + return err; + err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err != 0) return err; @@ -2254,7 +2767,7 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size)) return -EDQUOT; - err = __ceph_setattr(inode, attr); + err = __ceph_setattr(inode, attr, NULL); if (err >= 0 && (attr->ia_valid & ATTR_MODE)) err = posix_acl_chmod(&nop_mnt_idmap, dentry, attr->ia_mode); @@ -2525,8 +3038,12 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, stat->nlink = 1 + 1 + ci->i_subdirs; } - stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC; + if (IS_ENCRYPTED(inode)) + stat->attributes |= STATX_ATTR_ENCRYPTED; + stat->attributes_mask |= (STATX_ATTR_CHANGE_MONOTONIC | + STATX_ATTR_ENCRYPTED); + stat->result_mask = request_mask & valid_mask; return err; } diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index deac817647eb..91a84917d203 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -6,6 +6,7 @@ #include "mds_client.h" #include "ioctl.h" #include +#include /* * ioctls @@ -268,9 +269,96 @@ static long ceph_ioctl_syncio(struct file *file) return 0; } +static int vet_mds_for_fscrypt(struct file *file) +{ + int i, ret = -EOPNOTSUPP; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(file_inode(file)->i_sb); + + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + struct ceph_mds_session *s = mdsc->sessions[i]; + + if (!s) + continue; + if (test_bit(CEPHFS_FEATURE_ALTERNATE_NAME, &s->s_features)) + ret = 0; + break; + } + mutex_unlock(&mdsc->mutex); + return ret; +} + +static long ceph_set_encryption_policy(struct file *file, unsigned long arg) +{ + int ret, got = 0; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + + /* encrypted directories can't have striped layout */ + if (ci->i_layout.stripe_count > 1) + return -EINVAL; + + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + + /* + * Ensure we hold these caps so that we _know_ that the rstats check + * in the empty_dir check is reliable. + */ + ret = ceph_get_caps(file, CEPH_CAP_FILE_SHARED, 0, -1, &got); + if (ret) + return ret; + + ret = fscrypt_ioctl_set_policy(file, (const void __user *)arg); + if (got) + ceph_put_cap_refs(ci, got); + + return ret; +} + +static const char *ceph_ioctl_cmd_name(const unsigned int cmd) +{ + switch (cmd) { + case CEPH_IOC_GET_LAYOUT: + return "get_layout"; + case CEPH_IOC_SET_LAYOUT: + return "set_layout"; + case CEPH_IOC_SET_LAYOUT_POLICY: + return "set_layout_policy"; + case CEPH_IOC_GET_DATALOC: + return "get_dataloc"; + case CEPH_IOC_LAZYIO: + return "lazyio"; + case CEPH_IOC_SYNCIO: + return "syncio"; + case FS_IOC_SET_ENCRYPTION_POLICY: + return "set_encryption_policy"; + case FS_IOC_GET_ENCRYPTION_POLICY: + return "get_encryption_policy"; + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + return "get_encryption_policy_ex"; + case FS_IOC_ADD_ENCRYPTION_KEY: + return "add_encryption_key"; + case FS_IOC_REMOVE_ENCRYPTION_KEY: + return "remove_encryption_key"; + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + return "remove_encryption_key_all_users"; + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + return "get_encryption_key_status"; + case FS_IOC_GET_ENCRYPTION_NONCE: + return "get_encryption_nonce"; + default: + return "unknown"; + } +} + long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); + int ret; + + dout("ioctl file %p cmd %s arg %lu\n", file, + ceph_ioctl_cmd_name(cmd), arg); switch (cmd) { case CEPH_IOC_GET_LAYOUT: return ceph_ioctl_get_layout(file, (void __user *)arg); @@ -289,6 +377,43 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case CEPH_IOC_SYNCIO: return ceph_ioctl_syncio(file); + + case FS_IOC_SET_ENCRYPTION_POLICY: + return ceph_set_encryption_policy(file, arg); + + case FS_IOC_GET_ENCRYPTION_POLICY: + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + return fscrypt_ioctl_get_policy(file, (void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_POLICY_EX: + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + return fscrypt_ioctl_get_policy_ex(file, (void __user *)arg); + + case FS_IOC_ADD_ENCRYPTION_KEY: + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + return fscrypt_ioctl_add_key(file, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY: + return fscrypt_ioctl_remove_key(file, (void __user *)arg); + + case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: + return fscrypt_ioctl_remove_key_all_users(file, + (void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_KEY_STATUS: + return fscrypt_ioctl_get_key_status(file, (void __user *)arg); + + case FS_IOC_GET_ENCRYPTION_NONCE: + ret = vet_mds_for_fscrypt(file); + if (ret) + return ret; + return fscrypt_ioctl_get_nonce(file, (void __user *)arg); } return -ENOTTY; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 5fb367b1d4b0..615db141b6c4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -15,6 +15,7 @@ #include "super.h" #include "mds_client.h" +#include "crypto.h" #include #include @@ -184,8 +185,54 @@ static int parse_reply_info_in(void **p, void *end, info->rsnaps = 0; } + if (struct_v >= 5) { + u32 alen; + + ceph_decode_32_safe(p, end, alen, bad); + + while (alen--) { + u32 len; + + /* key */ + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_skip_n(p, end, len, bad); + /* value */ + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_skip_n(p, end, len, bad); + } + } + + /* fscrypt flag -- ignore */ + if (struct_v >= 6) + ceph_decode_skip_8(p, end, bad); + + info->fscrypt_auth = NULL; + info->fscrypt_auth_len = 0; + info->fscrypt_file = NULL; + info->fscrypt_file_len = 0; + if (struct_v >= 7) { + ceph_decode_32_safe(p, end, info->fscrypt_auth_len, bad); + if (info->fscrypt_auth_len) { + info->fscrypt_auth = kmalloc(info->fscrypt_auth_len, + GFP_KERNEL); + if (!info->fscrypt_auth) + return -ENOMEM; + ceph_decode_copy_safe(p, end, info->fscrypt_auth, + info->fscrypt_auth_len, bad); + } + ceph_decode_32_safe(p, end, info->fscrypt_file_len, bad); + if (info->fscrypt_file_len) { + info->fscrypt_file = kmalloc(info->fscrypt_file_len, + GFP_KERNEL); + if (!info->fscrypt_file) + return -ENOMEM; + ceph_decode_copy_safe(p, end, info->fscrypt_file, + info->fscrypt_file_len, bad); + } + } *p = end; } else { + /* legacy (unversioned) struct */ if (features & CEPH_FEATURE_MDS_INLINE_DATA) { ceph_decode_64_safe(p, end, info->inline_version, bad); ceph_decode_32_safe(p, end, info->inline_len, bad); @@ -263,27 +310,47 @@ static int parse_reply_info_dir(void **p, void *end, static int parse_reply_info_lease(void **p, void *end, struct ceph_mds_reply_lease **lease, - u64 features) + u64 features, u32 *altname_len, u8 **altname) { + u8 struct_v; + u32 struct_len; + void *lend; + if (features == (u64)-1) { - u8 struct_v, struct_compat; - u32 struct_len; + u8 struct_compat; + ceph_decode_8_safe(p, end, struct_v, bad); ceph_decode_8_safe(p, end, struct_compat, bad); + /* struct_v is expected to be >= 1. we only understand * encoding whose struct_compat == 1. */ if (!struct_v || struct_compat != 1) goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); - ceph_decode_need(p, end, struct_len, bad); - end = *p + struct_len; + } else { + struct_len = sizeof(**lease); + *altname_len = 0; + *altname = NULL; } - ceph_decode_need(p, end, sizeof(**lease), bad); + lend = *p + struct_len; + ceph_decode_need(p, end, struct_len, bad); *lease = *p; *p += sizeof(**lease); - if (features == (u64)-1) - *p = end; + + if (features == (u64)-1) { + if (struct_v >= 2) { + ceph_decode_32_safe(p, end, *altname_len, bad); + ceph_decode_need(p, end, *altname_len, bad); + *altname = *p; + *p += *altname_len; + } else { + *altname = NULL; + *altname_len = 0; + } + } + *p = lend; return 0; bad: return -EIO; @@ -313,7 +380,8 @@ static int parse_reply_info_trace(void **p, void *end, info->dname = *p; *p += info->dname_len; - err = parse_reply_info_lease(p, end, &info->dlease, features); + err = parse_reply_info_lease(p, end, &info->dlease, features, + &info->altname_len, &info->altname); if (err < 0) goto out_bad; } @@ -339,9 +407,10 @@ static int parse_reply_info_trace(void **p, void *end, * parse readdir results */ static int parse_reply_info_readdir(void **p, void *end, - struct ceph_mds_reply_info_parsed *info, - u64 features) + struct ceph_mds_request *req, + u64 features) { + struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; u32 num, i = 0; int err; @@ -371,18 +440,87 @@ static int parse_reply_info_readdir(void **p, void *end, info->dir_nr = num; while (num) { + struct inode *inode = d_inode(req->r_dentry); + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; + struct fscrypt_str tname = FSTR_INIT(NULL, 0); + struct fscrypt_str oname = FSTR_INIT(NULL, 0); + struct ceph_fname fname; + u32 altname_len, _name_len; + u8 *altname, *_name; + /* dentry */ - ceph_decode_32_safe(p, end, rde->name_len, bad); - ceph_decode_need(p, end, rde->name_len, bad); - rde->name = *p; - *p += rde->name_len; - dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name); + ceph_decode_32_safe(p, end, _name_len, bad); + ceph_decode_need(p, end, _name_len, bad); + _name = *p; + *p += _name_len; + dout("parsed dir dname '%.*s'\n", _name_len, _name); + + if (info->hash_order) + rde->raw_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + _name, _name_len); /* dentry lease */ - err = parse_reply_info_lease(p, end, &rde->lease, features); + err = parse_reply_info_lease(p, end, &rde->lease, features, + &altname_len, &altname); if (err) goto out_bad; + + /* + * Try to dencrypt the dentry names and update them + * in the ceph_mds_reply_dir_entry struct. + */ + fname.dir = inode; + fname.name = _name; + fname.name_len = _name_len; + fname.ctext = altname; + fname.ctext_len = altname_len; + /* + * The _name_len maybe larger than altname_len, such as + * when the human readable name length is in range of + * (CEPH_NOHASH_NAME_MAX, CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE), + * then the copy in ceph_fname_to_usr will corrupt the + * data if there has no encryption key. + * + * Just set the no_copy flag and then if there has no + * encryption key the oname.name will be assigned to + * _name always. + */ + fname.no_copy = true; + if (altname_len == 0) { + /* + * Set tname to _name, and this will be used + * to do the base64_decode in-place. It's + * safe because the decoded string should + * always be shorter, which is 3/4 of origin + * string. + */ + tname.name = _name; + + /* + * Set oname to _name too, and this will be + * used to do the dencryption in-place. + */ + oname.name = _name; + oname.len = _name_len; + } else { + /* + * This will do the decryption only in-place + * from altname cryptext directly. + */ + oname.name = altname; + oname.len = altname_len; + } + rde->is_nokey = false; + err = ceph_fname_to_usr(&fname, &tname, &oname, &rde->is_nokey); + if (err) { + pr_err("%s unable to decode %.*s, got %d\n", __func__, + _name_len, _name, err); + goto out_bad; + } + rde->name = oname.name; + rde->name_len = oname.len; + /* inode */ err = parse_reply_info_in(p, end, &rde->inode, features); if (err < 0) @@ -581,15 +719,16 @@ static int parse_reply_info_getvxattr(void **p, void *end, * parse extra results */ static int parse_reply_info_extra(void **p, void *end, - struct ceph_mds_reply_info_parsed *info, + struct ceph_mds_request *req, u64 features, struct ceph_mds_session *s) { + struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; u32 op = le32_to_cpu(info->head->op); if (op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) - return parse_reply_info_readdir(p, end, info, features); + return parse_reply_info_readdir(p, end, req, features); else if (op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features, s); else if (op == CEPH_MDS_OP_GETVXATTR) @@ -602,9 +741,9 @@ static int parse_reply_info_extra(void **p, void *end, * parse entire mds reply */ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg, - struct ceph_mds_reply_info_parsed *info, - u64 features) + struct ceph_mds_request *req, u64 features) { + struct ceph_mds_reply_info_parsed *info = &req->r_reply_info; void *p, *end; u32 len; int err; @@ -626,7 +765,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg, ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { ceph_decode_need(&p, end, len, bad); - err = parse_reply_info_extra(&p, p+len, info, features, s); + err = parse_reply_info_extra(&p, p+len, req, features, s); if (err < 0) goto out_bad; } @@ -651,8 +790,21 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg, static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) { + int i; + + kfree(info->diri.fscrypt_auth); + kfree(info->diri.fscrypt_file); + kfree(info->targeti.fscrypt_auth); + kfree(info->targeti.fscrypt_file); if (!info->dir_entries) return; + + for (i = 0; i < info->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; + + kfree(rde->inode.fscrypt_auth); + kfree(rde->inode.fscrypt_file); + } free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); } @@ -945,6 +1097,7 @@ void ceph_mdsc_release_request(struct kref *kref) iput(req->r_parent); } iput(req->r_target_inode); + iput(req->r_new_inode); if (req->r_dentry) dput(req->r_dentry); if (req->r_old_dentry) @@ -965,6 +1118,8 @@ void ceph_mdsc_release_request(struct kref *kref) put_cred(req->r_cred); if (req->r_pagelist) ceph_pagelist_release(req->r_pagelist); + kfree(req->r_fscrypt_auth); + kfree(req->r_altname); put_request_session(req); ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); WARN_ON_ONCE(!list_empty(&req->r_wait)); @@ -2373,20 +2528,90 @@ static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc) return mdsc->oldest_tid; } -/* - * Build a dentry's path. Allocate on heap; caller must kfree. Based - * on build_path_from_dentry in fs/cifs/dir.c. +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) +{ + struct inode *dir = req->r_parent; + struct dentry *dentry = req->r_dentry; + u8 *cryptbuf = NULL; + u32 len = 0; + int ret = 0; + + /* only encode if we have parent and dentry */ + if (!dir || !dentry) + goto success; + + /* No-op unless this is encrypted */ + if (!IS_ENCRYPTED(dir)) + goto success; + + ret = ceph_fscrypt_prepare_readdir(dir); + if (ret < 0) + return ERR_PTR(ret); + + /* No key? Just ignore it. */ + if (!fscrypt_has_encryption_key(dir)) + goto success; + + if (!fscrypt_fname_encrypted_size(dir, dentry->d_name.len, NAME_MAX, + &len)) { + WARN_ON_ONCE(1); + return ERR_PTR(-ENAMETOOLONG); + } + + /* No need to append altname if name is short enough */ + if (len <= CEPH_NOHASH_NAME_MAX) { + len = 0; + goto success; + } + + cryptbuf = kmalloc(len, GFP_KERNEL); + if (!cryptbuf) + return ERR_PTR(-ENOMEM); + + ret = fscrypt_fname_encrypt(dir, &dentry->d_name, cryptbuf, len); + if (ret) { + kfree(cryptbuf); + return ERR_PTR(ret); + } +success: + *plen = len; + return cryptbuf; +} +#else +static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen) +{ + *plen = 0; + return NULL; +} +#endif + +/** + * ceph_mdsc_build_path - build a path string to a given dentry + * @dentry: dentry to which path should be built + * @plen: returned length of string + * @pbase: returned base inode number + * @for_wire: is this path going to be sent to the MDS? * - * If @stop_on_nosnap, generate path relative to the first non-snapped - * inode. + * Build a string that represents the path to the dentry. This is mostly called + * for two different purposes: + * + * 1) we need to build a path string to send to the MDS (for_wire == true) + * 2) we need a path string for local presentation (e.g. debugfs) + * (for_wire == false) + * + * The path is built in reverse, starting with the dentry. Walk back up toward + * the root, building the path until the first non-snapped inode is reached + * (for_wire) or the root inode is reached (!for_wire). * * Encode hidden .snap dirs as a double /, i.e. * foo/.snap/bar -> foo//bar */ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase, - int stop_on_nosnap) + int for_wire) { - struct dentry *temp; + struct dentry *cur; + struct inode *inode; char *path; int pos; unsigned seq; @@ -2403,34 +2628,72 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase, path[pos] = '\0'; seq = read_seqbegin(&rename_lock); - rcu_read_lock(); - temp = dentry; + cur = dget(dentry); for (;;) { - struct inode *inode; + struct dentry *parent; - spin_lock(&temp->d_lock); - inode = d_inode(temp); + spin_lock(&cur->d_lock); + inode = d_inode(cur); if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { dout("build_path path+%d: %p SNAPDIR\n", - pos, temp); - } else if (stop_on_nosnap && inode && dentry != temp && + pos, cur); + spin_unlock(&cur->d_lock); + parent = dget_parent(cur); + } else if (for_wire && inode && dentry != cur && ceph_snap(inode) == CEPH_NOSNAP) { - spin_unlock(&temp->d_lock); + spin_unlock(&cur->d_lock); pos++; /* get rid of any prepended '/' */ break; - } else { - pos -= temp->d_name.len; + } else if (!for_wire || !IS_ENCRYPTED(d_inode(cur->d_parent))) { + pos -= cur->d_name.len; if (pos < 0) { - spin_unlock(&temp->d_lock); + spin_unlock(&cur->d_lock); break; } - memcpy(path + pos, temp->d_name.name, temp->d_name.len); + memcpy(path + pos, cur->d_name.name, cur->d_name.len); + spin_unlock(&cur->d_lock); + parent = dget_parent(cur); + } else { + int len, ret; + char buf[NAME_MAX]; + + /* + * Proactively copy name into buf, in case we need to + * present it as-is. + */ + memcpy(buf, cur->d_name.name, cur->d_name.len); + len = cur->d_name.len; + spin_unlock(&cur->d_lock); + parent = dget_parent(cur); + + ret = ceph_fscrypt_prepare_readdir(d_inode(parent)); + if (ret < 0) { + dput(parent); + dput(cur); + return ERR_PTR(ret); + } + + if (fscrypt_has_encryption_key(d_inode(parent))) { + len = ceph_encode_encrypted_fname(d_inode(parent), + cur, buf); + if (len < 0) { + dput(parent); + dput(cur); + return ERR_PTR(len); + } + } + pos -= len; + if (pos < 0) { + dput(parent); + break; + } + memcpy(path + pos, buf, len); } - spin_unlock(&temp->d_lock); - temp = READ_ONCE(temp->d_parent); + dput(cur); + cur = parent; /* Are we at the root? */ - if (IS_ROOT(temp)) + if (IS_ROOT(cur)) break; /* Are we out of buffer? */ @@ -2439,8 +2702,9 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase, path[pos] = '/'; } - base = ceph_ino(d_inode(temp)); - rcu_read_unlock(); + inode = d_inode(cur); + base = inode ? ceph_ino(inode) : 0; + dput(cur); if (read_seqretry(&rename_lock, seq)) goto retry; @@ -2450,8 +2714,8 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase, * A rename didn't occur, but somehow we didn't end up where * we thought we would. Throw a warning and try again. */ - pr_warn("build_path did not end path lookup where " - "expected, pos is %d\n", pos); + pr_warn("build_path did not end path lookup where expected (pos = %d)\n", + pos); goto retry; } @@ -2471,7 +2735,8 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir, rcu_read_lock(); if (!dir) dir = d_inode_rcu(dentry->d_parent); - if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP) { + if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP && + !IS_ENCRYPTED(dir)) { *pino = ceph_ino(dir); rcu_read_unlock(); *ppath = dentry->d_name.name; @@ -2539,8 +2804,8 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, return r; } -static void encode_timestamp_and_gids(void **p, - const struct ceph_mds_request *req) +static void encode_mclientrequest_tail(void **p, + const struct ceph_mds_request *req) { struct ceph_timespec ts; int i; @@ -2548,11 +2813,43 @@ static void encode_timestamp_and_gids(void **p, ceph_encode_timespec64(&ts, &req->r_stamp); ceph_encode_copy(p, &ts, sizeof(ts)); - /* gid_list */ + /* v4: gid_list */ ceph_encode_32(p, req->r_cred->group_info->ngroups); for (i = 0; i < req->r_cred->group_info->ngroups; i++) ceph_encode_64(p, from_kgid(&init_user_ns, req->r_cred->group_info->gid[i])); + + /* v5: altname */ + ceph_encode_32(p, req->r_altname_len); + ceph_encode_copy(p, req->r_altname, req->r_altname_len); + + /* v6: fscrypt_auth and fscrypt_file */ + if (req->r_fscrypt_auth) { + u32 authlen = ceph_fscrypt_auth_len(req->r_fscrypt_auth); + + ceph_encode_32(p, authlen); + ceph_encode_copy(p, req->r_fscrypt_auth, authlen); + } else { + ceph_encode_32(p, 0); + } + if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) { + ceph_encode_32(p, sizeof(__le64)); + ceph_encode_64(p, req->r_fscrypt_file); + } else { + ceph_encode_32(p, 0); + } +} + +static struct ceph_mds_request_head_legacy * +find_legacy_request_head(void *p, u64 features) +{ + bool legacy = !(features & CEPH_FEATURE_FS_BTIME); + struct ceph_mds_request_head_old *ohead; + + if (legacy) + return (struct ceph_mds_request_head_legacy *)p; + ohead = (struct ceph_mds_request_head_old *)p; + return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid; } /* @@ -2565,7 +2862,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, int mds = session->s_mds; struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_msg *msg; - struct ceph_mds_request_head_old *head; + struct ceph_mds_request_head_legacy *lhead; const char *path1 = NULL; const char *path2 = NULL; u64 ino1 = 0, ino2 = 0; @@ -2577,6 +2874,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, void *p, *end; int ret; bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME); + bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, + &session->s_features); ret = set_request_path_attr(req->r_inode, req->r_dentry, req->r_parent, req->r_path1, req->r_ino1.ino, @@ -2601,12 +2900,32 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, goto out_free1; } - len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head); - len += pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + - sizeof(struct ceph_timespec); - len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups); + req->r_altname = get_fscrypt_altname(req, &req->r_altname_len); + if (IS_ERR(req->r_altname)) { + msg = ERR_CAST(req->r_altname); + req->r_altname = NULL; + goto out_free2; + } - /* calculate (max) length for cap releases */ + /* + * For old cephs without supporting the 32bit retry/fwd feature + * it will copy the raw memories directly when decoding the + * requests. While new cephs will decode the head depending the + * version member, so we need to make sure it will be compatible + * with them both. + */ + if (legacy) + len = sizeof(struct ceph_mds_request_head_legacy); + else if (old_version) + len = sizeof(struct ceph_mds_request_head_old); + else + len = sizeof(struct ceph_mds_request_head); + + /* filepaths */ + len += 2 * (1 + sizeof(u32) + sizeof(u64)); + len += pathlen1 + pathlen2; + + /* cap releases */ len += sizeof(struct ceph_mds_request_release) * (!!req->r_inode_drop + !!req->r_dentry_drop + !!req->r_old_inode_drop + !!req->r_old_dentry_drop); @@ -2616,6 +2935,27 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, if (req->r_old_dentry_drop) len += pathlen2; + /* MClientRequest tail */ + + /* req->r_stamp */ + len += sizeof(struct ceph_timespec); + + /* gid list */ + len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups); + + /* alternate name */ + len += sizeof(u32) + req->r_altname_len; + + /* fscrypt_auth */ + len += sizeof(u32); // fscrypt_auth + if (req->r_fscrypt_auth) + len += ceph_fscrypt_auth_len(req->r_fscrypt_auth); + + /* fscrypt_file */ + len += sizeof(u32); + if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) + len += sizeof(__le64); + msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false); if (!msg) { msg = ERR_PTR(-ENOMEM); @@ -2624,33 +2964,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, msg->hdr.tid = cpu_to_le64(req->r_tid); + lhead = find_legacy_request_head(msg->front.iov_base, + session->s_con.peer_features); + /* - * The old ceph_mds_request_head didn't contain a version field, and + * The ceph_mds_request_head_legacy didn't contain a version field, and * one was added when we moved the message version from 3->4. */ if (legacy) { msg->hdr.version = cpu_to_le16(3); - head = msg->front.iov_base; - p = msg->front.iov_base + sizeof(*head); - } else { - struct ceph_mds_request_head *new_head = msg->front.iov_base; + p = msg->front.iov_base + sizeof(*lhead); + } else if (old_version) { + struct ceph_mds_request_head_old *ohead = msg->front.iov_base; msg->hdr.version = cpu_to_le16(4); - new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); - head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; - p = msg->front.iov_base + sizeof(*new_head); + ohead->version = cpu_to_le16(1); + p = msg->front.iov_base + sizeof(*ohead); + } else { + struct ceph_mds_request_head *nhead = msg->front.iov_base; + + msg->hdr.version = cpu_to_le16(6); + nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION); + p = msg->front.iov_base + sizeof(*nhead); } end = msg->front.iov_base + msg->front.iov_len; - head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); - head->op = cpu_to_le32(req->r_op); - head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, - req->r_cred->fsuid)); - head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, - req->r_cred->fsgid)); - head->ino = cpu_to_le64(req->r_deleg_ino); - head->args = req->r_args; + lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); + lhead->op = cpu_to_le32(req->r_op); + lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, + req->r_cred->fsuid)); + lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, + req->r_cred->fsgid)); + lhead->ino = cpu_to_le64(req->r_deleg_ino); + lhead->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); ceph_encode_filepath(&p, end, ino2, path2); @@ -2665,15 +3012,23 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, req->r_inode ? req->r_inode : d_inode(req->r_dentry), mds, req->r_inode_drop, req->r_inode_unless, req->r_op == CEPH_MDS_OP_READDIR); - if (req->r_dentry_drop) - releases += ceph_encode_dentry_release(&p, req->r_dentry, + if (req->r_dentry_drop) { + ret = ceph_encode_dentry_release(&p, req->r_dentry, req->r_parent, mds, req->r_dentry_drop, req->r_dentry_unless); - if (req->r_old_dentry_drop) - releases += ceph_encode_dentry_release(&p, req->r_old_dentry, + if (ret < 0) + goto out_err; + releases += ret; + } + if (req->r_old_dentry_drop) { + ret = ceph_encode_dentry_release(&p, req->r_old_dentry, req->r_old_dentry_dir, mds, req->r_old_dentry_drop, req->r_old_dentry_unless); + if (ret < 0) + goto out_err; + releases += ret; + } if (req->r_old_inode_drop) releases += ceph_encode_inode_release(&p, d_inode(req->r_old_dentry), @@ -2684,9 +3039,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, p = msg->front.iov_base + req->r_request_release_offset; } - head->num_releases = cpu_to_le16(releases); + lhead->num_releases = cpu_to_le16(releases); - encode_timestamp_and_gids(&p, req); + encode_mclientrequest_tail(&p, req); if (WARN_ON_ONCE(p > end)) { ceph_msg_put(msg); @@ -2715,6 +3070,10 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session, ceph_mdsc_free_path((char *)path1, pathlen1); out: return msg; +out_err: + ceph_msg_put(msg); + msg = ERR_PTR(ret); + goto out_free2; } /* @@ -2731,18 +3090,6 @@ static void complete_request(struct ceph_mds_client *mdsc, complete_all(&req->r_completion); } -static struct ceph_mds_request_head_old * -find_old_request_head(void *p, u64 features) -{ - bool legacy = !(features & CEPH_FEATURE_FS_BTIME); - struct ceph_mds_request_head *new_head; - - if (legacy) - return (struct ceph_mds_request_head_old *)p; - new_head = (struct ceph_mds_request_head *)p; - return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid; -} - /* * called under mdsc->mutex */ @@ -2752,29 +3099,28 @@ static int __prepare_send_request(struct ceph_mds_session *session, { int mds = session->s_mds; struct ceph_mds_client *mdsc = session->s_mdsc; - struct ceph_mds_request_head_old *rhead; + struct ceph_mds_request_head_legacy *lhead; + struct ceph_mds_request_head *nhead; struct ceph_msg *msg; - int flags = 0, max_retry; + int flags = 0, old_max_retry; + bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD, + &session->s_features); /* - * The type of 'r_attempts' in kernel 'ceph_mds_request' - * is 'int', while in 'ceph_mds_request_head' the type of - * 'num_retry' is '__u8'. So in case the request retries - * exceeding 256 times, the MDS will receive a incorrect - * retry seq. - * - * In this case it's ususally a bug in MDS and continue - * retrying the request makes no sense. - * - * In future this could be fixed in ceph code, so avoid - * using the hardcode here. + * Avoid inifinite retrying after overflow. The client will + * increase the retry count and if the MDS is old version, + * so we limit to retry at most 256 times. */ - max_retry = sizeof_field(struct ceph_mds_request_head, num_retry); - max_retry = 1 << (max_retry * BITS_PER_BYTE); - if (req->r_attempts >= max_retry) { - pr_warn_ratelimited("%s request tid %llu seq overflow\n", - __func__, req->r_tid); - return -EMULTIHOP; + if (req->r_attempts) { + old_max_retry = sizeof_field(struct ceph_mds_request_head_old, + num_retry); + old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE); + if ((old_version && req->r_attempts >= old_max_retry) || + ((uint32_t)req->r_attempts >= U32_MAX)) { + pr_warn_ratelimited("%s request tid %llu seq overflow\n", + __func__, req->r_tid); + return -EMULTIHOP; + } } req->r_attempts++; @@ -2800,23 +3146,27 @@ static int __prepare_send_request(struct ceph_mds_session *session, * d_move mangles the src name. */ msg = req->r_request; - rhead = find_old_request_head(msg->front.iov_base, - session->s_con.peer_features); + lhead = find_legacy_request_head(msg->front.iov_base, + session->s_con.peer_features); - flags = le32_to_cpu(rhead->flags); + flags = le32_to_cpu(lhead->flags); flags |= CEPH_MDS_FLAG_REPLAY; - rhead->flags = cpu_to_le32(flags); + lhead->flags = cpu_to_le32(flags); if (req->r_target_inode) - rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); + lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); - rhead->num_retry = req->r_attempts - 1; + lhead->num_retry = req->r_attempts - 1; + if (!old_version) { + nhead = (struct ceph_mds_request_head*)msg->front.iov_base; + nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); + } /* remove cap/dentry releases from message */ - rhead->num_releases = 0; + lhead->num_releases = 0; p = msg->front.iov_base + req->r_request_release_offset; - encode_timestamp_and_gids(&p, req); + encode_mclientrequest_tail(&p, req); msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); @@ -2834,18 +3184,23 @@ static int __prepare_send_request(struct ceph_mds_session *session, } req->r_request = msg; - rhead = find_old_request_head(msg->front.iov_base, - session->s_con.peer_features); - rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); + lhead = find_legacy_request_head(msg->front.iov_base, + session->s_con.peer_features); + lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_REPLAY; if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_ASYNC; if (req->r_parent) flags |= CEPH_MDS_FLAG_WANT_DENTRY; - rhead->flags = cpu_to_le32(flags); - rhead->num_fwd = req->r_num_fwd; - rhead->num_retry = req->r_attempts - 1; + lhead->flags = cpu_to_le32(flags); + lhead->num_fwd = req->r_num_fwd; + lhead->num_retry = req->r_attempts - 1; + if (!old_version) { + nhead = (struct ceph_mds_request_head*)msg->front.iov_base; + nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd); + nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1); + } dout(" r_parent = %p\n", req->r_parent); return 0; @@ -3348,22 +3703,35 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) } dout("handle_reply tid %lld result %d\n", tid, result); - rinfo = &req->r_reply_info; if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features)) - err = parse_reply_info(session, msg, rinfo, (u64)-1); + err = parse_reply_info(session, msg, req, (u64)-1); else - err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features); + err = parse_reply_info(session, msg, req, + session->s_con.peer_features); mutex_unlock(&mdsc->mutex); /* Must find target inode outside of mutexes to avoid deadlocks */ + rinfo = &req->r_reply_info; if ((err >= 0) && rinfo->head->is_target) { - struct inode *in; + struct inode *in = xchg(&req->r_new_inode, NULL); struct ceph_vino tvino = { .ino = le64_to_cpu(rinfo->targeti.in->ino), .snap = le64_to_cpu(rinfo->targeti.in->snapid) }; - in = ceph_get_inode(mdsc->fsc->sb, tvino); + /* + * If we ended up opening an existing inode, discard + * r_new_inode + */ + if (req->r_op == CEPH_MDS_OP_CREATE && + !req->r_reply_info.has_create_ino) { + /* This should never happen on an async create */ + WARN_ON_ONCE(req->r_deleg_ino); + iput(in); + in = NULL; + } + + in = ceph_get_inode(mdsc->fsc->sb, tvino, in); if (IS_ERR(in)) { err = PTR_ERR(in); mutex_lock(&session->s_mutex); @@ -3406,7 +3774,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (err == 0) { if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || req->r_op == CEPH_MDS_OP_LSSNAP)) - ceph_readdir_prepopulate(req, req->r_session); + err = ceph_readdir_prepopulate(req, req->r_session); } current->journal_info = NULL; mutex_unlock(&req->r_fill_mutex); @@ -3491,33 +3859,21 @@ static void handle_forward(struct ceph_mds_client *mdsc, if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { dout("forward tid %llu aborted, unregistering\n", tid); __unregister_request(mdsc, req); - } else if (fwd_seq <= req->r_num_fwd) { + } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) { /* - * The type of 'num_fwd' in ceph 'MClientRequestForward' - * is 'int32_t', while in 'ceph_mds_request_head' the - * type is '__u8'. So in case the request bounces between - * MDSes exceeding 256 times, the client will get stuck. + * Avoid inifinite retrying after overflow. * - * In this case it's ususally a bug in MDS and continue - * bouncing the request makes no sense. - * - * In future this could be fixed in ceph code, so avoid - * using the hardcode here. + * The MDS will increase the fwd count and in client side + * if the num_fwd is less than the one saved in request + * that means the MDS is an old version and overflowed of + * 8 bits. */ - int max = sizeof_field(struct ceph_mds_request_head, num_fwd); - max = 1 << (max * BITS_PER_BYTE); - if (req->r_num_fwd >= max) { - mutex_lock(&req->r_fill_mutex); - req->r_err = -EMULTIHOP; - set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); - mutex_unlock(&req->r_fill_mutex); - aborted = true; - pr_warn_ratelimited("forward tid %llu seq overflow\n", - tid); - } else { - dout("forward tid %llu to mds%d - old seq %d <= %d\n", - tid, next_mds, req->r_num_fwd, fwd_seq); - } + mutex_lock(&req->r_fill_mutex); + req->r_err = -EMULTIHOP; + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); + mutex_unlock(&req->r_fill_mutex); + aborted = true; + pr_warn_ratelimited("forward tid %llu seq overflow\n", tid); } else { /* resend. forward race not possible; mds would drop */ dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); @@ -4550,6 +4906,9 @@ static void handle_lease(struct ceph_mds_client *mdsc, dout("handle_lease from mds%d\n", mds); + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; + /* decode */ if (msg->front.iov_len < sizeof(*h) + sizeof(u32)) goto bad; @@ -4568,8 +4927,6 @@ static void handle_lease(struct ceph_mds_client *mdsc, dname.len, dname.name); mutex_lock(&session->s_mutex); - inc_session_sequence(session); - if (!inode) { dout("handle_lease no inode %llx\n", vino.ino); goto release; @@ -4631,9 +4988,13 @@ static void handle_lease(struct ceph_mds_client *mdsc, out: mutex_unlock(&session->s_mutex); iput(inode); + + ceph_dec_mds_stopping_blocker(mdsc); return; bad: + ceph_dec_mds_stopping_blocker(mdsc); + pr_err("corrupt lease message\n"); ceph_msg_dump(msg); } @@ -4829,6 +5190,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) } init_completion(&mdsc->safe_umount_waiters); + spin_lock_init(&mdsc->stopping_lock); + atomic_set(&mdsc->stopping_blockers, 0); + init_completion(&mdsc->stopping_waiter); init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); mdsc->quotarealms_inodes = RB_ROOT; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 86d2965e68a1..5a3714bdd64a 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -32,8 +32,9 @@ enum ceph_feature_type { CEPHFS_FEATURE_ALTERNATE_NAME, CEPHFS_FEATURE_NOTIFY_SESSION_STATE, CEPHFS_FEATURE_OP_GETVXATTR, + CEPHFS_FEATURE_32BITS_RETRY_FWD, - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR, + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD, }; #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ @@ -44,8 +45,10 @@ enum ceph_feature_type { CEPHFS_FEATURE_MULTI_RECONNECT, \ CEPHFS_FEATURE_DELEG_INO, \ CEPHFS_FEATURE_METRIC_COLLECT, \ + CEPHFS_FEATURE_ALTERNATE_NAME, \ CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ CEPHFS_FEATURE_OP_GETVXATTR, \ + CEPHFS_FEATURE_32BITS_RETRY_FWD, \ } /* @@ -86,13 +89,19 @@ struct ceph_mds_reply_info_in { s32 dir_pin; struct ceph_timespec btime; struct ceph_timespec snap_btime; + u8 *fscrypt_auth; + u8 *fscrypt_file; + u32 fscrypt_auth_len; + u32 fscrypt_file_len; u64 rsnaps; u64 change_attr; }; struct ceph_mds_reply_dir_entry { + bool is_nokey; char *name; u32 name_len; + u32 raw_hash; struct ceph_mds_reply_lease *lease; struct ceph_mds_reply_info_in inode; loff_t offset; @@ -116,7 +125,9 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_info_in diri, targeti; struct ceph_mds_reply_dirfrag *dirfrag; char *dname; + u8 *altname; u32 dname_len; + u32 altname_len; struct ceph_mds_reply_lease *dlease; struct ceph_mds_reply_xattr xattr_info; @@ -263,6 +274,7 @@ struct ceph_mds_request { struct inode *r_parent; /* parent dir inode */ struct inode *r_target_inode; /* resulting inode */ + struct inode *r_new_inode; /* new inode (for creates) */ #define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */ #define CEPH_MDS_R_ABORTED (2) /* call was aborted */ @@ -272,11 +284,19 @@ struct ceph_mds_request { #define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */ #define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */ #define CEPH_MDS_R_ASYNC (8) /* async request */ +#define CEPH_MDS_R_FSCRYPT_FILE (9) /* must marshal fscrypt_file field */ unsigned long r_req_flags; struct mutex r_fill_mutex; union ceph_mds_request_args r_args; + + struct ceph_fscrypt_auth *r_fscrypt_auth; + u64 r_fscrypt_file; + + u8 *r_altname; /* fscrypt binary crypttext for long filenames */ + u32 r_altname_len; /* length of r_altname */ + int r_fmode; /* file mode, if expecting cap */ int r_request_release_offset; const struct cred *r_cred; @@ -381,8 +401,9 @@ struct cap_wait { }; enum { - CEPH_MDSC_STOPPING_BEGIN = 1, - CEPH_MDSC_STOPPING_FLUSHED = 2, + CEPH_MDSC_STOPPING_BEGIN = 1, + CEPH_MDSC_STOPPING_FLUSHING = 2, + CEPH_MDSC_STOPPING_FLUSHED = 3, }; /* @@ -401,7 +422,11 @@ struct ceph_mds_client { struct ceph_mds_session **sessions; /* NULL for mds if no session */ atomic_t num_sessions; int max_sessions; /* len of sessions array */ - int stopping; /* true if shutting down */ + + spinlock_t stopping_lock; /* protect snap_empty */ + int stopping; /* the stage of shutting down */ + atomic_t stopping_blockers; + struct completion stopping_waiter; atomic64_t quotarealms_count; /* # realms with quota */ /* @@ -557,7 +582,7 @@ static inline void ceph_mdsc_free_path(char *path, int len) } extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, - int stop_on_nosnap); + int for_wire); extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 64592adfe48f..f7fcf7f08ec6 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -47,25 +47,23 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, struct inode *inode; struct ceph_inode_info *ci; + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; + if (msg->front.iov_len < sizeof(*h)) { pr_err("%s corrupt message mds%d len %d\n", __func__, session->s_mds, (int)msg->front.iov_len); ceph_msg_dump(msg); - return; + goto out; } - /* increment msg sequence number */ - mutex_lock(&session->s_mutex); - inc_session_sequence(session); - mutex_unlock(&session->s_mutex); - /* lookup inode */ vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; inode = ceph_find_inode(sb, vino); if (!inode) { pr_warn("Failed to find inode %llu\n", vino.ino); - return; + goto out; } ci = ceph_inode(inode); @@ -78,6 +76,8 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, spin_unlock(&ci->i_ceph_lock); iput(inode); +out: + ceph_dec_mds_stopping_blocker(mdsc); } static struct ceph_quotarealm_inode * diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index c9920ade15f5..813f21add992 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -1015,6 +1015,9 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, int locked_rwsem = 0; bool close_sessions = false; + if (!ceph_inc_mds_stopping_blocker(mdsc, session)) + return; + /* decode */ if (msg->front.iov_len < sizeof(*h)) goto bad; @@ -1030,10 +1033,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, dout("%s from mds%d op %s split %llx tracelen %d\n", __func__, mds, ceph_snap_op_name(op), split, trace_len); - mutex_lock(&session->s_mutex); - inc_session_sequence(session); - mutex_unlock(&session->s_mutex); - down_write(&mdsc->snap_rwsem); locked_rwsem = 1; @@ -1151,6 +1150,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, up_write(&mdsc->snap_rwsem); flush_snaps(mdsc); + ceph_dec_mds_stopping_blocker(mdsc); return; bad: @@ -1160,6 +1160,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, if (locked_rwsem) up_write(&mdsc->snap_rwsem); + ceph_dec_mds_stopping_blocker(mdsc); + if (close_sessions) ceph_mdsc_close_sessions(mdsc); return; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a5f52013314d..2d7f5a8d4a92 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -20,6 +20,7 @@ #include "super.h" #include "mds_client.h" #include "cache.h" +#include "crypto.h" #include #include @@ -46,6 +47,7 @@ static void ceph_put_super(struct super_block *s) struct ceph_fs_client *fsc = ceph_sb_to_client(s); dout("put_super\n"); + ceph_fscrypt_free_dummy_policy(fsc); ceph_mdsc_close_sessions(fsc->mdsc); } @@ -151,6 +153,7 @@ enum { Opt_recover_session, Opt_source, Opt_mon_addr, + Opt_test_dummy_encryption, /* string args above */ Opt_dirstat, Opt_rbytes, @@ -165,6 +168,7 @@ enum { Opt_copyfrom, Opt_wsync, Opt_pagecache, + Opt_sparseread, }; enum ceph_recover_session_mode { @@ -192,6 +196,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_string ("fsc", Opt_fscache), // fsc=... fsparam_flag_no ("ino32", Opt_ino32), fsparam_string ("mds_namespace", Opt_mds_namespace), + fsparam_string ("mon_addr", Opt_mon_addr), fsparam_flag_no ("poolperm", Opt_poolperm), fsparam_flag_no ("quotadf", Opt_quotadf), fsparam_u32 ("rasize", Opt_rasize), @@ -203,10 +208,12 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_u32 ("rsize", Opt_rsize), fsparam_string ("snapdirname", Opt_snapdirname), fsparam_string ("source", Opt_source), - fsparam_string ("mon_addr", Opt_mon_addr), + fsparam_flag ("test_dummy_encryption", Opt_test_dummy_encryption), + fsparam_string ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_u32 ("wsize", Opt_wsize), fsparam_flag_no ("wsync", Opt_wsync), fsparam_flag_no ("pagecache", Opt_pagecache), + fsparam_flag_no ("sparseread", Opt_sparseread), {} }; @@ -576,6 +583,29 @@ static int ceph_parse_mount_param(struct fs_context *fc, else fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE; break; + case Opt_sparseread: + if (result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_SPARSEREAD; + else + fsopt->flags |= CEPH_MOUNT_OPT_SPARSEREAD; + break; + case Opt_test_dummy_encryption: +#ifdef CONFIG_FS_ENCRYPTION + fscrypt_free_dummy_policy(&fsopt->dummy_enc_policy); + ret = fscrypt_parse_test_dummy_encryption(param, + &fsopt->dummy_enc_policy); + if (ret == -EINVAL) { + warnfc(fc, "Value of option \"%s\" is unrecognized", + param->key); + } else if (ret == -EEXIST) { + warnfc(fc, "Conflicting test_dummy_encryption options"); + ret = -EINVAL; + } +#else + warnfc(fc, + "FS encryption not supported: test_dummy_encryption mount option ignored"); +#endif + break; default: BUG(); } @@ -596,6 +626,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) kfree(args->server_path); kfree(args->fscache_uniq); kfree(args->mon_addr); + fscrypt_free_dummy_policy(&args->dummy_enc_policy); kfree(args); } @@ -710,9 +741,12 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)) seq_puts(m, ",wsync"); - if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) seq_puts(m, ",nopagecache"); + if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD) + seq_puts(m, ",sparseread"); + + fscrypt_show_test_dummy_encryption(m, ',', root->d_sb); if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%u", fsopt->wsize); @@ -1052,6 +1086,50 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, return root; } +#ifdef CONFIG_FS_ENCRYPTION +static int ceph_apply_test_dummy_encryption(struct super_block *sb, + struct fs_context *fc, + struct ceph_mount_options *fsopt) +{ + struct ceph_fs_client *fsc = sb->s_fs_info; + + if (!fscrypt_is_dummy_policy_set(&fsopt->dummy_enc_policy)) + return 0; + + /* No changing encryption context on remount. */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && + !fscrypt_is_dummy_policy_set(&fsc->fsc_dummy_enc_policy)) { + if (fscrypt_dummy_policies_equal(&fsopt->dummy_enc_policy, + &fsc->fsc_dummy_enc_policy)) + return 0; + errorfc(fc, "Can't set test_dummy_encryption on remount"); + return -EINVAL; + } + + /* Also make sure fsopt doesn't contain a conflicting value. */ + if (fscrypt_is_dummy_policy_set(&fsc->fsc_dummy_enc_policy)) { + if (fscrypt_dummy_policies_equal(&fsopt->dummy_enc_policy, + &fsc->fsc_dummy_enc_policy)) + return 0; + errorfc(fc, "Conflicting test_dummy_encryption options"); + return -EINVAL; + } + + fsc->fsc_dummy_enc_policy = fsopt->dummy_enc_policy; + memset(&fsopt->dummy_enc_policy, 0, sizeof(fsopt->dummy_enc_policy)); + + warnfc(fc, "test_dummy_encryption mode enabled"); + return 0; +} +#else +static int ceph_apply_test_dummy_encryption(struct super_block *sb, + struct fs_context *fc, + struct ceph_mount_options *fsopt) +{ + return 0; +} +#endif + /* * mount: join the ceph cluster, and open root directory. */ @@ -1080,6 +1158,11 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, goto out; } + err = ceph_apply_test_dummy_encryption(fsc->sb, fc, + fsc->mount_options); + if (err) + goto out; + dout("mount opening path '%s'\n", path); ceph_fs_debugfs_init(fsc); @@ -1101,6 +1184,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, out: mutex_unlock(&fsc->client->mount_mutex); + ceph_fscrypt_free_dummy_policy(fsc); return ERR_PTR(err); } @@ -1126,6 +1210,8 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc) s->s_time_max = U32_MAX; s->s_flags |= SB_NODIRATIME | SB_NOATIME; + ceph_fscrypt_set_ops(s); + ret = set_anon_super_fc(s, fc); if (ret != 0) fsc->sb = NULL; @@ -1287,15 +1373,26 @@ static void ceph_free_fc(struct fs_context *fc) static int ceph_reconfigure_fc(struct fs_context *fc) { + int err; struct ceph_parse_opts_ctx *pctx = fc->fs_private; struct ceph_mount_options *fsopt = pctx->opts; - struct ceph_fs_client *fsc = ceph_sb_to_client(fc->root->d_sb); + struct super_block *sb = fc->root->d_sb; + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + + err = ceph_apply_test_dummy_encryption(sb, fc, fsopt); + if (err) + return err; if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) ceph_set_mount_opt(fsc, ASYNC_DIROPS); else ceph_clear_mount_opt(fsc, ASYNC_DIROPS); + if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD) + ceph_set_mount_opt(fsc, SPARSEREAD); + else + ceph_clear_mount_opt(fsc, SPARSEREAD); + if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) { kfree(fsc->mount_options->mon_addr); fsc->mount_options->mon_addr = fsopt->mon_addr; @@ -1303,7 +1400,7 @@ static int ceph_reconfigure_fc(struct fs_context *fc) pr_notice("ceph: monitor addresses recorded, but not used for reconnection"); } - sync_filesystem(fc->root->d_sb); + sync_filesystem(sb); return 0; } @@ -1365,25 +1462,101 @@ static int ceph_init_fs_context(struct fs_context *fc) return -ENOMEM; } +/* + * Return true if it successfully increases the blocker counter, + * or false if the mdsc is in stopping and flushed state. + */ +static bool __inc_stopping_blocker(struct ceph_mds_client *mdsc) +{ + spin_lock(&mdsc->stopping_lock); + if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING) { + spin_unlock(&mdsc->stopping_lock); + return false; + } + atomic_inc(&mdsc->stopping_blockers); + spin_unlock(&mdsc->stopping_lock); + return true; +} + +static void __dec_stopping_blocker(struct ceph_mds_client *mdsc) +{ + spin_lock(&mdsc->stopping_lock); + if (!atomic_dec_return(&mdsc->stopping_blockers) && + mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING) + complete_all(&mdsc->stopping_waiter); + spin_unlock(&mdsc->stopping_lock); +} + +/* For metadata IO requests */ +bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + mutex_lock(&session->s_mutex); + inc_session_sequence(session); + mutex_unlock(&session->s_mutex); + + return __inc_stopping_blocker(mdsc); +} + +void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc) +{ + __dec_stopping_blocker(mdsc); +} + +/* For data IO requests */ +bool ceph_inc_osd_stopping_blocker(struct ceph_mds_client *mdsc) +{ + return __inc_stopping_blocker(mdsc); +} + +void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc) +{ + __dec_stopping_blocker(mdsc); +} + static void ceph_kill_sb(struct super_block *s) { struct ceph_fs_client *fsc = ceph_sb_to_client(s); + struct ceph_mds_client *mdsc = fsc->mdsc; + bool wait; dout("kill_sb %p\n", s); - ceph_mdsc_pre_umount(fsc->mdsc); + ceph_mdsc_pre_umount(mdsc); flush_fs_workqueues(fsc); /* * Though the kill_anon_super() will finally trigger the - * sync_filesystem() anyway, we still need to do it here - * and then bump the stage of shutdown to stop the work - * queue as earlier as possible. + * sync_filesystem() anyway, we still need to do it here and + * then bump the stage of shutdown. This will allow us to + * drop any further message, which will increase the inodes' + * i_count reference counters but makes no sense any more, + * from MDSs. + * + * Without this when evicting the inodes it may fail in the + * kill_anon_super(), which will trigger a warning when + * destroying the fscrypt keyring and then possibly trigger + * a further crash in ceph module when the iput() tries to + * evict the inodes later. */ sync_filesystem(s); - fsc->mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED; + spin_lock(&mdsc->stopping_lock); + mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHING; + wait = !!atomic_read(&mdsc->stopping_blockers); + spin_unlock(&mdsc->stopping_lock); + if (wait && atomic_read(&mdsc->stopping_blockers)) { + long timeleft = wait_for_completion_killable_timeout( + &mdsc->stopping_waiter, + fsc->client->options->mount_timeout); + if (!timeleft) /* timed out */ + pr_warn("umount timed out, %ld\n", timeleft); + else if (timeleft < 0) /* killed */ + pr_warn("umount was killed, %ld\n", timeleft); + } + + mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED; kill_anon_super(s); fsc->client->extra_mon_dispatch = NULL; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3bfddf34d488..51c7f2b14f6f 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -22,6 +22,7 @@ #include #include +#include "crypto.h" /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines. */ @@ -42,6 +43,7 @@ #define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ #define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */ #define CEPH_MOUNT_OPT_NOPAGECACHE (1<<16) /* bypass pagecache altogether */ +#define CEPH_MOUNT_OPT_SPARSEREAD (1<<17) /* always do sparse reads */ #define CEPH_MOUNT_OPT_DEFAULT \ (CEPH_MOUNT_OPT_DCACHE | \ @@ -98,6 +100,7 @@ struct ceph_mount_options { char *server_path; /* default NULL (means "/") */ char *fscache_uniq; /* default NULL */ char *mon_addr; + struct fscrypt_dummy_policy dummy_enc_policy; }; /* mount state */ @@ -154,9 +157,11 @@ struct ceph_fs_client { #ifdef CONFIG_CEPH_FSCACHE struct fscache_volume *fscache; #endif +#ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_dummy_policy fsc_dummy_enc_policy; +#endif }; - /* * File i/o capability. This tracks shared state with the metadata * server that allows us to cache or writeback attributes or to read @@ -419,6 +424,11 @@ struct ceph_inode_info { u32 i_truncate_seq; /* last truncate to smaller size */ u64 i_truncate_size; /* and the size we last truncated down to */ int i_truncate_pending; /* still need to call vmtruncate */ + /* + * For none fscrypt case it equals to i_truncate_size or it will + * equals to fscrypt_file_size + */ + u64 i_truncate_pagecache_size; u64 i_max_size; /* max file size authorized by mds */ u64 i_reported_size; /* (max_)size reported to or requested of mds */ @@ -449,6 +459,13 @@ struct ceph_inode_info { struct work_struct i_work; unsigned long i_work_mask; + +#ifdef CONFIG_FS_ENCRYPTION + u32 fscrypt_auth_len; + u32 fscrypt_file_len; + u8 *fscrypt_auth; + u8 *fscrypt_file; +#endif }; struct ceph_netfs_request_data { @@ -998,6 +1015,7 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) /* inode.c */ struct ceph_mds_reply_info_in; struct ceph_mds_reply_dirfrag; +struct ceph_acl_sec_ctx; extern const struct inode_operations ceph_file_iops; @@ -1005,8 +1023,14 @@ extern struct inode *ceph_alloc_inode(struct super_block *sb); extern void ceph_evict_inode(struct inode *inode); extern void ceph_free_inode(struct inode *inode); +struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry, + umode_t *mode, struct ceph_acl_sec_ctx *as_ctx); +void ceph_as_ctx_to_req(struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as_ctx); + extern struct inode *ceph_get_inode(struct super_block *sb, - struct ceph_vino vino); + struct ceph_vino vino, + struct inode *newino); extern struct inode *ceph_get_snapdir(struct inode *parent); extern int ceph_fill_file_size(struct inode *inode, int issued, u32 truncate_seq, u64 truncate_size, u64 size); @@ -1065,7 +1089,13 @@ static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) } extern int ceph_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); -extern int __ceph_setattr(struct inode *inode, struct iattr *attr); + +struct ceph_iattr { + struct ceph_fscrypt_auth *fscrypt_auth; +}; + +extern int __ceph_setattr(struct inode *inode, struct iattr *attr, + struct ceph_iattr *cia); extern int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(struct mnt_idmap *idmap, @@ -1099,6 +1129,9 @@ struct ceph_acl_sec_ctx { #ifdef CONFIG_CEPH_FS_SECURITY_LABEL void *sec_ctx; u32 sec_ctxlen; +#endif +#ifdef CONFIG_FS_ENCRYPTION + struct ceph_fscrypt_auth *fscrypt_auth; #endif struct ceph_pagelist *pagelist; }; @@ -1237,6 +1270,8 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, struct inode *dir, int mds, int drop, int unless); +extern int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, + int need, int want, loff_t endoff, int *got); extern int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got); extern int ceph_try_get_caps(struct inode *inode, @@ -1272,6 +1307,9 @@ extern int ceph_renew_caps(struct inode *inode, int fmode); extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode); +extern ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, + struct iov_iter *to, int *retry_op, + u64 *last_objver); extern int ceph_release(struct inode *inode, struct file *filp); extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, char *data, size_t len); @@ -1375,4 +1413,9 @@ extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf); extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc); +bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); +void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc); +bool ceph_inc_osd_stopping_blocker(struct ceph_mds_client *mdsc); +void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc); #endif /* _FS_CEPH_SUPER_H */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 1cbd84cc82a8..0deae4a0f5f1 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -352,6 +352,24 @@ static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci, return ret; } +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +static bool ceph_vxattrcb_fscrypt_auth_exists(struct ceph_inode_info *ci) +{ + return ci->fscrypt_auth_len; +} + +static ssize_t ceph_vxattrcb_fscrypt_auth(struct ceph_inode_info *ci, + char *val, size_t size) +{ + if (size) { + if (size < ci->fscrypt_auth_len) + return -ERANGE; + memcpy(val, ci->fscrypt_auth, ci->fscrypt_auth_len); + } + return ci->fscrypt_auth_len; +} +#endif /* CONFIG_FS_ENCRYPTION */ + #define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name #define CEPH_XATTR_NAME2(_type, _name, _name2) \ XATTR_CEPH_PREFIX #_type "." #_name "." #_name2 @@ -500,6 +518,15 @@ static struct ceph_vxattr ceph_common_vxattrs[] = { .exists_cb = NULL, .flags = VXATTR_FLAG_READONLY, }, +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + { + .name = "ceph.fscrypt.auth", + .name_size = sizeof("ceph.fscrypt.auth"), + .getxattr_cb = ceph_vxattrcb_fscrypt_auth, + .exists_cb = ceph_vxattrcb_fscrypt_auth_exists, + .flags = VXATTR_FLAG_READONLY, + }, +#endif /* CONFIG_FS_ENCRYPTION */ { .name = NULL, 0 } /* Required table terminator */ }; @@ -1407,6 +1434,9 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) #endif #ifdef CONFIG_CEPH_FS_SECURITY_LABEL security_release_secctx(as_ctx->sec_ctx, as_ctx->sec_ctxlen); +#endif +#ifdef CONFIG_FS_ENCRYPTION + kfree(as_ctx->fscrypt_auth); #endif if (as_ctx->pagelist) ceph_pagelist_release(as_ctx->pagelist); diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index 49586ff26152..5f2301ee88bc 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -359,14 +359,19 @@ enum { extern const char *ceph_mds_op_name(int op); - -#define CEPH_SETATTR_MODE 1 -#define CEPH_SETATTR_UID 2 -#define CEPH_SETATTR_GID 4 -#define CEPH_SETATTR_MTIME 8 -#define CEPH_SETATTR_ATIME 16 -#define CEPH_SETATTR_SIZE 32 -#define CEPH_SETATTR_CTIME 64 +#define CEPH_SETATTR_MODE (1 << 0) +#define CEPH_SETATTR_UID (1 << 1) +#define CEPH_SETATTR_GID (1 << 2) +#define CEPH_SETATTR_MTIME (1 << 3) +#define CEPH_SETATTR_ATIME (1 << 4) +#define CEPH_SETATTR_SIZE (1 << 5) +#define CEPH_SETATTR_CTIME (1 << 6) +#define CEPH_SETATTR_MTIME_NOW (1 << 7) +#define CEPH_SETATTR_ATIME_NOW (1 << 8) +#define CEPH_SETATTR_BTIME (1 << 9) +#define CEPH_SETATTR_KILL_SGUID (1 << 10) +#define CEPH_SETATTR_FSCRYPT_AUTH (1 << 11) +#define CEPH_SETATTR_FSCRYPT_FILE (1 << 12) /* * Ceph setxattr request flags. @@ -462,24 +467,26 @@ union ceph_mds_request_args { } __attribute__ ((packed)); union ceph_mds_request_args_ext { - union ceph_mds_request_args old; - struct { - __le32 mode; - __le32 uid; - __le32 gid; - struct ceph_timespec mtime; - struct ceph_timespec atime; - __le64 size, old_size; /* old_size needed by truncate */ - __le32 mask; /* CEPH_SETATTR_* */ - struct ceph_timespec btime; - } __attribute__ ((packed)) setattr_ext; + union { + union ceph_mds_request_args old; + struct { + __le32 mode; + __le32 uid; + __le32 gid; + struct ceph_timespec mtime; + struct ceph_timespec atime; + __le64 size, old_size; /* old_size needed by truncate */ + __le32 mask; /* CEPH_SETATTR_* */ + struct ceph_timespec btime; + } __attribute__ ((packed)) setattr_ext; + }; }; #define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ #define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */ #define CEPH_MDS_FLAG_ASYNC 4 /* request is asynchronous */ -struct ceph_mds_request_head_old { +struct ceph_mds_request_head_legacy { __le64 oldest_client_tid; __le32 mdsmap_epoch; /* on client */ __le32 flags; /* CEPH_MDS_FLAG_* */ @@ -492,9 +499,9 @@ struct ceph_mds_request_head_old { union ceph_mds_request_args args; } __attribute__ ((packed)); -#define CEPH_MDS_REQUEST_HEAD_VERSION 1 +#define CEPH_MDS_REQUEST_HEAD_VERSION 2 -struct ceph_mds_request_head { +struct ceph_mds_request_head_old { __le16 version; /* struct version */ __le64 oldest_client_tid; __le32 mdsmap_epoch; /* on client */ @@ -508,6 +515,23 @@ struct ceph_mds_request_head { union ceph_mds_request_args_ext args; } __attribute__ ((packed)); +struct ceph_mds_request_head { + __le16 version; /* struct version */ + __le64 oldest_client_tid; + __le32 mdsmap_epoch; /* on client */ + __le32 flags; /* CEPH_MDS_FLAG_* */ + __u8 num_retry, num_fwd; /* legacy count retry and fwd attempts */ + __le16 num_releases; /* # include cap/lease release records */ + __le32 op; /* mds op code */ + __le32 caller_uid, caller_gid; + __le64 ino; /* use this ino for openc, mkdir, mknod, + etc. (if replaying) */ + union ceph_mds_request_args_ext args; + + __le32 ext_num_retry; /* new count retry attempts */ + __le32 ext_num_fwd; /* new count fwd attempts */ +} __attribute__ ((packed)); + /* cap/lease release record */ struct ceph_mds_request_release { __le64 ino, cap_id; /* ino and unique cap id */ diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 99c1726be6ee..2eaaabbe98cb 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -17,6 +17,7 @@ struct ceph_msg; struct ceph_connection; +struct ceph_msg_data_cursor; /* * Ceph defines these callbacks for handling connection events. @@ -70,6 +71,30 @@ struct ceph_connection_operations { int used_proto, int result, const int *allowed_protos, int proto_cnt, const int *allowed_modes, int mode_cnt); + + /** + * sparse_read: read sparse data + * @con: connection we're reading from + * @cursor: data cursor for reading extents + * @buf: optional buffer to read into + * + * This should be called more than once, each time setting up to + * receive an extent into the current cursor position, and zeroing + * the holes between them. + * + * Returns amount of data to be read (in bytes), 0 if reading is + * complete, or -errno if there was an error. + * + * If @buf is set on a >0 return, then the data should be read into + * the provided buffer. Otherwise, it should be read into the cursor. + * + * The sparse read operation is expected to initialize the cursor + * with a length covering up to the end of the last extent. + */ + int (*sparse_read)(struct ceph_connection *con, + struct ceph_msg_data_cursor *cursor, + char **buf); + }; /* use format string %s%lld */ @@ -98,6 +123,7 @@ enum ceph_msg_data_type { CEPH_MSG_DATA_BIO, /* data source/destination is a bio list */ #endif /* CONFIG_BLOCK */ CEPH_MSG_DATA_BVECS, /* data source/destination is a bio_vec array */ + CEPH_MSG_DATA_ITER, /* data source/destination is an iov_iter */ }; #ifdef CONFIG_BLOCK @@ -199,6 +225,7 @@ struct ceph_msg_data { bool own_pages; }; struct ceph_pagelist *pagelist; + struct iov_iter iter; }; }; @@ -207,6 +234,7 @@ struct ceph_msg_data_cursor { struct ceph_msg_data *data; /* current data item */ size_t resid; /* bytes not yet consumed */ + int sr_resid; /* residual sparse_read len */ bool need_crc; /* crc update needed */ union { #ifdef CONFIG_BLOCK @@ -222,6 +250,10 @@ struct ceph_msg_data_cursor { struct page *page; /* page from list */ size_t offset; /* bytes from list */ }; + struct { + struct iov_iter iov_iter; + unsigned int lastlen; + }; }; }; @@ -251,6 +283,7 @@ struct ceph_msg { struct kref kref; bool more_to_follow; bool needs_out_seq; + bool sparse_read; int front_alloc_len; struct ceph_msgpool *pool; @@ -309,6 +342,10 @@ struct ceph_connection_v1_info { int in_base_pos; /* bytes read */ + /* sparse reads */ + struct kvec in_sr_kvec; /* current location to receive into */ + u64 in_sr_len; /* amount of data in this extent */ + /* message in temps */ u8 in_tag; /* protocol control byte */ struct ceph_msg_header in_hdr; @@ -395,6 +432,7 @@ struct ceph_connection_v2_info { void *conn_bufs[16]; int conn_buf_cnt; + int data_len_remain; struct kvec in_sign_kvecs[8]; struct kvec out_sign_kvecs[8]; @@ -573,6 +611,8 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos, #endif /* CONFIG_BLOCK */ void ceph_msg_data_add_bvecs(struct ceph_msg *msg, struct ceph_bvec_iter *bvec_pos); +void ceph_msg_data_add_iter(struct ceph_msg *msg, + struct iov_iter *iter); struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items, gfp_t flags, bool can_fail); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index fb6be72104df..bf9823956758 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -29,14 +29,62 @@ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *); #define CEPH_HOMELESS_OSD -1 -/* a given osd we're communicating with */ +/* + * A single extent in a SPARSE_READ reply. + * + * Note that these come from the OSD as little-endian values. On BE arches, + * we convert them in-place after receipt. + */ +struct ceph_sparse_extent { + u64 off; + u64 len; +} __packed; + +/* Sparse read state machine state values */ +enum ceph_sparse_read_state { + CEPH_SPARSE_READ_HDR = 0, + CEPH_SPARSE_READ_EXTENTS, + CEPH_SPARSE_READ_DATA_LEN, + CEPH_SPARSE_READ_DATA, +}; + +/* + * A SPARSE_READ reply is a 32-bit count of extents, followed by an array of + * 64-bit offset/length pairs, and then all of the actual file data + * concatenated after it (sans holes). + * + * Unfortunately, we don't know how long the extent array is until we've + * started reading the data section of the reply. The caller should send down + * a destination buffer for the array, but we'll alloc one if it's too small + * or if the caller doesn't. + */ +struct ceph_sparse_read { + enum ceph_sparse_read_state sr_state; /* state machine state */ + u64 sr_req_off; /* orig request offset */ + u64 sr_req_len; /* orig request length */ + u64 sr_pos; /* current pos in buffer */ + int sr_index; /* current extent index */ + __le32 sr_datalen; /* length of actual data */ + u32 sr_count; /* extent count in reply */ + int sr_ext_len; /* length of extent array */ + struct ceph_sparse_extent *sr_extent; /* extent array */ +}; + +/* + * A given osd we're communicating with. + * + * Note that the o_requests tree can be searched while holding the "lock" mutex + * or the "o_requests_lock" spinlock. Insertion or removal requires both! + */ struct ceph_osd { refcount_t o_ref; + int o_sparse_op_idx; struct ceph_osd_client *o_osdc; int o_osd; int o_incarnation; struct rb_node o_node; struct ceph_connection o_con; + spinlock_t o_requests_lock; struct rb_root o_requests; struct rb_root o_linger_requests; struct rb_root o_backoff_mappings; @@ -46,6 +94,7 @@ struct ceph_osd { unsigned long lru_ttl; struct list_head o_keepalive_item; struct mutex lock; + struct ceph_sparse_read o_sparse_read; }; #define CEPH_OSD_SLAB_OPS 2 @@ -59,6 +108,7 @@ enum ceph_osd_data_type { CEPH_OSD_DATA_TYPE_BIO, #endif /* CONFIG_BLOCK */ CEPH_OSD_DATA_TYPE_BVECS, + CEPH_OSD_DATA_TYPE_ITER, }; struct ceph_osd_data { @@ -82,6 +132,7 @@ struct ceph_osd_data { struct ceph_bvec_iter bvec_pos; u32 num_bvecs; }; + struct iov_iter iter; }; }; @@ -98,6 +149,8 @@ struct ceph_osd_req_op { u64 offset, length; u64 truncate_size; u32 truncate_seq; + int sparse_ext_cnt; + struct ceph_sparse_extent *sparse_ext; struct ceph_osd_data osd_data; } extent; struct { @@ -145,6 +198,9 @@ struct ceph_osd_req_op { u32 src_fadvise_flags; struct ceph_osd_data osd_data; } copy_from; + struct { + u64 ver; + } assert_ver; }; }; @@ -199,6 +255,7 @@ struct ceph_osd_request { struct ceph_osd_client *r_osdc; struct kref r_kref; bool r_mempool; + bool r_linger; /* don't resend on failure */ struct completion r_completion; /* private to osd_client.c */ ceph_osdc_callback_t r_callback; @@ -211,9 +268,9 @@ struct ceph_osd_request { struct ceph_snap_context *r_snapc; /* for writes */ struct timespec64 r_mtime; /* ditto */ u64 r_data_offset; /* ditto */ - bool r_linger; /* don't resend on failure */ /* internal */ + u64 r_version; /* data version sent in reply */ unsigned long r_stamp; /* jiffies, send or check time */ unsigned long r_start_stamp; /* jiffies */ ktime_t r_start_latency; /* ktime_t */ @@ -450,6 +507,8 @@ void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req, void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, unsigned int which, struct ceph_bvec_iter *bvec_pos); +void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req, + unsigned int which, struct iov_iter *iter); extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *, unsigned int which, @@ -504,6 +563,20 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, u32 truncate_seq, u64 truncate_size, bool use_mempool); +int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt); + +/* + * How big an extent array should we preallocate for a sparse read? This is + * just a starting value. If we get more than this back from the OSD, the + * receiver will reallocate. + */ +#define CEPH_SPARSE_EXT_ARRAY_INITIAL 16 + +static inline int ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op) +{ + return __ceph_alloc_sparse_ext_map(op, CEPH_SPARSE_EXT_ARRAY_INITIAL); +} + extern void ceph_osdc_get_request(struct ceph_osd_request *req); extern void ceph_osdc_put_request(struct ceph_osd_request *req); @@ -558,5 +631,19 @@ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc, struct ceph_object_locator *oloc, struct ceph_watch_item **watchers, u32 *num_watchers); -#endif +/* Find offset into the buffer of the end of the extent map */ +static inline u64 ceph_sparse_ext_map_end(struct ceph_osd_req_op *op) +{ + struct ceph_sparse_extent *ext; + + /* No extents? No data */ + if (op->extent.sparse_ext_cnt == 0) + return 0; + + ext = &op->extent.sparse_ext[op->extent.sparse_ext_cnt - 1]; + + return ext->off + ext->len - op->extent.offset; +} + +#endif diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 43a7a1573b51..73c3efbec36c 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -523,6 +523,10 @@ struct ceph_osd_op { struct { __le64 cookie; } __attribute__ ((packed)) notify; + struct { + __le64 unused; + __le64 ver; + } __attribute__ ((packed)) assert_ver; struct { __le64 offset, length; __le64 src_offset; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 5eb4898cccd4..10a41cd9c523 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -969,6 +969,62 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, return true; } +static void ceph_msg_data_iter_cursor_init(struct ceph_msg_data_cursor *cursor, + size_t length) +{ + struct ceph_msg_data *data = cursor->data; + + cursor->iov_iter = data->iter; + cursor->lastlen = 0; + iov_iter_truncate(&cursor->iov_iter, length); + cursor->resid = iov_iter_count(&cursor->iov_iter); +} + +static struct page *ceph_msg_data_iter_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length) +{ + struct page *page; + ssize_t len; + + if (cursor->lastlen) + iov_iter_revert(&cursor->iov_iter, cursor->lastlen); + + len = iov_iter_get_pages2(&cursor->iov_iter, &page, PAGE_SIZE, + 1, page_offset); + BUG_ON(len < 0); + + cursor->lastlen = len; + + /* + * FIXME: The assumption is that the pages represented by the iov_iter + * are pinned, with the references held by the upper-level + * callers, or by virtue of being under writeback. Eventually, + * we'll get an iov_iter_get_pages2 variant that doesn't take + * page refs. Until then, just put the page ref. + */ + VM_BUG_ON_PAGE(!PageWriteback(page) && page_count(page) < 2, page); + put_page(page); + + *length = min_t(size_t, len, cursor->resid); + return page; +} + +static bool ceph_msg_data_iter_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + BUG_ON(bytes > cursor->resid); + cursor->resid -= bytes; + + if (bytes < cursor->lastlen) { + cursor->lastlen -= bytes; + } else { + iov_iter_advance(&cursor->iov_iter, bytes - cursor->lastlen); + cursor->lastlen = 0; + } + + return cursor->resid; +} + /* * Message data is handled (sent or received) in pieces, where each * piece resides on a single page. The network layer might not @@ -996,6 +1052,9 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) case CEPH_MSG_DATA_BVECS: ceph_msg_data_bvecs_cursor_init(cursor, length); break; + case CEPH_MSG_DATA_ITER: + ceph_msg_data_iter_cursor_init(cursor, length); + break; case CEPH_MSG_DATA_NONE: default: /* BUG(); */ @@ -1013,6 +1072,7 @@ void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, cursor->total_resid = length; cursor->data = msg->data; + cursor->sr_resid = 0; __ceph_msg_data_cursor_init(cursor); } @@ -1042,6 +1102,9 @@ struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, case CEPH_MSG_DATA_BVECS: page = ceph_msg_data_bvecs_next(cursor, page_offset, length); break; + case CEPH_MSG_DATA_ITER: + page = ceph_msg_data_iter_next(cursor, page_offset, length); + break; case CEPH_MSG_DATA_NONE: default: page = NULL; @@ -1080,6 +1143,9 @@ void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) case CEPH_MSG_DATA_BVECS: new_piece = ceph_msg_data_bvecs_advance(cursor, bytes); break; + case CEPH_MSG_DATA_ITER: + new_piece = ceph_msg_data_iter_advance(cursor, bytes); + break; case CEPH_MSG_DATA_NONE: default: BUG(); @@ -1879,6 +1945,18 @@ void ceph_msg_data_add_bvecs(struct ceph_msg *msg, } EXPORT_SYMBOL(ceph_msg_data_add_bvecs); +void ceph_msg_data_add_iter(struct ceph_msg *msg, + struct iov_iter *iter) +{ + struct ceph_msg_data *data; + + data = ceph_msg_data_add(msg); + data->type = CEPH_MSG_DATA_ITER; + data->iter = *iter; + + msg->data_length += iov_iter_count(&data->iter); +} + /* * construct a new message with given type, size * the new msg has a ref count of 1. diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index 3d57bb48a2b4..f9a50d7f0d20 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -159,9 +159,9 @@ static size_t sizeof_footer(struct ceph_connection *con) static void prepare_message_data(struct ceph_msg *msg, u32 data_len) { - /* Initialize data cursor */ - - ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); + /* Initialize data cursor if it's not a sparse read */ + if (!msg->sparse_read) + ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); } /* @@ -960,9 +960,9 @@ static void process_ack(struct ceph_connection *con) prepare_read_tag(con); } -static int read_partial_message_section(struct ceph_connection *con, - struct kvec *section, - unsigned int sec_len, u32 *crc) +static int read_partial_message_chunk(struct ceph_connection *con, + struct kvec *section, + unsigned int sec_len, u32 *crc) { int ret, left; @@ -978,11 +978,91 @@ static int read_partial_message_section(struct ceph_connection *con, section->iov_len += ret; } if (section->iov_len == sec_len) - *crc = crc32c(0, section->iov_base, section->iov_len); + *crc = crc32c(*crc, section->iov_base, section->iov_len); return 1; } +static inline int read_partial_message_section(struct ceph_connection *con, + struct kvec *section, + unsigned int sec_len, u32 *crc) +{ + *crc = 0; + return read_partial_message_chunk(con, section, sec_len, crc); +} + +static int read_sparse_msg_extent(struct ceph_connection *con, u32 *crc) +{ + struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; + bool do_bounce = ceph_test_opt(from_msgr(con->msgr), RXBOUNCE); + + if (do_bounce && unlikely(!con->bounce_page)) { + con->bounce_page = alloc_page(GFP_NOIO); + if (!con->bounce_page) { + pr_err("failed to allocate bounce page\n"); + return -ENOMEM; + } + } + + while (cursor->sr_resid > 0) { + struct page *page, *rpage; + size_t off, len; + int ret; + + page = ceph_msg_data_next(cursor, &off, &len); + rpage = do_bounce ? con->bounce_page : page; + + /* clamp to what remains in extent */ + len = min_t(int, len, cursor->sr_resid); + ret = ceph_tcp_recvpage(con->sock, rpage, (int)off, len); + if (ret <= 0) + return ret; + *crc = ceph_crc32c_page(*crc, rpage, off, ret); + ceph_msg_data_advance(cursor, (size_t)ret); + cursor->sr_resid -= ret; + if (do_bounce) + memcpy_page(page, off, rpage, off, ret); + } + return 1; +} + +static int read_sparse_msg_data(struct ceph_connection *con) +{ + struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; + bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); + u32 crc = 0; + int ret = 1; + + if (do_datacrc) + crc = con->in_data_crc; + + do { + if (con->v1.in_sr_kvec.iov_base) + ret = read_partial_message_chunk(con, + &con->v1.in_sr_kvec, + con->v1.in_sr_len, + &crc); + else if (cursor->sr_resid > 0) + ret = read_sparse_msg_extent(con, &crc); + + if (ret <= 0) { + if (do_datacrc) + con->in_data_crc = crc; + return ret; + } + + memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec)); + ret = con->ops->sparse_read(con, cursor, + (char **)&con->v1.in_sr_kvec.iov_base); + con->v1.in_sr_len = ret; + } while (ret > 0); + + if (do_datacrc) + con->in_data_crc = crc; + + return ret < 0 ? ret : 1; /* must return > 0 to indicate success */ +} + static int read_partial_msg_data(struct ceph_connection *con) { struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor; @@ -1173,7 +1253,9 @@ static int read_partial_message(struct ceph_connection *con) if (!m->num_data_items) return -EIO; - if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) + if (m->sparse_read) + ret = read_sparse_msg_data(con); + else if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) ret = read_partial_msg_data_bounce(con); else ret = read_partial_msg_data(con); diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index 1df1d29dee92..d09a39ff2cf0 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -8,9 +8,9 @@ #include #include -#include /* for crypto_memneq() */ #include #include +#include #include #include #include @@ -52,14 +52,16 @@ #define FRAME_LATE_STATUS_COMPLETE 0xe #define FRAME_LATE_STATUS_ABORTED_MASK 0xf -#define IN_S_HANDLE_PREAMBLE 1 -#define IN_S_HANDLE_CONTROL 2 -#define IN_S_HANDLE_CONTROL_REMAINDER 3 -#define IN_S_PREPARE_READ_DATA 4 -#define IN_S_PREPARE_READ_DATA_CONT 5 -#define IN_S_PREPARE_READ_ENC_PAGE 6 -#define IN_S_HANDLE_EPILOGUE 7 -#define IN_S_FINISH_SKIP 8 +#define IN_S_HANDLE_PREAMBLE 1 +#define IN_S_HANDLE_CONTROL 2 +#define IN_S_HANDLE_CONTROL_REMAINDER 3 +#define IN_S_PREPARE_READ_DATA 4 +#define IN_S_PREPARE_READ_DATA_CONT 5 +#define IN_S_PREPARE_READ_ENC_PAGE 6 +#define IN_S_PREPARE_SPARSE_DATA 7 +#define IN_S_PREPARE_SPARSE_DATA_CONT 8 +#define IN_S_HANDLE_EPILOGUE 9 +#define IN_S_FINISH_SKIP 10 #define OUT_S_QUEUE_DATA 1 #define OUT_S_QUEUE_DATA_CONT 2 @@ -967,12 +969,48 @@ static void init_sgs_cursor(struct scatterlist **sg, } } +/** + * init_sgs_pages: set up scatterlist on an array of page pointers + * @sg: scatterlist to populate + * @pages: pointer to page array + * @dpos: position in the array to start (bytes) + * @dlen: len to add to sg (bytes) + * @pad: pointer to pad destination (if any) + * + * Populate the scatterlist from the page array, starting at an arbitrary + * byte in the array and running for a specified length. + */ +static void init_sgs_pages(struct scatterlist **sg, struct page **pages, + int dpos, int dlen, u8 *pad) +{ + int idx = dpos >> PAGE_SHIFT; + int off = offset_in_page(dpos); + int resid = dlen; + + do { + int len = min(resid, (int)PAGE_SIZE - off); + + sg_set_page(*sg, pages[idx], len, off); + *sg = sg_next(*sg); + off = 0; + ++idx; + resid -= len; + } while (resid); + + if (need_padding(dlen)) { + sg_set_buf(*sg, pad, padding_len(dlen)); + *sg = sg_next(*sg); + } +} + static int setup_message_sgs(struct sg_table *sgt, struct ceph_msg *msg, u8 *front_pad, u8 *middle_pad, u8 *data_pad, - void *epilogue, bool add_tag) + void *epilogue, struct page **pages, int dpos, + bool add_tag) { struct ceph_msg_data_cursor cursor; struct scatterlist *cur_sg; + int dlen = data_len(msg); int sg_cnt; int ret; @@ -986,9 +1024,15 @@ static int setup_message_sgs(struct sg_table *sgt, struct ceph_msg *msg, if (middle_len(msg)) sg_cnt += calc_sg_cnt(msg->middle->vec.iov_base, middle_len(msg)); - if (data_len(msg)) { - ceph_msg_data_cursor_init(&cursor, msg, data_len(msg)); - sg_cnt += calc_sg_cnt_cursor(&cursor); + if (dlen) { + if (pages) { + sg_cnt += calc_pages_for(dpos, dlen); + if (need_padding(dlen)) + sg_cnt++; + } else { + ceph_msg_data_cursor_init(&cursor, msg, dlen); + sg_cnt += calc_sg_cnt_cursor(&cursor); + } } ret = sg_alloc_table(sgt, sg_cnt, GFP_NOIO); @@ -1002,9 +1046,13 @@ static int setup_message_sgs(struct sg_table *sgt, struct ceph_msg *msg, if (middle_len(msg)) init_sgs(&cur_sg, msg->middle->vec.iov_base, middle_len(msg), middle_pad); - if (data_len(msg)) { - ceph_msg_data_cursor_init(&cursor, msg, data_len(msg)); - init_sgs_cursor(&cur_sg, &cursor, data_pad); + if (dlen) { + if (pages) { + init_sgs_pages(&cur_sg, pages, dpos, dlen, data_pad); + } else { + ceph_msg_data_cursor_init(&cursor, msg, dlen); + init_sgs_cursor(&cur_sg, &cursor, data_pad); + } } WARN_ON(!sg_is_last(cur_sg)); @@ -1039,10 +1087,53 @@ static int decrypt_control_remainder(struct ceph_connection *con) padded_len(rem_len) + CEPH_GCM_TAG_LEN); } +/* Process sparse read data that lives in a buffer */ +static int process_v2_sparse_read(struct ceph_connection *con, + struct page **pages, int spos) +{ + struct ceph_msg_data_cursor *cursor = &con->v2.in_cursor; + int ret; + + for (;;) { + char *buf = NULL; + + ret = con->ops->sparse_read(con, cursor, &buf); + if (ret <= 0) + return ret; + + dout("%s: sparse_read return %x buf %p\n", __func__, ret, buf); + + do { + int idx = spos >> PAGE_SHIFT; + int soff = offset_in_page(spos); + struct page *spage = con->v2.in_enc_pages[idx]; + int len = min_t(int, ret, PAGE_SIZE - soff); + + if (buf) { + memcpy_from_page(buf, spage, soff, len); + buf += len; + } else { + struct bio_vec bv; + + get_bvec_at(cursor, &bv); + len = min_t(int, len, bv.bv_len); + memcpy_page(bv.bv_page, bv.bv_offset, + spage, soff, len); + ceph_msg_data_advance(cursor, len); + } + spos += len; + ret -= len; + } while (ret); + } +} + static int decrypt_tail(struct ceph_connection *con) { struct sg_table enc_sgt = {}; struct sg_table sgt = {}; + struct page **pages = NULL; + bool sparse = con->in_msg->sparse_read; + int dpos = 0; int tail_len; int ret; @@ -1053,9 +1144,14 @@ static int decrypt_tail(struct ceph_connection *con) if (ret) goto out; + if (sparse) { + dpos = padded_len(front_len(con->in_msg) + padded_len(middle_len(con->in_msg))); + pages = con->v2.in_enc_pages; + } + ret = setup_message_sgs(&sgt, con->in_msg, FRONT_PAD(con->v2.in_buf), - MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf), - con->v2.in_buf, true); + MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf), + con->v2.in_buf, pages, dpos, true); if (ret) goto out; @@ -1065,6 +1161,12 @@ static int decrypt_tail(struct ceph_connection *con) if (ret) goto out; + if (sparse && data_len(con->in_msg)) { + ret = process_v2_sparse_read(con, con->v2.in_enc_pages, dpos); + if (ret) + goto out; + } + WARN_ON(!con->v2.in_enc_page_cnt); ceph_release_page_vector(con->v2.in_enc_pages, con->v2.in_enc_page_cnt); @@ -1588,7 +1690,7 @@ static int prepare_message_secure(struct ceph_connection *con) encode_epilogue_secure(con, false); ret = setup_message_sgs(&sgt, con->out_msg, zerop, zerop, zerop, - &con->v2.out_epil, false); + &con->v2.out_epil, NULL, 0, false); if (ret) goto out; @@ -1825,6 +1927,123 @@ static void prepare_read_data_cont(struct ceph_connection *con) con->v2.in_state = IN_S_HANDLE_EPILOGUE; } +static int prepare_sparse_read_cont(struct ceph_connection *con) +{ + int ret; + struct bio_vec bv; + char *buf = NULL; + struct ceph_msg_data_cursor *cursor = &con->v2.in_cursor; + + WARN_ON(con->v2.in_state != IN_S_PREPARE_SPARSE_DATA_CONT); + + if (iov_iter_is_bvec(&con->v2.in_iter)) { + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { + con->in_data_crc = crc32c(con->in_data_crc, + page_address(con->bounce_page), + con->v2.in_bvec.bv_len); + get_bvec_at(cursor, &bv); + memcpy_to_page(bv.bv_page, bv.bv_offset, + page_address(con->bounce_page), + con->v2.in_bvec.bv_len); + } else { + con->in_data_crc = ceph_crc32c_page(con->in_data_crc, + con->v2.in_bvec.bv_page, + con->v2.in_bvec.bv_offset, + con->v2.in_bvec.bv_len); + } + + ceph_msg_data_advance(cursor, con->v2.in_bvec.bv_len); + cursor->sr_resid -= con->v2.in_bvec.bv_len; + dout("%s: advance by 0x%x sr_resid 0x%x\n", __func__, + con->v2.in_bvec.bv_len, cursor->sr_resid); + WARN_ON_ONCE(cursor->sr_resid > cursor->total_resid); + if (cursor->sr_resid) { + get_bvec_at(cursor, &bv); + if (bv.bv_len > cursor->sr_resid) + bv.bv_len = cursor->sr_resid; + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { + bv.bv_page = con->bounce_page; + bv.bv_offset = 0; + } + set_in_bvec(con, &bv); + con->v2.data_len_remain -= bv.bv_len; + return 0; + } + } else if (iov_iter_is_kvec(&con->v2.in_iter)) { + /* On first call, we have no kvec so don't compute crc */ + if (con->v2.in_kvec_cnt) { + WARN_ON_ONCE(con->v2.in_kvec_cnt > 1); + con->in_data_crc = crc32c(con->in_data_crc, + con->v2.in_kvecs[0].iov_base, + con->v2.in_kvecs[0].iov_len); + } + } else { + return -EIO; + } + + /* get next extent */ + ret = con->ops->sparse_read(con, cursor, &buf); + if (ret <= 0) { + if (ret < 0) + return ret; + + reset_in_kvecs(con); + add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); + con->v2.in_state = IN_S_HANDLE_EPILOGUE; + return 0; + } + + if (buf) { + /* receive into buffer */ + reset_in_kvecs(con); + add_in_kvec(con, buf, ret); + con->v2.data_len_remain -= ret; + return 0; + } + + if (ret > cursor->total_resid) { + pr_warn("%s: ret 0x%x total_resid 0x%zx resid 0x%zx\n", + __func__, ret, cursor->total_resid, cursor->resid); + return -EIO; + } + get_bvec_at(cursor, &bv); + if (bv.bv_len > cursor->sr_resid) + bv.bv_len = cursor->sr_resid; + if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { + if (unlikely(!con->bounce_page)) { + con->bounce_page = alloc_page(GFP_NOIO); + if (!con->bounce_page) { + pr_err("failed to allocate bounce page\n"); + return -ENOMEM; + } + } + + bv.bv_page = con->bounce_page; + bv.bv_offset = 0; + } + set_in_bvec(con, &bv); + con->v2.data_len_remain -= ret; + return ret; +} + +static int prepare_sparse_read_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->in_msg; + + dout("%s: starting sparse read\n", __func__); + + if (WARN_ON_ONCE(!con->ops->sparse_read)) + return -EOPNOTSUPP; + + if (!con_secure(con)) + con->in_data_crc = -1; + + reset_in_kvecs(con); + con->v2.in_state = IN_S_PREPARE_SPARSE_DATA_CONT; + con->v2.data_len_remain = data_len(msg); + return prepare_sparse_read_cont(con); +} + static int prepare_read_tail_plain(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; @@ -1845,7 +2064,10 @@ static int prepare_read_tail_plain(struct ceph_connection *con) } if (data_len(msg)) { - con->v2.in_state = IN_S_PREPARE_READ_DATA; + if (msg->sparse_read) + con->v2.in_state = IN_S_PREPARE_SPARSE_DATA; + else + con->v2.in_state = IN_S_PREPARE_READ_DATA; } else { add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN); con->v2.in_state = IN_S_HANDLE_EPILOGUE; @@ -2898,6 +3120,12 @@ static int populate_in_iter(struct ceph_connection *con) prepare_read_enc_page(con); ret = 0; break; + case IN_S_PREPARE_SPARSE_DATA: + ret = prepare_sparse_read_data(con); + break; + case IN_S_PREPARE_SPARSE_DATA_CONT: + ret = prepare_sparse_read_cont(con); + break; case IN_S_HANDLE_EPILOGUE: ret = handle_epilogue(con); break; @@ -3489,6 +3717,23 @@ static void revoke_at_prepare_read_enc_page(struct ceph_connection *con) con->v2.in_state = IN_S_FINISH_SKIP; } +static void revoke_at_prepare_sparse_data(struct ceph_connection *con) +{ + int resid; /* current piece of data */ + int remaining; + + WARN_ON(con_secure(con)); + WARN_ON(!data_len(con->in_msg)); + WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter)); + resid = iov_iter_count(&con->v2.in_iter); + dout("%s con %p resid %d\n", __func__, con, resid); + + remaining = CEPH_EPILOGUE_PLAIN_LEN + con->v2.data_len_remain; + con->v2.in_iter.count -= resid; + set_in_skip(con, resid + remaining); + con->v2.in_state = IN_S_FINISH_SKIP; +} + static void revoke_at_handle_epilogue(struct ceph_connection *con) { int resid; @@ -3505,6 +3750,7 @@ static void revoke_at_handle_epilogue(struct ceph_connection *con) void ceph_con_v2_revoke_incoming(struct ceph_connection *con) { switch (con->v2.in_state) { + case IN_S_PREPARE_SPARSE_DATA: case IN_S_PREPARE_READ_DATA: revoke_at_prepare_read_data(con); break; @@ -3514,6 +3760,9 @@ void ceph_con_v2_revoke_incoming(struct ceph_connection *con) case IN_S_PREPARE_READ_ENC_PAGE: revoke_at_prepare_read_enc_page(con); break; + case IN_S_PREPARE_SPARSE_DATA_CONT: + revoke_at_prepare_sparse_data(con); + break; case IN_S_HANDLE_EPILOGUE: revoke_at_handle_epilogue(con); break; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 658a6f2320cf..d3a759e052c8 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -171,6 +171,13 @@ static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data, osd_data->num_bvecs = num_bvecs; } +static void ceph_osd_iter_init(struct ceph_osd_data *osd_data, + struct iov_iter *iter) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_ITER; + osd_data->iter = *iter; +} + static struct ceph_osd_data * osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which) { @@ -264,6 +271,22 @@ void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos); +/** + * osd_req_op_extent_osd_iter - Set up an operation with an iterator buffer + * @osd_req: The request to set up + * @which: Index of the operation in which to set the iter + * @iter: The buffer iterator + */ +void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req, + unsigned int which, struct iov_iter *iter) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); + ceph_osd_iter_init(osd_data, iter); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_iter); + static void osd_req_op_cls_request_info_pagelist( struct ceph_osd_request *osd_req, unsigned int which, struct ceph_pagelist *pagelist) @@ -346,6 +369,8 @@ static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data) #endif /* CONFIG_BLOCK */ case CEPH_OSD_DATA_TYPE_BVECS: return osd_data->bvec_pos.iter.bi_size; + case CEPH_OSD_DATA_TYPE_ITER: + return iov_iter_count(&osd_data->iter); default: WARN(true, "unrecognized data type %d\n", (int)osd_data->type); return 0; @@ -376,8 +401,10 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, switch (op->op) { case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SPARSE_READ: case CEPH_OSD_OP_WRITE: case CEPH_OSD_OP_WRITEFULL: + kfree(op->extent.sparse_ext); ceph_osd_data_release(&op->extent.osd_data); break; case CEPH_OSD_OP_CALL: @@ -669,6 +696,7 @@ static void get_num_data_items(struct ceph_osd_request *req, /* reply */ case CEPH_OSD_OP_STAT: case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SPARSE_READ: case CEPH_OSD_OP_LIST_WATCHERS: *num_reply_data_items += 1; break; @@ -738,7 +766,7 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req, BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO && - opcode != CEPH_OSD_OP_TRUNCATE); + opcode != CEPH_OSD_OP_TRUNCATE && opcode != CEPH_OSD_OP_SPARSE_READ); op->extent.offset = offset; op->extent.length = length; @@ -951,6 +979,8 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg, #endif } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BVECS) { ceph_msg_data_add_bvecs(msg, &osd_data->bvec_pos); + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_ITER) { + ceph_msg_data_add_iter(msg, &osd_data->iter); } else { BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); } @@ -963,6 +993,7 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, case CEPH_OSD_OP_STAT: break; case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SPARSE_READ: case CEPH_OSD_OP_WRITE: case CEPH_OSD_OP_WRITEFULL: case CEPH_OSD_OP_ZERO: @@ -1017,6 +1048,10 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, dst->copy_from.src_fadvise_flags = cpu_to_le32(src->copy_from.src_fadvise_flags); break; + case CEPH_OSD_OP_ASSERT_VER: + dst->assert_ver.unused = cpu_to_le64(0); + dst->assert_ver.ver = cpu_to_le64(src->assert_ver.ver); + break; default: pr_err("unsupported osd opcode %s\n", ceph_osd_op_name(src->op)); @@ -1059,7 +1094,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE && - opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE); + opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE && + opcode != CEPH_OSD_OP_SPARSE_READ); req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, GFP_NOFS); @@ -1100,15 +1136,30 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, if (flags & CEPH_OSD_FLAG_WRITE) req->r_data_offset = off; - if (num_ops > 1) + if (num_ops > 1) { + int num_req_ops, num_rep_ops; + /* - * This is a special case for ceph_writepages_start(), but it - * also covers ceph_uninline_data(). If more multi-op request - * use cases emerge, we will need a separate helper. + * If this is a multi-op write request, assume that we'll need + * request ops. If it's a multi-op read then assume we'll need + * reply ops. Anything else and call it -EINVAL. */ - r = __ceph_osdc_alloc_messages(req, GFP_NOFS, num_ops, 0); - else + if (flags & CEPH_OSD_FLAG_WRITE) { + num_req_ops = num_ops; + num_rep_ops = 0; + } else if (flags & CEPH_OSD_FLAG_READ) { + num_req_ops = 0; + num_rep_ops = num_ops; + } else { + r = -EINVAL; + goto fail; + } + + r = __ceph_osdc_alloc_messages(req, GFP_NOFS, num_req_ops, + num_rep_ops); + } else { r = ceph_osdc_alloc_messages(req, GFP_NOFS); + } if (r) goto fail; @@ -1120,6 +1171,18 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_new_request); +int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt) +{ + op->extent.sparse_ext_cnt = cnt; + op->extent.sparse_ext = kmalloc_array(cnt, + sizeof(*op->extent.sparse_ext), + GFP_NOFS); + if (!op->extent.sparse_ext) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL(__ceph_alloc_sparse_ext_map); + /* * We keep osd requests in an rbtree, sorted by ->r_tid. */ @@ -1177,6 +1240,7 @@ static void osd_init(struct ceph_osd *osd) { refcount_set(&osd->o_ref, 1); RB_CLEAR_NODE(&osd->o_node); + spin_lock_init(&osd->o_requests_lock); osd->o_requests = RB_ROOT; osd->o_linger_requests = RB_ROOT; osd->o_backoff_mappings = RB_ROOT; @@ -1187,6 +1251,13 @@ static void osd_init(struct ceph_osd *osd) mutex_init(&osd->lock); } +static void ceph_init_sparse_read(struct ceph_sparse_read *sr) +{ + kfree(sr->sr_extent); + memset(sr, '\0', sizeof(*sr)); + sr->sr_state = CEPH_SPARSE_READ_HDR; +} + static void osd_cleanup(struct ceph_osd *osd) { WARN_ON(!RB_EMPTY_NODE(&osd->o_node)); @@ -1197,6 +1268,8 @@ static void osd_cleanup(struct ceph_osd *osd) WARN_ON(!list_empty(&osd->o_osd_lru)); WARN_ON(!list_empty(&osd->o_keepalive_item)); + ceph_init_sparse_read(&osd->o_sparse_read); + if (osd->o_auth.authorizer) { WARN_ON(osd_homeless(osd)); ceph_auth_destroy_authorizer(osd->o_auth.authorizer); @@ -1216,6 +1289,9 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) osd_init(osd); osd->o_osdc = osdc; osd->o_osd = onum; + osd->o_sparse_op_idx = -1; + + ceph_init_sparse_read(&osd->o_sparse_read); ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); @@ -1406,7 +1482,9 @@ static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req) atomic_inc(&osd->o_osdc->num_homeless); get_osd(osd); + spin_lock(&osd->o_requests_lock); insert_request(&osd->o_requests, req); + spin_unlock(&osd->o_requests_lock); req->r_osd = osd; } @@ -1418,7 +1496,9 @@ static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req) req, req->r_tid); req->r_osd = NULL; + spin_lock(&osd->o_requests_lock); erase_request(&osd->o_requests, req); + spin_unlock(&osd->o_requests_lock); put_osd(osd); if (!osd_homeless(osd)) @@ -2016,6 +2096,7 @@ static void setup_request_data(struct ceph_osd_request *req) &op->raw_data_in); break; case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_SPARSE_READ: ceph_osdc_msg_data_add(reply_msg, &op->extent.osd_data); break; @@ -2435,8 +2516,10 @@ static void finish_request(struct ceph_osd_request *req) req->r_end_latency = ktime_get(); - if (req->r_osd) + if (req->r_osd) { + ceph_init_sparse_read(&req->r_osd->o_sparse_read); unlink_request(req->r_osd, req); + } atomic_dec(&osdc->num_requests); /* @@ -3795,6 +3878,7 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) * one (type of) reply back. */ WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK)); + req->r_version = m.user_version; req->r_result = m.result ?: data_len; finish_request(req); mutex_unlock(&osd->lock); @@ -5348,6 +5432,24 @@ static void osd_dispatch(struct ceph_connection *con, struct ceph_msg *msg) ceph_msg_put(msg); } +/* How much sparse data was requested? */ +static u64 sparse_data_requested(struct ceph_osd_request *req) +{ + u64 len = 0; + + if (req->r_flags & CEPH_OSD_FLAG_READ) { + int i; + + for (i = 0; i < req->r_num_ops; ++i) { + struct ceph_osd_req_op *op = &req->r_ops[i]; + + if (op->op == CEPH_OSD_OP_SPARSE_READ) + len += op->extent.length; + } + } + return len; +} + /* * Lookup and return message for incoming reply. Don't try to do * anything about a larger than preallocated data portion of the @@ -5364,6 +5466,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, int front_len = le32_to_cpu(hdr->front_len); int data_len = le32_to_cpu(hdr->data_len); u64 tid = le64_to_cpu(hdr->tid); + u64 srlen; down_read(&osdc->lock); if (!osd_registered(osd)) { @@ -5396,7 +5499,8 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, req->r_reply = m; } - if (data_len > req->r_reply->data_length) { + srlen = sparse_data_requested(req); + if (!srlen && data_len > req->r_reply->data_length) { pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n", __func__, osd->o_osd, req->r_tid, data_len, req->r_reply->data_length); @@ -5406,6 +5510,8 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, } m = ceph_msg_get(req->r_reply); + m->sparse_read = (bool)srlen; + dout("get_reply tid %lld %p\n", tid, m); out_unlock_session: @@ -5638,9 +5744,217 @@ static int osd_check_message_signature(struct ceph_msg *msg) return ceph_auth_check_message_signature(auth, msg); } +static void advance_cursor(struct ceph_msg_data_cursor *cursor, size_t len, + bool zero) +{ + while (len) { + struct page *page; + size_t poff, plen; + + page = ceph_msg_data_next(cursor, &poff, &plen); + if (plen > len) + plen = len; + if (zero) + zero_user_segment(page, poff, poff + plen); + len -= plen; + ceph_msg_data_advance(cursor, plen); + } +} + +static int prep_next_sparse_read(struct ceph_connection *con, + struct ceph_msg_data_cursor *cursor) +{ + struct ceph_osd *o = con->private; + struct ceph_sparse_read *sr = &o->o_sparse_read; + struct ceph_osd_request *req; + struct ceph_osd_req_op *op; + + spin_lock(&o->o_requests_lock); + req = lookup_request(&o->o_requests, le64_to_cpu(con->in_msg->hdr.tid)); + if (!req) { + spin_unlock(&o->o_requests_lock); + return -EBADR; + } + + if (o->o_sparse_op_idx < 0) { + u64 srlen = sparse_data_requested(req); + + dout("%s: [%d] starting new sparse read req. srlen=0x%llx\n", + __func__, o->o_osd, srlen); + ceph_msg_data_cursor_init(cursor, con->in_msg, srlen); + } else { + u64 end; + + op = &req->r_ops[o->o_sparse_op_idx]; + + WARN_ON_ONCE(op->extent.sparse_ext); + + /* hand back buffer we took earlier */ + op->extent.sparse_ext = sr->sr_extent; + sr->sr_extent = NULL; + op->extent.sparse_ext_cnt = sr->sr_count; + sr->sr_ext_len = 0; + dout("%s: [%d] completed extent array len %d cursor->resid %zd\n", + __func__, o->o_osd, op->extent.sparse_ext_cnt, cursor->resid); + /* Advance to end of data for this operation */ + end = ceph_sparse_ext_map_end(op); + if (end < sr->sr_req_len) + advance_cursor(cursor, sr->sr_req_len - end, false); + } + + ceph_init_sparse_read(sr); + + /* find next op in this request (if any) */ + while (++o->o_sparse_op_idx < req->r_num_ops) { + op = &req->r_ops[o->o_sparse_op_idx]; + if (op->op == CEPH_OSD_OP_SPARSE_READ) + goto found; + } + + /* reset for next sparse read request */ + spin_unlock(&o->o_requests_lock); + o->o_sparse_op_idx = -1; + return 0; +found: + sr->sr_req_off = op->extent.offset; + sr->sr_req_len = op->extent.length; + sr->sr_pos = sr->sr_req_off; + dout("%s: [%d] new sparse read op at idx %d 0x%llx~0x%llx\n", __func__, + o->o_osd, o->o_sparse_op_idx, sr->sr_req_off, sr->sr_req_len); + + /* hand off request's sparse extent map buffer */ + sr->sr_ext_len = op->extent.sparse_ext_cnt; + op->extent.sparse_ext_cnt = 0; + sr->sr_extent = op->extent.sparse_ext; + op->extent.sparse_ext = NULL; + + spin_unlock(&o->o_requests_lock); + return 1; +} + +#ifdef __BIG_ENDIAN +static inline void convert_extent_map(struct ceph_sparse_read *sr) +{ + int i; + + for (i = 0; i < sr->sr_count; i++) { + struct ceph_sparse_extent *ext = &sr->sr_extent[i]; + + ext->off = le64_to_cpu((__force __le64)ext->off); + ext->len = le64_to_cpu((__force __le64)ext->len); + } +} +#else +static inline void convert_extent_map(struct ceph_sparse_read *sr) +{ +} +#endif + +#define MAX_EXTENTS 4096 + +static int osd_sparse_read(struct ceph_connection *con, + struct ceph_msg_data_cursor *cursor, + char **pbuf) +{ + struct ceph_osd *o = con->private; + struct ceph_sparse_read *sr = &o->o_sparse_read; + u32 count = sr->sr_count; + u64 eoff, elen; + int ret; + + switch (sr->sr_state) { + case CEPH_SPARSE_READ_HDR: +next_op: + ret = prep_next_sparse_read(con, cursor); + if (ret <= 0) + return ret; + + /* number of extents */ + ret = sizeof(sr->sr_count); + *pbuf = (char *)&sr->sr_count; + sr->sr_state = CEPH_SPARSE_READ_EXTENTS; + break; + case CEPH_SPARSE_READ_EXTENTS: + /* Convert sr_count to host-endian */ + count = le32_to_cpu((__force __le32)sr->sr_count); + sr->sr_count = count; + dout("[%d] got %u extents\n", o->o_osd, count); + + if (count > 0) { + if (!sr->sr_extent || count > sr->sr_ext_len) { + /* + * Apply a hard cap to the number of extents. + * If we have more, assume something is wrong. + */ + if (count > MAX_EXTENTS) { + dout("%s: OSD returned 0x%x extents in a single reply!\n", + __func__, count); + return -EREMOTEIO; + } + + /* no extent array provided, or too short */ + kfree(sr->sr_extent); + sr->sr_extent = kmalloc_array(count, + sizeof(*sr->sr_extent), + GFP_NOIO); + if (!sr->sr_extent) + return -ENOMEM; + sr->sr_ext_len = count; + } + ret = count * sizeof(*sr->sr_extent); + *pbuf = (char *)sr->sr_extent; + sr->sr_state = CEPH_SPARSE_READ_DATA_LEN; + break; + } + /* No extents? Read data len */ + fallthrough; + case CEPH_SPARSE_READ_DATA_LEN: + convert_extent_map(sr); + ret = sizeof(sr->sr_datalen); + *pbuf = (char *)&sr->sr_datalen; + sr->sr_state = CEPH_SPARSE_READ_DATA; + break; + case CEPH_SPARSE_READ_DATA: + if (sr->sr_index >= count) { + sr->sr_state = CEPH_SPARSE_READ_HDR; + goto next_op; + } + + eoff = sr->sr_extent[sr->sr_index].off; + elen = sr->sr_extent[sr->sr_index].len; + + dout("[%d] ext %d off 0x%llx len 0x%llx\n", + o->o_osd, sr->sr_index, eoff, elen); + + if (elen > INT_MAX) { + dout("Sparse read extent length too long (0x%llx)\n", + elen); + return -EREMOTEIO; + } + + /* zero out anything from sr_pos to start of extent */ + if (sr->sr_pos < eoff) + advance_cursor(cursor, eoff - sr->sr_pos, true); + + /* Set position to end of extent */ + sr->sr_pos = eoff + elen; + + /* send back the new length and nullify the ptr */ + cursor->sr_resid = elen; + ret = elen; + *pbuf = NULL; + + /* Bump the array index */ + ++sr->sr_index; + break; + } + return ret; +} + static const struct ceph_connection_operations osd_con_ops = { .get = osd_get_con, .put = osd_put_con, + .sparse_read = osd_sparse_read, .alloc_msg = osd_alloc_msg, .dispatch = osd_dispatch, .fault = osd_fault,