From 79aec9844de339531f05b019644ccaf5dd777144 Mon Sep 17 00:00:00 2001 From: Sam Lang Date: Wed, 19 Dec 2012 09:44:23 -1000 Subject: [PATCH 01/63] ceph: Check for err on mds request in atomic_open The error returned by ceph_mdsc_do_request includes errors sending the request, errors on timeout, or any errors coming from the mds. If ceph_mdsc_do_request returns an error, the reply struct will most likely be bogus. We need to bail out and propogate the error instead of overwriting it. Signed-off-by: Sam Lang Reviewed-by: Sage Weil --- fs/ceph/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d415096800a6..2c71cbd78332 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -243,6 +243,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, req); + if (err) + goto out_err; + err = ceph_handle_snapdir(req, dentry, err); if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); From 6e8575faa8fa680d59404a4d58d12190667be815 Mon Sep 17 00:00:00 2001 From: Sam Lang Date: Fri, 28 Dec 2012 09:56:46 -0800 Subject: [PATCH 02/63] ceph: Check for created flag in response from mds The mds now sends back a created inode if the create request performed the create. If the file already existed, no inode is returned in the reply. This allows ceph to set the created flag in atomic_open so that permissions are properly checked in the case that the file wasn't created by the create call to the mds. To ensure compability with previous kernels, a feature for sending back the inode in the create reply was added, so that the mds will only send back the inode if the client indicates it supports the feature. Signed-off-by: Sam Lang Reviewed-by: Sage Weil --- fs/ceph/file.c | 3 +++ fs/ceph/mds_client.c | 33 ++++++++++++++++++++++++++++-- fs/ceph/mds_client.h | 6 ++++++ include/linux/ceph/ceph_features.h | 5 ++++- 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 2c71cbd78332..22b5b71b5401 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -266,6 +266,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, err = finish_no_open(file, dn); } else { dout("atomic_open finish_open on dn %p\n", dn); + if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { + *opened |= FILE_CREATED; + } err = finish_open(file, dentry, ceph_open, opened); } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 9165eb8309eb..d95842036c8b 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -232,6 +232,30 @@ bad: return -EIO; } +/* + * parse create results + */ +static int parse_reply_info_create(void **p, void *end, + struct ceph_mds_reply_info_parsed *info, + int features) +{ + if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { + if (*p == end) { + info->has_create_ino = false; + } else { + info->has_create_ino = true; + info->ino = ceph_decode_64(p); + } + } + + if (unlikely(*p != end)) + goto bad; + return 0; + +bad: + return -EIO; +} + /* * parse extra results */ @@ -241,8 +265,12 @@ static int parse_reply_info_extra(void **p, void *end, { if (info->head->op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); - else + else if (info->head->op == CEPH_MDS_OP_READDIR) return parse_reply_info_dir(p, end, info, features); + else if (info->head->op == CEPH_MDS_OP_CREATE) + return parse_reply_info_create(p, end, info, features); + else + return -EIO; } /* @@ -2170,7 +2198,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_lock(&req->r_fill_mutex); err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); if (err == 0) { - if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK && + if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || + req->r_op == CEPH_MDS_OP_LSSNAP) && rinfo->dir_nr) ceph_readdir_prepopulate(req, req->r_session); ceph_unreserve_caps(mdsc, &req->r_caps_reservation); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index dd26846dd71d..567f7c60354e 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -74,6 +74,12 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_info_in *dir_in; u8 dir_complete, dir_end; }; + + /* for create results */ + struct { + bool has_create_ino; + u64 ino; + }; }; /* encoded blob describing snapshot contexts for certain diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index dad579b0c0e6..6b7c6acbb3bf 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -14,13 +14,16 @@ #define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) /* bits 8-17 defined by user-space; not supported yet here */ #define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) +/* bits 19-25 defined by user-space; not supported yet here */ +#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27) /* * Features supported. */ #define CEPH_FEATURES_SUPPORTED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ - CEPH_FEATURE_CRUSH_TUNABLES) + CEPH_FEATURE_CRUSH_TUNABLES | \ + CEPH_FEATURE_REPLY_CREATE_INODE) #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR) From a41bad1a9b9f9982eb9b451165724c5f81096683 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 30 Nov 2012 13:49:51 +0800 Subject: [PATCH 03/63] ceph: re-calculate truncate_size for strip object Otherwise osd may truncate the object to larger size. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- net/ceph/osd_client.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index eb9a44478764..267f183b801a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -76,8 +76,16 @@ int ceph_calc_raw_layout(struct ceph_osd_client *osdc, orig_len - *plen, off, *plen); if (op_has_extent(op->op)) { + u32 osize = le32_to_cpu(layout->fl_object_size); op->extent.offset = objoff; op->extent.length = objlen; + if (op->extent.truncate_size <= off - objoff) { + op->extent.truncate_size = 0; + } else { + op->extent.truncate_size -= off - objoff; + if (op->extent.truncate_size > osize) + op->extent.truncate_size = osize; + } } req->r_num_pages = calc_pages_for(off, *plen); req->r_page_alignment = off & ~PAGE_MASK; From 8a92a119b292012a9bd920b908c3e9f1c512291d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 4 Jan 2013 14:28:07 +0800 Subject: [PATCH 04/63] ceph: move dirty inode to migrating list when clearing auth caps Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index a1d9bb30c1bf..a9fe2d5784c9 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -611,8 +611,16 @@ retry: if (flags & CEPH_CAP_FLAG_AUTH) ci->i_auth_cap = cap; - else if (ci->i_auth_cap == cap) + else if (ci->i_auth_cap == cap) { ci->i_auth_cap = NULL; + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + dout(" moving %p to cap_dirty_migrating\n", inode); + list_move(&ci->i_dirty_item, + &mdsc->cap_dirty_migrating); + } + spin_unlock(&mdsc->cap_dirty_lock); + } dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", inode, ceph_vinop(inode), cap, ceph_cap_string(issued), From 395c312b9c535d57db122cbb5b7292223561d0b8 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 4 Jan 2013 14:37:57 +0800 Subject: [PATCH 05/63] ceph: allow revoking duplicated caps issued by non-auth MDS Allow revoking duplicated caps issued by non-auth MDS if these caps are also issued by auth MDS. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index a9fe2d5784c9..76b19239c426 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1468,7 +1468,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; - int file_wanted, used; + int file_wanted, used, cap_used; int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ int issued, implemented, want, retain, revoking, flushing = 0; int mds = -1; /* keep track of how far we've gone through i_caps list @@ -1571,9 +1571,14 @@ retry_locked: /* NOTE: no side-effects allowed, until we take s_mutex */ + cap_used = used; + if (ci->i_auth_cap && cap != ci->i_auth_cap) + cap_used &= ~ci->i_auth_cap->issued; + revoking = cap->implemented & ~cap->issued; - dout(" mds%d cap %p issued %s implemented %s revoking %s\n", + dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", cap->mds, cap, ceph_cap_string(cap->issued), + ceph_cap_string(cap_used), ceph_cap_string(cap->implemented), ceph_cap_string(revoking)); @@ -1601,7 +1606,7 @@ retry_locked: } /* completed revocation? going down and there are no caps? */ - if (revoking && (revoking & used) == 0) { + if (revoking && (revoking & cap_used) == 0) { dout("completed revocation of %s\n", ceph_cap_string(cap->implemented & ~cap->issued)); goto ack; @@ -1678,8 +1683,8 @@ ack: sent++; /* __send_cap drops i_ceph_lock */ - delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, - retain, flushing, NULL); + delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, + want, retain, flushing, NULL); goto retry; /* retake i_ceph_lock and restart our cap scan. */ } From 66f58691c5c820283dd7e4d6fe8649033ed43ceb Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 4 Jan 2013 14:45:18 +0800 Subject: [PATCH 06/63] ceph: allocate cap_release message when receiving cap import When client wants to release an imported cap, it's possible there is no reserved cap_release message in corresponding mds session. so __queue_cap_release causes kernel panic. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 76b19239c426..40b5bbe63a39 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2833,6 +2833,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); + if (op == CEPH_CAP_OP_IMPORT) + ceph_add_cap_releases(mdsc, session); + /* lookup ino */ inode = ceph_find_inode(sb, vino); ci = ceph_inode(inode); From 390306c38dd43908f7f7730229999790a773d1d5 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 4 Jan 2013 15:30:10 +0800 Subject: [PATCH 07/63] ceph: check mds_wanted for imported cap The MDS may have incorrect wanted caps after importing caps. So the client should check the value mds has and send cap update if necessary. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 40b5bbe63a39..1e1e02055a2b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2429,7 +2429,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, dout("mds wanted %s -> %s\n", ceph_cap_string(le32_to_cpu(grant->wanted)), ceph_cap_string(wanted)); - grant->wanted = cpu_to_le32(wanted); + /* imported cap may not have correct mds_wanted */ + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) + check_caps = 1; } cap->seq = seq; From 1604f488ac2dcce33c8218e75a000e8c5fb57e61 Mon Sep 17 00:00:00 2001 From: Jim Schutt Date: Fri, 30 Nov 2012 09:15:25 -0700 Subject: [PATCH 08/63] libceph: for chooseleaf rules, retry CRUSH map descent from root if leaf is failed Add libceph support for a new CRUSH tunable recently added to Ceph servers. Consider the CRUSH rule step chooseleaf firstn 0 type This rule means that replicas will be chosen in a manner such that each chosen leaf's branch will contain a unique instance of . When an object is re-replicated after a leaf failure, if the CRUSH map uses a chooseleaf rule the remapped replica ends up under the bucket that held the failed leaf. This causes uneven data distribution across the storage cluster, to the point that when all the leaves but one fail under a particular bucket, that remaining leaf holds all the data from its failed peers. This behavior also limits the number of peers that can participate in the re-replication of the data held by the failed leaf, which increases the time required to re-replicate after a failure. For a chooseleaf CRUSH rule, the tree descent has two steps: call them the inner and outer descents. If the tree descent down to is the outer descent, and the descent from down to a leaf is the inner descent, the issue is that a down leaf is detected on the inner descent, so only the inner descent is retried. In order to disperse re-replicated data as widely as possible across a storage cluster after a failure, we want to retry the outer descent. So, fix up crush_choose() to allow the inner descent to return immediately on choosing a failed leaf. Wire this up as a new CRUSH tunable. Note that after this change, for a chooseleaf rule, if the primary OSD in a placement group has failed, choosing a replacement may result in one of the other OSDs in the PG colliding with the new primary. This requires that OSD's data for that PG to need moving as well. This seems unavoidable but should be relatively rare. This corresponds to ceph.git commit 88f218181a9e6d2292e2697fc93797d0f6d6e5dc. Signed-off-by: Jim Schutt Reviewed-by: Sage Weil --- include/linux/ceph/ceph_features.h | 7 +++++-- include/linux/crush/crush.h | 2 ++ net/ceph/crush/mapper.c | 13 ++++++++++--- net/ceph/osdmap.c | 6 ++++++ 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 6b7c6acbb3bf..2160aab482f6 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -14,7 +14,9 @@ #define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) /* bits 8-17 defined by user-space; not supported yet here */ #define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) -/* bits 19-25 defined by user-space; not supported yet here */ +/* bits 19-24 defined by user-space; not supported yet here */ +#define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25) +/* bit 26 defined by user-space; not supported yet here */ #define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27) /* @@ -22,7 +24,8 @@ */ #define CEPH_FEATURES_SUPPORTED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ - CEPH_FEATURE_CRUSH_TUNABLES | \ + CEPH_FEATURE_CRUSH_TUNABLES | \ + CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_REPLY_CREATE_INODE) #define CEPH_FEATURES_REQUIRED_DEFAULT \ diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 25baa287cff7..6a1101f24cfb 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -162,6 +162,8 @@ struct crush_map { __u32 choose_local_fallback_tries; /* choose attempts before giving up */ __u32 choose_total_tries; + /* attempt chooseleaf inner descent once; on failure retry outer descent */ + __u32 chooseleaf_descend_once; }; diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 35fce755ce10..96c8a58937db 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -287,6 +287,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in * @outpos: our position in that vector * @firstn: true if choosing "first n" items, false if choosing "indep" * @recurse_to_leaf: true if we want one device under each item of given type + * @descend_once: true if we should only try one descent before giving up * @out2: second output vector for leaf items (if @recurse_to_leaf) */ static int crush_choose(const struct crush_map *map, @@ -295,7 +296,7 @@ static int crush_choose(const struct crush_map *map, int x, int numrep, int type, int *out, int outpos, int firstn, int recurse_to_leaf, - int *out2) + int descend_once, int *out2) { int rep; unsigned int ftotal, flocal; @@ -399,6 +400,7 @@ static int crush_choose(const struct crush_map *map, x, outpos+1, 0, out2, outpos, firstn, 0, + map->chooseleaf_descend_once, NULL) <= outpos) /* didn't get leaf */ reject = 1; @@ -422,7 +424,10 @@ reject: ftotal++; flocal++; - if (collide && flocal <= map->choose_local_tries) + if (reject && descend_once) + /* let outer call try again */ + skip_rep = 1; + else if (collide && flocal <= map->choose_local_tries) /* retry locally a few times */ retry_bucket = 1; else if (map->choose_local_fallback_tries > 0 && @@ -485,6 +490,7 @@ int crush_do_rule(const struct crush_map *map, int i, j; int numrep; int firstn; + const int descend_once = 0; if ((__u32)ruleno >= map->max_rules) { dprintk(" bad ruleno %d\n", ruleno); @@ -544,7 +550,8 @@ int crush_do_rule(const struct crush_map *map, curstep->arg2, o+osize, j, firstn, - recurse_to_leaf, c+osize); + recurse_to_leaf, + descend_once, c+osize); } if (recurse_to_leaf) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index de73214b5d26..ca05871635bc 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -170,6 +170,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end) c->choose_local_tries = 2; c->choose_local_fallback_tries = 5; c->choose_total_tries = 19; + c->chooseleaf_descend_once = 0; ceph_decode_need(p, end, 4*sizeof(u32), bad); magic = ceph_decode_32(p); @@ -336,6 +337,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end) dout("crush decode tunable choose_total_tries = %d", c->choose_total_tries); + ceph_decode_need(p, end, sizeof(u32), done); + c->chooseleaf_descend_once = ceph_decode_32(p); + dout("crush decode tunable chooseleaf_descend_once = %d", + c->chooseleaf_descend_once); + done: dout("crush_decode success\n"); return c; From 7d7c1f6136bac00174842f845babe7fb3483724e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 15 Jan 2013 18:49:09 -0800 Subject: [PATCH 09/63] crush: avoid recursion if we have already collided This saves us some cycles, but does not affect the placement result at all. This corresponds to ceph.git commit 4abb53d4f. Signed-off-by: Sage Weil --- net/ceph/crush/mapper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 96c8a58937db..cbd06a91941c 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -392,7 +392,7 @@ static int crush_choose(const struct crush_map *map, } reject = 0; - if (recurse_to_leaf) { + if (!collide && recurse_to_leaf) { if (item < 0) { if (crush_choose(map, map->buckets[-1-item], From c3acb18196cf3d7d3db6a5121c1bc674c3fba31f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 7 Dec 2012 09:57:58 -0600 Subject: [PATCH 10/63] libceph: reformat __reset_osd() Reformat __reset_osd() into three distinct blocks of code handling the three return cases. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 267f183b801a..eade41bb7102 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -747,31 +747,35 @@ static void remove_old_osds(struct ceph_osd_client *osdc) */ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) { - struct ceph_osd_request *req; - int ret = 0; + struct ceph_entity_addr *peer_addr; dout("__reset_osd %p osd%d\n", osd, osd->o_osd); if (list_empty(&osd->o_requests) && list_empty(&osd->o_linger_requests)) { __remove_osd(osdc, osd); - ret = -ENODEV; - } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd], - &osd->o_con.peer_addr, - sizeof(osd->o_con.peer_addr)) == 0 && - !ceph_con_opened(&osd->o_con)) { + + return -ENODEV; + } + + peer_addr = &osdc->osdmap->osd_addr[osd->o_osd]; + if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) && + !ceph_con_opened(&osd->o_con)) { + struct ceph_osd_request *req; + dout(" osd addr hasn't changed and connection never opened," " letting msgr retry"); /* touch each r_stamp for handle_timeout()'s benfit */ list_for_each_entry(req, &osd->o_requests, r_osd_item) req->r_stamp = jiffies; - ret = -EAGAIN; - } else { - ceph_con_close(&osd->o_con); - ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, - &osdc->osdmap->osd_addr[osd->o_osd]); - osd->o_incarnation++; + + return -EAGAIN; } - return ret; + + ceph_con_close(&osd->o_con); + ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr); + osd->o_incarnation++; + + return 0; } static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) From c66c6e0c0b04ce4a152fe02c562dd0752665d580 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 1 Nov 2012 08:39:26 -0500 Subject: [PATCH 11/63] rbd: document rbd_spec structure I promised Josh I would document whether there were any restrictions needed for accessing fields of an rbd_spec structure. This adds a big block of comments that documents the structure and how it is used--including the fact that we don't attempt to synchronize access to it. Signed-off-by: Alex Elder Reviewed-by: David Zafman Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 89576a0b3f2e..128978c6a4e0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -119,7 +119,26 @@ struct rbd_image_header { * An rbd image specification. * * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely - * identify an image. + * identify an image. Each rbd_dev structure includes a pointer to + * an rbd_spec structure that encapsulates this identity. + * + * Each of the id's in an rbd_spec has an associated name. For a + * user-mapped image, the names are supplied and the id's associated + * with them are looked up. For a layered image, a parent image is + * defined by the tuple, and the names are looked up. + * + * An rbd_dev structure contains a parent_spec pointer which is + * non-null if the image it represents is a child in a layered + * image. This pointer will refer to the rbd_spec structure used + * by the parent rbd_dev for its own identity (i.e., the structure + * is shared between the parent and child). + * + * Since these structures are populated once, during the discovery + * phase of image construction, they are effectively immutable so + * we make no effort to synchronize access to them. + * + * Note that code herein does not assume the image name is known (it + * could be a null pointer). */ struct rbd_spec { u64 pool_id; From 69e7a02f633ba7de0aefa87f3f0e3b31e953b09a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 1 Nov 2012 08:39:26 -0500 Subject: [PATCH 12/63] rbd: kill rbd_spec->image_name_len There may have been a benefit to hanging on to the length of an image name before, but there is really none now. The only time it's used is when probing for rbd images, so we can just compute the length then. Signed-off-by: Alex Elder Reviewed-by: David Zafman Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 128978c6a4e0..a002984891d7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -147,7 +147,6 @@ struct rbd_spec { char *image_id; size_t image_id_len; char *image_name; - size_t image_name_len; u64 snap_id; char *snap_name; @@ -2563,15 +2562,15 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev) rbd_assert(!rbd_dev->spec->image_name); - image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len; + len = strlen(rbd_dev->spec->image_id); + image_id_size = sizeof (__le32) + len; image_id = kmalloc(image_id_size, GFP_KERNEL); if (!image_id) return NULL; p = image_id; end = (char *) image_id + image_id_size; - ceph_encode_string(&p, end, rbd_dev->spec->image_id, - (u32) rbd_dev->spec->image_id_len); + ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len); size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; reply_buf = kmalloc(size, GFP_KERNEL); @@ -2631,14 +2630,12 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) /* Fetch the image name; tolerate failure here */ name = rbd_dev_image_name(rbd_dev); - if (name) { - rbd_dev->spec->image_name_len = strlen(name); + if (name) rbd_dev->spec->image_name = (char *) name; - } else { + else pr_warning(RBD_DRV_NAME "%d " "unable to get image name for image id %s\n", rbd_dev->major, rbd_dev->spec->image_id); - } /* Look up the snapshot name. */ @@ -3252,7 +3249,7 @@ static int rbd_add_parse_args(const char *buf, if (!*spec->pool_name) goto out_err; /* Missing pool name */ - spec->image_name = dup_token(&buf, &spec->image_name_len); + spec->image_name = dup_token(&buf, NULL); if (!spec->image_name) goto out_mem; if (!*spec->image_name) @@ -3342,7 +3339,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) * First, see if the format 2 image id file exists, and if * so, get the image's persistent id from it. */ - size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len; + size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); object_name = kmalloc(size, GFP_NOIO); if (!object_name) return -ENOMEM; @@ -3400,7 +3397,7 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) /* Record the header object name for this rbd image. */ - size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX); + size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX); rbd_dev->header_name = kmalloc(size, GFP_KERNEL); if (!rbd_dev->header_name) { ret = -ENOMEM; From 979ed480a2722ad8d9f87054635158f652a1241e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 1 Nov 2012 08:39:26 -0500 Subject: [PATCH 13/63] rbd: kill rbd_spec->image_id_len There is no real benefit to keeping the length of an image id, so get rid of it. Signed-off-by: Alex Elder Reviewed-by: David Zafman Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a002984891d7..e01dbb12ad03 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -145,7 +145,6 @@ struct rbd_spec { char *pool_name; char *image_id; - size_t image_id_len; char *image_name; u64 snap_id; @@ -2492,7 +2491,6 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) void *end; char *image_id; u64 overlap; - size_t len = 0; int ret; parent_spec = rbd_spec_alloc(); @@ -2526,13 +2524,12 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) if (parent_spec->pool_id == CEPH_NOPOOL) goto out; /* No parent? No problem. */ - image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); + image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); if (IS_ERR(image_id)) { ret = PTR_ERR(image_id); goto out_err; } parent_spec->image_id = image_id; - parent_spec->image_id_len = len; ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err); ceph_decode_64_safe(&p, end, overlap, out_err); @@ -3368,8 +3365,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) p = response; rbd_dev->spec->image_id = ceph_extract_encoded_string(&p, p + RBD_IMAGE_ID_LEN_MAX, - &rbd_dev->spec->image_id_len, - GFP_NOIO); + NULL, GFP_NOIO); if (IS_ERR(rbd_dev->spec->image_id)) { ret = PTR_ERR(rbd_dev->spec->image_id); rbd_dev->spec->image_id = NULL; @@ -3393,7 +3389,6 @@ static int rbd_dev_v1_probe(struct rbd_device *rbd_dev) rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL); if (!rbd_dev->spec->image_id) return -ENOMEM; - rbd_dev->spec->image_id_len = 0; /* Record the header object name for this rbd image. */ @@ -3443,7 +3438,7 @@ static int rbd_dev_v2_probe(struct rbd_device *rbd_dev) * Image id was filled in by the caller. Record the header * object name for this rbd image. */ - size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len; + size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id); rbd_dev->header_name = kmalloc(size, GFP_KERNEL); if (!rbd_dev->header_name) return -ENOMEM; From 4caf35f9ecdca1feef1d2e5e223b78e52ffbea87 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 1 Nov 2012 08:39:27 -0500 Subject: [PATCH 14/63] rbd: use kmemdup() This replaces two kmalloc()/memcpy() combinations with a single call to kmemdup(). Signed-off-by: Alex Elder Reviewed-by: David Zafman Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e01dbb12ad03..d97611e2b4ee 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3151,11 +3151,9 @@ static inline char *dup_token(const char **buf, size_t *lenp) size_t len; len = next_token(buf); - dup = kmalloc(len + 1, GFP_KERNEL); + dup = kmemdup(*buf, len + 1, GFP_KERNEL); if (!dup) return NULL; - - memcpy(dup, *buf, len); *(dup + len) = '\0'; *buf += len; @@ -3264,10 +3262,9 @@ static int rbd_add_parse_args(const char *buf, ret = -ENAMETOOLONG; goto out_err; } - spec->snap_name = kmalloc(len + 1, GFP_KERNEL); + spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL); if (!spec->snap_name) goto out_mem; - memcpy(spec->snap_name, buf, len); *(spec->snap_name + len) = '\0'; /* Initialize all rbd options to the defaults */ From dd5f049dbdf973d9bceebef1fd73647a5ede6732 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 1 Nov 2012 08:39:27 -0500 Subject: [PATCH 15/63] ceph: define ceph_encode_8_safe() It's kind of a silly macro, but ceph_encode_8_safe() is the only one missing from an otherwise pretty complete set. It's not used, but neither are a couple of the others in this set. While in there, insert some whitespace to tidy up the alignment of the line-terminating backslashes in some of the macro definitions. Signed-off-by: Alex Elder Reviewed-by: Dan Mick --- include/linux/ceph/decode.h | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 4bbf2db45f46..cd679f2d348b 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -52,10 +52,10 @@ static inline int ceph_has_room(void **p, void *end, size_t n) return end >= *p && n <= end - *p; } -#define ceph_decode_need(p, end, n, bad) \ - do { \ - if (!likely(ceph_has_room(p, end, n))) \ - goto bad; \ +#define ceph_decode_need(p, end, n, bad) \ + do { \ + if (!likely(ceph_has_room(p, end, n))) \ + goto bad; \ } while (0) #define ceph_decode_64_safe(p, end, v, bad) \ @@ -99,8 +99,8 @@ static inline int ceph_has_room(void **p, void *end, size_t n) * * There are two possible failures: * - converting the string would require accessing memory at or - * beyond the "end" pointer provided (-E - * - memory could not be allocated for the result + * beyond the "end" pointer provided (-ERANGE) + * - memory could not be allocated for the result (-ENOMEM) */ static inline char *ceph_extract_encoded_string(void **p, void *end, size_t *lenp, gfp_t gfp) @@ -217,10 +217,10 @@ static inline void ceph_encode_string(void **p, void *end, *p += len; } -#define ceph_encode_need(p, end, n, bad) \ - do { \ - if (!likely(ceph_has_room(p, end, n))) \ - goto bad; \ +#define ceph_encode_need(p, end, n, bad) \ + do { \ + if (!likely(ceph_has_room(p, end, n))) \ + goto bad; \ } while (0) #define ceph_encode_64_safe(p, end, v, bad) \ @@ -231,12 +231,17 @@ static inline void ceph_encode_string(void **p, void *end, #define ceph_encode_32_safe(p, end, v, bad) \ do { \ ceph_encode_need(p, end, sizeof(u32), bad); \ - ceph_encode_32(p, v); \ + ceph_encode_32(p, v); \ } while (0) #define ceph_encode_16_safe(p, end, v, bad) \ do { \ ceph_encode_need(p, end, sizeof(u16), bad); \ - ceph_encode_16(p, v); \ + ceph_encode_16(p, v); \ + } while (0) +#define ceph_encode_8_safe(p, end, v, bad) \ + do { \ + ceph_encode_need(p, end, sizeof(u8), bad); \ + ceph_encode_8(p, v); \ } while (0) #define ceph_encode_copy_safe(p, end, pv, n, bad) \ From 06ecc6cbf7a60cd5abd9fd2bda8ae69b395c2be3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 1 Nov 2012 10:17:15 -0500 Subject: [PATCH 16/63] rbd: define and use rbd_warn() Define a new function rbd_warn() that produces a boilerplate warning message, identifying in the resulting message the affected rbd device in the best way available. Use it in a few places that now use pr_warning(). Signed-off-by: Alex Elder Reviewed-by: Dan Mick Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d97611e2b4ee..635b81d0ebdb 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -294,6 +294,33 @@ static struct device rbd_root_dev = { .release = rbd_root_dev_release, }; +static __printf(2, 3) +void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + if (!rbd_dev) + printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); + else if (rbd_dev->disk) + printk(KERN_WARNING "%s: %s: %pV\n", + RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); + else if (rbd_dev->spec && rbd_dev->spec->image_name) + printk(KERN_WARNING "%s: image %s: %pV\n", + RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); + else if (rbd_dev->spec && rbd_dev->spec->image_id) + printk(KERN_WARNING "%s: id %s: %pV\n", + RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); + else /* punt */ + printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", + RBD_DRV_NAME, rbd_dev, &vaf); + va_end(args); +} + #ifdef RBD_DEBUG #define rbd_assert(expr) \ if (unlikely(!(expr))) { \ @@ -1403,8 +1430,8 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) (unsigned int) opcode); rc = rbd_dev_refresh(rbd_dev, &hver); if (rc) - pr_warning(RBD_DRV_NAME "%d got notification but failed to " - " update snaps: %d\n", rbd_dev->major, rc); + rbd_warn(rbd_dev, "got notification but failed to " + " update snaps: %d\n", rc); rbd_req_sync_notify_ack(rbd_dev, hver, notify_id); } @@ -1767,15 +1794,13 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) goto out_err; if (WARN_ON((size_t) ret < size)) { ret = -ENXIO; - pr_warning("short header read for image %s" - " (want %zd got %d)\n", - rbd_dev->spec->image_name, size, ret); + rbd_warn(rbd_dev, "short header read (want %zd got %d)", + size, ret); goto out_err; } if (!rbd_dev_ondisk_valid(ondisk)) { ret = -ENXIO; - pr_warning("invalid header for image %s\n", - rbd_dev->spec->image_name); + rbd_warn(rbd_dev, "invalid header"); goto out_err; } @@ -2630,9 +2655,7 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) if (name) rbd_dev->spec->image_name = (char *) name; else - pr_warning(RBD_DRV_NAME "%d " - "unable to get image name for image id %s\n", - rbd_dev->major, rbd_dev->spec->image_id); + rbd_warn(rbd_dev, "unable to get image name"); /* Look up the snapshot name. */ From 4fb5d671399e83d3875593db2f56d5b57fcb104f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 1 Nov 2012 10:17:15 -0500 Subject: [PATCH 17/63] rbd: add warning messages for missing arguments Tell the user (via dmesg) what was wrong with the arguments provided via /sys/bus/rbd/add. Signed-off-by: Alex Elder Reviewed-by: Dan Mick --- drivers/block/rbd.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 635b81d0ebdb..31da8c538480 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3244,8 +3244,10 @@ static int rbd_add_parse_args(const char *buf, /* The first four tokens are required */ len = next_token(&buf); - if (!len) - return -EINVAL; /* Missing monitor address(es) */ + if (!len) { + rbd_warn(NULL, "no monitor address(es) provided"); + return -EINVAL; + } mon_addrs = buf; mon_addrs_size = len + 1; buf += len; @@ -3254,8 +3256,10 @@ static int rbd_add_parse_args(const char *buf, options = dup_token(&buf, NULL); if (!options) return -ENOMEM; - if (!*options) - goto out_err; /* Missing options */ + if (!*options) { + rbd_warn(NULL, "no options provided"); + goto out_err; + } spec = rbd_spec_alloc(); if (!spec) @@ -3264,14 +3268,18 @@ static int rbd_add_parse_args(const char *buf, spec->pool_name = dup_token(&buf, NULL); if (!spec->pool_name) goto out_mem; - if (!*spec->pool_name) - goto out_err; /* Missing pool name */ + if (!*spec->pool_name) { + rbd_warn(NULL, "no pool name provided"); + goto out_err; + } spec->image_name = dup_token(&buf, NULL); if (!spec->image_name) goto out_mem; - if (!*spec->image_name) - goto out_err; /* Missing image name */ + if (!*spec->image_name) { + rbd_warn(NULL, "no image name provided"); + goto out_err; + } /* * Snapshot name is optional; default is to use "-" From f5400b7a0e78a53edce8960a079aa022640849a1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 1 Nov 2012 10:17:15 -0500 Subject: [PATCH 18/63] rbd: add a warning in bio_chain_clone_range() Add a warning in bio_chain_clone_range() to help a user determine what exactly might have led to a failure. There is only one; please say something if you disagree with the following reasoning. There are three places this can return abnormally: - Initially, if there is nothing to clone. It turns out that right now this cannot happen anyway. The test is in place because the code below it doesn't work if those conditions don't hold. As such they could be assertions but since I can return a null to indicate an error I just do that instead. I have not added a warning here because it won't happen. - While processing bio's, if none remain but there are supposed to be more bytes to clone. Here I have added a warning. - If bio_clone_range() returns a null pointer. That function will have already produced a warning (at least the first time, via WARN_ON_ONCE()) to distinguish the cause of the error. The only exception is memory exhaustion, and I'd rather not pepper the code with warnings in all those spots. So no warning is added in that place. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 31da8c538480..ce6c0cbb3d7a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -993,8 +993,10 @@ static struct bio *bio_chain_clone_range(struct bio **bio_src, unsigned int bi_size; struct bio *bio; - if (!bi) + if (!bi) { + rbd_warn(NULL, "bio_chain exhausted with %u left", len); goto out_err; /* EINVAL; ran out of bio's */ + } bi_size = min_t(unsigned int, bi->bi_size - off, len); bio = bio_clone_range(bi, off, bi_size, gfpmask); if (!bio) From 935dc89f3e29e2ef1d7c89778cdb9f37ff36e94b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 1 Nov 2012 10:17:15 -0500 Subject: [PATCH 19/63] rbd: add warnings to rbd_dev_probe_update_spec() Josh suggested adding warnings to this function to help users diagnose problems. Other than memory allocatino errors, there are two places where errors can be returned. Both represent problems that should have been caught earlier, and as such might well have been handled with BUG_ON() calls. But if either ever did manage to happen, it will be reported. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ce6c0cbb3d7a..530a1217a0fa 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2644,8 +2644,11 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) osdc = &rbd_dev->rbd_client->client->osdc; name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id); - if (!name) - return -EIO; /* pool id too large (>= 2^31) */ + if (!name) { + rbd_warn(rbd_dev, "there is no pool with id %llu", + rbd_dev->spec->pool_id); /* Really a BUG() */ + return -EIO; + } rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL); if (!rbd_dev->spec->pool_name) @@ -2663,6 +2666,8 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id); if (!name) { + rbd_warn(rbd_dev, "no snapshot with id %llu", + rbd_dev->spec->snap_id); /* Really a BUG() */ ret = -EIO; goto out_err; } From 725afc97c91cd5f71a015143da5095d20cd668b9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 8 Nov 2012 08:01:39 -0600 Subject: [PATCH 20/63] rbd: standardize rbd_request variable names There are two names used for items of rbd_request structure type: "req" and "req_data". The former name is also used to represent items of pointers to struct ceph_osd_request. Change all variables that have these names so they are instead called "rbd_req" consistently. Signed-off-by: Alex Elder Reviewed-by: Dan Mick Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 50 +++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 530a1217a0fa..0091fa466f83 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1088,10 +1088,12 @@ static void rbd_coll_end_req_index(struct request *rq, spin_unlock_irq(q->queue_lock); } -static void rbd_coll_end_req(struct rbd_request *req, +static void rbd_coll_end_req(struct rbd_request *rbd_req, int ret, u64 len) { - rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len); + rbd_coll_end_req_index(rbd_req->rq, + rbd_req->coll, rbd_req->coll_index, + ret, len); } /* @@ -1119,12 +1121,12 @@ static int rbd_do_request(struct request *rq, int ret; u64 bno; struct timespec mtime = CURRENT_TIME; - struct rbd_request *req_data; + struct rbd_request *rbd_req; struct ceph_osd_request_head *reqhead; struct ceph_osd_client *osdc; - req_data = kzalloc(sizeof(*req_data), GFP_NOIO); - if (!req_data) { + rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO); + if (!rbd_req) { if (coll) rbd_coll_end_req_index(rq, coll, coll_index, -ENOMEM, len); @@ -1132,8 +1134,8 @@ static int rbd_do_request(struct request *rq, } if (coll) { - req_data->coll = coll; - req_data->coll_index = coll_index; + rbd_req->coll = coll; + rbd_req->coll_index = coll_index; } dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n", @@ -1150,12 +1152,12 @@ static int rbd_do_request(struct request *rq, req->r_callback = rbd_cb; - req_data->rq = rq; - req_data->bio = bio; - req_data->pages = pages; - req_data->len = len; + rbd_req->rq = rq; + rbd_req->bio = bio; + rbd_req->pages = pages; + rbd_req->len = len; - req->r_priv = req_data; + req->r_priv = rbd_req; reqhead = req->r_request->front.iov_base; reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); @@ -1200,11 +1202,11 @@ static int rbd_do_request(struct request *rq, return ret; done_err: - bio_chain_put(req_data->bio); + bio_chain_put(rbd_req->bio); ceph_osdc_put_request(req); done_pages: - rbd_coll_end_req(req_data, ret, len); - kfree(req_data); + rbd_coll_end_req(rbd_req, ret, len); + kfree(rbd_req); return ret; } @@ -1213,7 +1215,7 @@ done_pages: */ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) { - struct rbd_request *req_data = req->r_priv; + struct rbd_request *rbd_req = req->r_priv; struct ceph_osd_reply_head *replyhead; struct ceph_osd_op *op; __s32 rc; @@ -1232,20 +1234,20 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) (unsigned long long) bytes, read_op, (int) rc); if (rc == -ENOENT && read_op) { - zero_bio_chain(req_data->bio, 0); + zero_bio_chain(rbd_req->bio, 0); rc = 0; - } else if (rc == 0 && read_op && bytes < req_data->len) { - zero_bio_chain(req_data->bio, bytes); - bytes = req_data->len; + } else if (rc == 0 && read_op && bytes < rbd_req->len) { + zero_bio_chain(rbd_req->bio, bytes); + bytes = rbd_req->len; } - rbd_coll_end_req(req_data, rc, bytes); + rbd_coll_end_req(rbd_req, rc, bytes); - if (req_data->bio) - bio_chain_put(req_data->bio); + if (rbd_req->bio) + bio_chain_put(rbd_req->bio); ceph_osdc_put_request(req); - kfree(req_data); + kfree(rbd_req); } static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) From 5f29ddd4f0954ad6c84e28b934773f128840f064 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 8 Nov 2012 08:01:39 -0600 Subject: [PATCH 21/63] rbd: standardize ceph_osd_request variable names There are spots where a ceph_osds_request pointer variable is given the name "req". Since we're dealing with (at least) three types of requests (block layer, rbd, and osd), I find this slightly distracting. Change such instances to use "osd_req" consistently to make the abstraction represented a little more obvious. Signed-off-by: Alex Elder Reviewed-by: Dan Mick Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 60 +++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0091fa466f83..85131de90012 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1111,12 +1111,12 @@ static int rbd_do_request(struct request *rq, struct ceph_osd_req_op *ops, struct rbd_req_coll *coll, int coll_index, - void (*rbd_cb)(struct ceph_osd_request *req, - struct ceph_msg *msg), + void (*rbd_cb)(struct ceph_osd_request *, + struct ceph_msg *), struct ceph_osd_request **linger_req, u64 *ver) { - struct ceph_osd_request *req; + struct ceph_osd_request *osd_req; struct ceph_file_layout *layout; int ret; u64 bno; @@ -1143,67 +1143,68 @@ static int rbd_do_request(struct request *rq, (unsigned long long) len, coll, coll_index); osdc = &rbd_dev->rbd_client->client->osdc; - req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, + osd_req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, false, GFP_NOIO, pages, bio); - if (!req) { + if (!osd_req) { ret = -ENOMEM; goto done_pages; } - req->r_callback = rbd_cb; + osd_req->r_callback = rbd_cb; rbd_req->rq = rq; rbd_req->bio = bio; rbd_req->pages = pages; rbd_req->len = len; - req->r_priv = rbd_req; + osd_req->r_priv = rbd_req; - reqhead = req->r_request->front.iov_base; + reqhead = osd_req->r_request->front.iov_base; reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); - strncpy(req->r_oid, object_name, sizeof(req->r_oid)); - req->r_oid_len = strlen(req->r_oid); + strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid)); + osd_req->r_oid_len = strlen(osd_req->r_oid); - layout = &req->r_file_layout; + layout = &osd_req->r_file_layout; memset(layout, 0, sizeof(*layout)); layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); layout->fl_stripe_count = cpu_to_le32(1); layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id); ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, - req, ops); + osd_req, ops); rbd_assert(ret == 0); - ceph_osdc_build_request(req, ofs, &len, + ceph_osdc_build_request(osd_req, ofs, &len, ops, snapc, &mtime, - req->r_oid, req->r_oid_len); + osd_req->r_oid, osd_req->r_oid_len); if (linger_req) { - ceph_osdc_set_request_linger(osdc, req); - *linger_req = req; + ceph_osdc_set_request_linger(osdc, osd_req); + *linger_req = osd_req; } - ret = ceph_osdc_start_request(osdc, req, false); + ret = ceph_osdc_start_request(osdc, osd_req, false); if (ret < 0) goto done_err; if (!rbd_cb) { - ret = ceph_osdc_wait_request(osdc, req); + u64 version; + + ret = ceph_osdc_wait_request(osdc, osd_req); + version = le64_to_cpu(osd_req->r_reassert_version.version); if (ver) - *ver = le64_to_cpu(req->r_reassert_version.version); - dout("reassert_ver=%llu\n", - (unsigned long long) - le64_to_cpu(req->r_reassert_version.version)); - ceph_osdc_put_request(req); + *ver = version; + dout("reassert_ver=%llu\n", (unsigned long long) version); + ceph_osdc_put_request(osd_req); } return ret; done_err: bio_chain_put(rbd_req->bio); - ceph_osdc_put_request(req); + ceph_osdc_put_request(osd_req); done_pages: rbd_coll_end_req(rbd_req, ret, len); kfree(rbd_req); @@ -1213,9 +1214,9 @@ done_pages: /* * Ceph osd op callback */ -static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) +static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg) { - struct rbd_request *rbd_req = req->r_priv; + struct rbd_request *rbd_req = osd_req->r_priv; struct ceph_osd_reply_head *replyhead; struct ceph_osd_op *op; __s32 rc; @@ -1246,13 +1247,14 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) if (rbd_req->bio) bio_chain_put(rbd_req->bio); - ceph_osdc_put_request(req); + ceph_osdc_put_request(osd_req); kfree(rbd_req); } -static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) +static void rbd_simple_req_cb(struct ceph_osd_request *osd_req, + struct ceph_msg *msg) { - ceph_osdc_put_request(req); + ceph_osdc_put_request(osd_req); } /* From 8986cb37b1cf1f54b35f062f0a12dc68dd89f311 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 8 Nov 2012 08:01:39 -0600 Subject: [PATCH 22/63] rbd: be picky about osd request status type The result field in a ceph osd reply header is a signed 32-bit type, but rbd code often casually uses int to represent it. The following changes the types of variables that handle this result value to be "s32" instead of "int" to be completely explicit about it. Only at the point we pass that result to __blk_end_request() does the type get converted to the plain old int defined for that interface. There is almost certainly no binary impact of this change, but I prefer to show the exact size and signedness of the value since we know it. Signed-off-by: Alex Elder Reviewed-by: Dan Mick --- drivers/block/rbd.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 85131de90012..8d93b6a649d8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -171,7 +171,7 @@ struct rbd_client { */ struct rbd_req_status { int done; - int rc; + s32 rc; u64 bytes; }; @@ -1053,13 +1053,13 @@ static void rbd_destroy_ops(struct ceph_osd_req_op *ops) static void rbd_coll_end_req_index(struct request *rq, struct rbd_req_coll *coll, int index, - int ret, u64 len) + s32 ret, u64 len) { struct request_queue *q; int min, max, i; dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n", - coll, index, ret, (unsigned long long) len); + coll, index, (int)ret, (unsigned long long)len); if (!rq) return; @@ -1080,7 +1080,7 @@ static void rbd_coll_end_req_index(struct request *rq, max++; for (i = min; istatus[i].rc, + __blk_end_request(rq, (int)coll->status[i].rc, coll->status[i].bytes); coll->num_done++; kref_put(&coll->kref, rbd_coll_release); @@ -1089,7 +1089,7 @@ static void rbd_coll_end_req_index(struct request *rq, } static void rbd_coll_end_req(struct rbd_request *rbd_req, - int ret, u64 len) + s32 ret, u64 len) { rbd_coll_end_req_index(rbd_req->rq, rbd_req->coll, rbd_req->coll_index, @@ -1129,7 +1129,7 @@ static int rbd_do_request(struct request *rq, if (!rbd_req) { if (coll) rbd_coll_end_req_index(rq, coll, coll_index, - -ENOMEM, len); + (s32)-ENOMEM, len); return -ENOMEM; } @@ -1206,7 +1206,7 @@ done_err: bio_chain_put(rbd_req->bio); ceph_osdc_put_request(osd_req); done_pages: - rbd_coll_end_req(rbd_req, ret, len); + rbd_coll_end_req(rbd_req, (s32)ret, len); kfree(rbd_req); return ret; } @@ -1219,7 +1219,7 @@ static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg) struct rbd_request *rbd_req = osd_req->r_priv; struct ceph_osd_reply_head *replyhead; struct ceph_osd_op *op; - __s32 rc; + s32 rc; u64 bytes; int read_op; @@ -1227,14 +1227,14 @@ static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg) replyhead = msg->front.iov_base; WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); op = (void *)(replyhead + 1); - rc = le32_to_cpu(replyhead->result); + rc = (s32)le32_to_cpu(replyhead->result); bytes = le64_to_cpu(op->extent.length); read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ); dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n", (unsigned long long) bytes, read_op, (int) rc); - if (rc == -ENOENT && read_op) { + if (rc == (s32)-ENOENT && read_op) { zero_bio_chain(rbd_req->bio, 0); rc = 0; } else if (rc == 0 && read_op && bytes < rbd_req->len) { @@ -1679,7 +1679,8 @@ static void rbd_rq_fn(struct request_queue *q) bio_chain, coll, cur_seg); else rbd_coll_end_req_index(rq, coll, cur_seg, - -ENOMEM, chain_size); + (s32)-ENOMEM, + chain_size); size -= chain_size; ofs += chain_size; From 8295cda7ceceb7b25f9a12cd21bbfb6206e28a6d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 8 Nov 2012 08:01:39 -0600 Subject: [PATCH 23/63] rbd: encapsulate handling for a single request In rbd_rq_fn(), requests are fetched from the block layer and each request is processed, looping through the request's list of bio's until they've all been consumed. Separate the handling for a single request into its own function to make it a bit easier to see what's going on. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 119 +++++++++++++++++++++++--------------------- 1 file changed, 63 insertions(+), 56 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8d93b6a649d8..738d1e4c0ab5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1583,6 +1583,64 @@ static struct rbd_req_coll *rbd_alloc_coll(int num_reqs) return coll; } +static int rbd_dev_do_request(struct request *rq, + struct rbd_device *rbd_dev, + struct ceph_snap_context *snapc, + u64 ofs, unsigned int size, + struct bio *bio_chain) +{ + int num_segs; + struct rbd_req_coll *coll; + unsigned int bio_offset; + int cur_seg = 0; + + dout("%s 0x%x bytes at 0x%llx\n", + rq_data_dir(rq) == WRITE ? "write" : "read", + size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); + + num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); + if (num_segs <= 0) + return num_segs; + + coll = rbd_alloc_coll(num_segs); + if (!coll) + return -ENOMEM; + + bio_offset = 0; + do { + u64 limit = rbd_segment_length(rbd_dev, ofs, size); + unsigned int clone_size; + struct bio *bio_clone; + + BUG_ON(limit > (u64)UINT_MAX); + clone_size = (unsigned int)limit; + dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt); + + kref_get(&coll->kref); + + /* Pass a cloned bio chain via an osd request */ + + bio_clone = bio_chain_clone_range(&bio_chain, + &bio_offset, clone_size, + GFP_ATOMIC); + if (bio_clone) + (void)rbd_do_op(rq, rbd_dev, snapc, + ofs, clone_size, + bio_clone, coll, cur_seg); + else + rbd_coll_end_req_index(rq, coll, cur_seg, + (s32)-ENOMEM, + clone_size); + size -= clone_size; + ofs += clone_size; + + cur_seg++; + } while (size > 0); + kref_put(&coll->kref, rbd_coll_release); + + return 0; +} + /* * block device queue callback */ @@ -1596,10 +1654,8 @@ static void rbd_rq_fn(struct request_queue *q) bool do_write; unsigned int size; u64 ofs; - int num_segs, cur_seg = 0; - struct rbd_req_coll *coll; struct ceph_snap_context *snapc; - unsigned int bio_offset; + int result; dout("fetched request\n"); @@ -1637,60 +1693,11 @@ static void rbd_rq_fn(struct request_queue *q) ofs = blk_rq_pos(rq) * SECTOR_SIZE; bio = rq->bio; - dout("%s 0x%x bytes at 0x%llx\n", - do_write ? "write" : "read", - size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); - - num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); - if (num_segs <= 0) { - spin_lock_irq(q->queue_lock); - __blk_end_request_all(rq, num_segs); - ceph_put_snap_context(snapc); - continue; - } - coll = rbd_alloc_coll(num_segs); - if (!coll) { - spin_lock_irq(q->queue_lock); - __blk_end_request_all(rq, -ENOMEM); - ceph_put_snap_context(snapc); - continue; - } - - bio_offset = 0; - do { - u64 limit = rbd_segment_length(rbd_dev, ofs, size); - unsigned int chain_size; - struct bio *bio_chain; - - BUG_ON(limit > (u64) UINT_MAX); - chain_size = (unsigned int) limit; - dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); - - kref_get(&coll->kref); - - /* Pass a cloned bio chain via an osd request */ - - bio_chain = bio_chain_clone_range(&bio, - &bio_offset, chain_size, - GFP_ATOMIC); - if (bio_chain) - (void) rbd_do_op(rq, rbd_dev, snapc, - ofs, chain_size, - bio_chain, coll, cur_seg); - else - rbd_coll_end_req_index(rq, coll, cur_seg, - (s32)-ENOMEM, - chain_size); - size -= chain_size; - ofs += chain_size; - - cur_seg++; - } while (size > 0); - kref_put(&coll->kref, rbd_coll_release); - - spin_lock_irq(q->queue_lock); - + result = rbd_dev_do_request(rq, rbd_dev, snapc, ofs, size, bio); ceph_put_snap_context(snapc); + spin_lock_irq(q->queue_lock); + if (!size || result < 0) + __blk_end_request_all(rq, result); } } From cd323ac0eb433b14cbb270bfc5a82f308f2662de Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 8 Nov 2012 08:01:39 -0600 Subject: [PATCH 24/63] rbd: end request on error in rbd_do_request() caller Only one of the three callers of rbd_do_request() provide a collection structure to aggregate status. If an error occurs in rbd_do_request(), have the caller take care of calling rbd_coll_end_req() if necessary in that one spot. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 738d1e4c0ab5..21468d0b2792 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1126,12 +1126,8 @@ static int rbd_do_request(struct request *rq, struct ceph_osd_client *osdc; rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO); - if (!rbd_req) { - if (coll) - rbd_coll_end_req_index(rq, coll, coll_index, - (s32)-ENOMEM, len); + if (!rbd_req) return -ENOMEM; - } if (coll) { rbd_req->coll = coll; @@ -1206,7 +1202,6 @@ done_err: bio_chain_put(rbd_req->bio); ceph_osdc_put_request(osd_req); done_pages: - rbd_coll_end_req(rbd_req, (s32)ret, len); kfree(rbd_req); return ret; } @@ -1359,7 +1354,9 @@ static int rbd_do_op(struct request *rq, ops, coll, coll_index, rbd_req_cb, 0, NULL); - + if (ret < 0) + rbd_coll_end_req_index(rq, coll, coll_index, + (s32)ret, seg_len); rbd_destroy_ops(ops); done: kfree(seg_name); From b395e8b5b8f06399e3fe3ee016c9cf41ff665efc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 8 Nov 2012 08:01:39 -0600 Subject: [PATCH 25/63] rbd: a little more cleanup of rbd_rq_fn() Now that a big hunk in the middle of rbd_rq_fn() has been moved into its own routine we can simplify it a little more. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 56 ++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 21468d0b2792..9d49e5b888d8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1644,53 +1644,51 @@ static int rbd_dev_do_request(struct request *rq, static void rbd_rq_fn(struct request_queue *q) { struct rbd_device *rbd_dev = q->queuedata; + bool read_only = rbd_dev->mapping.read_only; struct request *rq; while ((rq = blk_fetch_request(q))) { - struct bio *bio; - bool do_write; - unsigned int size; - u64 ofs; - struct ceph_snap_context *snapc; + struct ceph_snap_context *snapc = NULL; + unsigned int size = 0; int result; dout("fetched request\n"); - /* filter out block requests we don't understand */ + /* Filter out block requests we don't understand */ + if ((rq->cmd_type != REQ_TYPE_FS)) { __blk_end_request_all(rq, 0); continue; } - - /* deduce our operation (read, write) */ - do_write = (rq_data_dir(rq) == WRITE); - if (do_write && rbd_dev->mapping.read_only) { - __blk_end_request_all(rq, -EROFS); - continue; - } - spin_unlock_irq(q->queue_lock); + /* Stop writes to a read-only device */ + + result = -EROFS; + if (read_only && rq_data_dir(rq) == WRITE) + goto out_end_request; + + /* Grab a reference to the snapshot context */ + down_read(&rbd_dev->header_rwsem); - - if (!rbd_dev->exists) { - rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); - up_read(&rbd_dev->header_rwsem); - dout("request for non-existent snapshot"); - spin_lock_irq(q->queue_lock); - __blk_end_request_all(rq, -ENXIO); - continue; + if (rbd_dev->exists) { + snapc = ceph_get_snap_context(rbd_dev->header.snapc); + rbd_assert(snapc != NULL); } - - snapc = ceph_get_snap_context(rbd_dev->header.snapc); - up_read(&rbd_dev->header_rwsem); - size = blk_rq_bytes(rq); - ofs = blk_rq_pos(rq) * SECTOR_SIZE; - bio = rq->bio; + if (!snapc) { + rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); + dout("request for non-existent snapshot"); + result = -ENXIO; + goto out_end_request; + } - result = rbd_dev_do_request(rq, rbd_dev, snapc, ofs, size, bio); + size = blk_rq_bytes(rq); + result = rbd_dev_do_request(rq, rbd_dev, snapc, + blk_rq_pos(rq) * SECTOR_SIZE, + size, rq->bio); +out_end_request: ceph_put_snap_context(snapc); spin_lock_irq(q->queue_lock); if (!size || result < 0) From d78b650a595e23e5a115d332e3c37e019baf7703 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 9 Nov 2012 08:43:15 -0600 Subject: [PATCH 26/63] rbd: make exists flag atomic The rbd_device->exists field can be updated asynchronously, changing from set to clear if a mapped snapshot disappears from the base image's snapshot context. Currently, value of the "exists" flag is only read and modified under protection of the header semaphore, but that will change with the next patch. Making it atomic ensures this won't be a problem because the a the non-existence of device will be immediately known. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 9d49e5b888d8..2846536d446e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -229,7 +229,7 @@ struct rbd_device { spinlock_t lock; /* queue lock */ struct rbd_image_header header; - bool exists; + atomic_t exists; struct rbd_spec *spec; char *header_name; @@ -751,7 +751,7 @@ static int rbd_dev_set_mapping(struct rbd_device *rbd_dev) goto done; rbd_dev->mapping.read_only = true; } - rbd_dev->exists = true; + atomic_set(&rbd_dev->exists, 1); done: return ret; } @@ -1671,7 +1671,7 @@ static void rbd_rq_fn(struct request_queue *q) /* Grab a reference to the snapshot context */ down_read(&rbd_dev->header_rwsem); - if (rbd_dev->exists) { + if (atomic_read(&rbd_dev->exists)) { snapc = ceph_get_snap_context(rbd_dev->header.snapc); rbd_assert(snapc != NULL); } @@ -2294,6 +2294,7 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, return NULL; spin_lock_init(&rbd_dev->lock); + atomic_set(&rbd_dev->exists, 0); INIT_LIST_HEAD(&rbd_dev->node); INIT_LIST_HEAD(&rbd_dev->snaps); init_rwsem(&rbd_dev->header_rwsem); @@ -2918,7 +2919,7 @@ static int rbd_dev_snaps_update(struct rbd_device *rbd_dev) /* Existing snapshot not in the new snap context */ if (rbd_dev->spec->snap_id == snap->id) - rbd_dev->exists = false; + atomic_set(&rbd_dev->exists, 0); rbd_remove_snap_dev(snap); dout("%ssnap id %llu has been removed\n", rbd_dev->spec->snap_id == snap->id ? From a7b4c65f4f15aa657b09d13da8f45ba0b72ec0df Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 9 Nov 2012 08:43:15 -0600 Subject: [PATCH 27/63] rbd: only get snap context for write requests Right now we get the snapshot context for an rbd image (under protection of the header semaphore) for every request processed. There's no need to get the snap context if we're doing a read, so avoid doing so in that case. Note that we no longer need to hold the header semaphore to check the rbd_dev's existence flag. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 2846536d446e..0fbe9add0446 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1331,7 +1331,7 @@ static int rbd_do_op(struct request *rq, } else { opcode = CEPH_OSD_OP_READ; flags = CEPH_OSD_FLAG_READ; - snapc = NULL; + rbd_assert(!snapc); snapid = rbd_dev->spec->snap_id; payload_len = 0; } @@ -1662,22 +1662,25 @@ static void rbd_rq_fn(struct request_queue *q) } spin_unlock_irq(q->queue_lock); - /* Stop writes to a read-only device */ + /* Write requests need a reference to the snapshot context */ - result = -EROFS; - if (read_only && rq_data_dir(rq) == WRITE) - goto out_end_request; + if (rq_data_dir(rq) == WRITE) { + result = -EROFS; + if (read_only) /* Can't write to a read-only device */ + goto out_end_request; - /* Grab a reference to the snapshot context */ - - down_read(&rbd_dev->header_rwsem); - if (atomic_read(&rbd_dev->exists)) { + /* + * Note that each osd request will take its + * own reference to the snapshot context + * supplied. The reference we take here + * just guarantees the one we provide stays + * valid. + */ + down_read(&rbd_dev->header_rwsem); snapc = ceph_get_snap_context(rbd_dev->header.snapc); + up_read(&rbd_dev->header_rwsem); rbd_assert(snapc != NULL); - } - up_read(&rbd_dev->header_rwsem); - - if (!snapc) { + } else if (!atomic_read(&rbd_dev->exists)) { rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); dout("request for non-existent snapshot"); result = -ENXIO; @@ -1689,7 +1692,8 @@ static void rbd_rq_fn(struct request_queue *q) blk_rq_pos(rq) * SECTOR_SIZE, size, rq->bio); out_end_request: - ceph_put_snap_context(snapc); + if (snapc) + ceph_put_snap_context(snapc); spin_lock_irq(q->queue_lock); if (!size || result < 0) __blk_end_request_all(rq, result); From 0ec8ce87f3bb5e4a561190f5320934e003405b6f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 9 Nov 2012 08:43:15 -0600 Subject: [PATCH 28/63] rbd: separate layout init Pull a block of code that initializes the layout structure in an osd request into its own function so it can be reused. Signed-off-by: Alex Elder Reviewed-by: Dan Mick Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0fbe9add0446..d4e93a28fb6a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -54,6 +54,7 @@ /* It might be useful to have this defined elsewhere too */ +#define U32_MAX ((u32) (~0U)) #define U64_MAX ((u64) (~0ULL)) #define RBD_DRV_NAME "rbd" @@ -1096,6 +1097,16 @@ static void rbd_coll_end_req(struct rbd_request *rbd_req, ret, len); } +static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id) +{ + memset(layout, 0, sizeof (*layout)); + layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + layout->fl_stripe_count = cpu_to_le32(1); + layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + rbd_assert(pool_id <= (u64) U32_MAX); + layout->fl_pg_pool = cpu_to_le32((u32) pool_id); +} + /* * Send ceph osd request */ @@ -1117,7 +1128,6 @@ static int rbd_do_request(struct request *rq, u64 *ver) { struct ceph_osd_request *osd_req; - struct ceph_file_layout *layout; int ret; u64 bno; struct timespec mtime = CURRENT_TIME; @@ -1161,14 +1171,9 @@ static int rbd_do_request(struct request *rq, strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid)); osd_req->r_oid_len = strlen(osd_req->r_oid); - layout = &osd_req->r_file_layout; - memset(layout, 0, sizeof(*layout)); - layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - layout->fl_stripe_count = cpu_to_le32(1); - layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id); - ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, - osd_req, ops); + rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); + ret = ceph_calc_raw_layout(osdc, &osd_req->r_file_layout, + snapid, ofs, &len, &bno, osd_req, ops); rbd_assert(ret == 0); ceph_osdc_build_request(osd_req, ofs, &len, From af77f26caa35a95af09d1dac5c513b3901de7e37 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 9 Nov 2012 08:43:15 -0600 Subject: [PATCH 29/63] rbd: drop oid parameters from ceph_osdc_build_request() The last two parameters to ceph_osd_build_request() describe the object id, but the values passed always come from the osd request structure whose address is also provided. Get rid of those last two parameters. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 6 +----- include/linux/ceph/osd_client.h | 4 +--- net/ceph/osd_client.c | 13 +++++-------- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d4e93a28fb6a..9a701effa0ef 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1176,11 +1176,7 @@ static int rbd_do_request(struct request *rq, snapid, ofs, &len, &bno, osd_req, ops); rbd_assert(ret == 0); - ceph_osdc_build_request(osd_req, ofs, &len, - ops, - snapc, - &mtime, - osd_req->r_oid, osd_req->r_oid_len); + ceph_osdc_build_request(osd_req, ofs, &len, ops, snapc, &mtime); if (linger_req) { ceph_osdc_set_request_linger(osdc, osd_req); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index d9b880e977e6..f2e5d2cdca06 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -227,9 +227,7 @@ extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 *plen, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, - struct timespec *mtime, - const char *oid, - int oid_len); + struct timespec *mtime); extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index eade41bb7102..7d38327a8e89 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -376,9 +376,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 *plen, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, - struct timespec *mtime, - const char *oid, - int oid_len) + struct timespec *mtime) { struct ceph_msg *msg = req->r_request; struct ceph_osd_request_head *head; @@ -405,9 +403,9 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, /* fill in oid */ - head->object_len = cpu_to_le32(oid_len); - memcpy(p, oid, oid_len); - p += oid_len; + head->object_len = cpu_to_le32(req->r_oid_len); + memcpy(p, req->r_oid, req->r_oid_len); + p += req->r_oid_len; src_op = src_ops; while (src_op->op) { @@ -506,8 +504,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, ceph_osdc_build_request(req, off, plen, ops, snapc, - mtime, - req->r_oid, req->r_oid_len); + mtime); return req; } From 4775618d9255c0c135580bbee8bee6815f8194cf Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 9 Nov 2012 08:43:15 -0600 Subject: [PATCH 30/63] rbd: drop snapid parameter from rbd_req_sync_read() There is only one caller of rbd_req_sync_read(), and it passes CEPH_NOSNAP as the snapshot id argument. Delete that parameter and just use CEPH_NOSNAP within the function. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 9a701effa0ef..07e064482cd8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1368,7 +1368,6 @@ done: * Request sync osd read */ static int rbd_req_sync_read(struct rbd_device *rbd_dev, - u64 snapid, const char *object_name, u64 ofs, u64 len, char *buf, @@ -1382,7 +1381,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, return -ENOMEM; ret = rbd_req_sync_op(rbd_dev, NULL, - snapid, + CEPH_NOSNAP, CEPH_OSD_FLAG_READ, ops, object_name, ofs, len, buf, NULL, ver); rbd_destroy_ops(ops); @@ -1799,8 +1798,7 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) if (!ondisk) return ERR_PTR(-ENOMEM); - ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP, - rbd_dev->header_name, + ret = rbd_req_sync_read(rbd_dev, rbd_dev->header_name, 0, size, (char *) ondisk, version); From 07b2391fbbcefdecbc2f16321f8e454802e0b926 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 9 Nov 2012 08:43:16 -0600 Subject: [PATCH 31/63] rbd: drop flags parameter from rbd_req_sync_exec() All callers of rbd_req_sync_exec() pass CEPH_OSD_FLAG_READ as their flags argument. Delete that parameter and use CEPH_OSD_FLAG_READ within the function. If we find a need to support write operations we can add it back again. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 07e064482cd8..5d48bcd1709b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1524,7 +1524,6 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, size_t outbound_size, char *inbound, size_t inbound_size, - int flags, u64 *ver) { struct ceph_osd_req_op *ops; @@ -1555,8 +1554,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, ops[0].cls.indata_len = outbound_size; ret = rbd_req_sync_op(rbd_dev, NULL, - CEPH_NOSNAP, - flags, ops, + CEPH_NOSNAP, CEPH_OSD_FLAG_READ, ops, object_name, 0, inbound_size, inbound, NULL, ver); @@ -2418,8 +2416,7 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, "rbd", "get_size", (char *) &snapid, sizeof (snapid), - (char *) &size_buf, sizeof (size_buf), - CEPH_OSD_FLAG_READ, NULL); + (char *) &size_buf, sizeof (size_buf), NULL); dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -2454,8 +2451,7 @@ static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, "rbd", "get_object_prefix", NULL, 0, - reply_buf, RBD_OBJ_PREFIX_LEN_MAX, - CEPH_OSD_FLAG_READ, NULL); + reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL); dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -2494,7 +2490,7 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, "rbd", "get_features", (char *) &snapid, sizeof (snapid), (char *) &features_buf, sizeof (features_buf), - CEPH_OSD_FLAG_READ, NULL); + NULL); dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); if (ret < 0) return ret; @@ -2549,8 +2545,7 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, "rbd", "get_parent", (char *) &snapid, sizeof (snapid), - (char *) reply_buf, size, - CEPH_OSD_FLAG_READ, NULL); + (char *) reply_buf, size, NULL); dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); if (ret < 0) goto out_err; @@ -2615,8 +2610,7 @@ static char *rbd_dev_image_name(struct rbd_device *rbd_dev) ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY, "rbd", "dir_get_name", image_id, image_id_size, - (char *) reply_buf, size, - CEPH_OSD_FLAG_READ, NULL); + (char *) reply_buf, size, NULL); if (ret < 0) goto out; p = reply_buf; @@ -2722,8 +2716,7 @@ static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver) ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, "rbd", "get_snapcontext", NULL, 0, - reply_buf, size, - CEPH_OSD_FLAG_READ, ver); + reply_buf, size, ver); dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -2792,8 +2785,7 @@ static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which) ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, "rbd", "get_snapshot_name", (char *) &snap_id, sizeof (snap_id), - reply_buf, size, - CEPH_OSD_FLAG_READ, NULL); + reply_buf, size, NULL); dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); if (ret < 0) goto out; @@ -3401,8 +3393,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) ret = rbd_req_sync_exec(rbd_dev, object_name, "rbd", "get_id", NULL, 0, - response, RBD_IMAGE_ID_LEN_MAX, - CEPH_OSD_FLAG_READ, NULL); + response, RBD_IMAGE_ID_LEN_MAX, NULL); dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret); if (ret < 0) goto out; From 25704ac9de30ac3e73c123e7b2734f7ca744c8d8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 9 Nov 2012 08:43:16 -0600 Subject: [PATCH 32/63] rbd: kill rbd_req_sync_op() snapc and snapid parameters The snapc and snapid parameters to rbd_req_sync_op() always take the values NULL and CEPH_NOSNAP, respectively. So just get rid of them and use those values where needed. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5d48bcd1709b..85c5852378d2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1257,8 +1257,6 @@ static void rbd_simple_req_cb(struct ceph_osd_request *osd_req, * Do a synchronous ceph osd operation */ static int rbd_req_sync_op(struct rbd_device *rbd_dev, - struct ceph_snap_context *snapc, - u64 snapid, int flags, struct ceph_osd_req_op *ops, const char *object_name, @@ -1278,7 +1276,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, if (IS_ERR(pages)) return PTR_ERR(pages); - ret = rbd_do_request(NULL, rbd_dev, snapc, snapid, + ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, object_name, ofs, inbound_size, NULL, pages, num_pages, flags, @@ -1380,9 +1378,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, if (!ops) return -ENOMEM; - ret = rbd_req_sync_op(rbd_dev, NULL, - CEPH_NOSNAP, - CEPH_OSD_FLAG_READ, + ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, ops, object_name, ofs, len, buf, NULL, ver); rbd_destroy_ops(ops); @@ -1461,8 +1457,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev) ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); ops[0].watch.flag = 1; - ret = rbd_req_sync_op(rbd_dev, NULL, - CEPH_NOSNAP, + ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, ops, rbd_dev->header_name, @@ -1499,8 +1494,7 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); ops[0].watch.flag = 0; - ret = rbd_req_sync_op(rbd_dev, NULL, - CEPH_NOSNAP, + ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, ops, rbd_dev->header_name, @@ -1553,8 +1547,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, ops[0].cls.indata = outbound; ops[0].cls.indata_len = outbound_size; - ret = rbd_req_sync_op(rbd_dev, NULL, - CEPH_NOSNAP, CEPH_OSD_FLAG_READ, ops, + ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, ops, object_name, 0, inbound_size, inbound, NULL, ver); From 7c3d22cf16f1bbcb37a73e88338c042bb49ff112 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 9 Nov 2012 12:50:10 -0600 Subject: [PATCH 33/63] rbd: don't bother setting snapid in rbd_do_request() For some reason, the snapid field of the osd request header is explicitly set to CEPH_NOSNAP in rbd_do_request(). Just a few lines later--with no code that would access this field in between--a call is made to ceph_calc_raw_layout() passing the snapid provided to rbd_do_request(), which encodes the snapid value it is provided into that field instead. In other words, there is no need to fill in CEPH_NOSNAP, and doing so suggests it might be necessary. Don't do that any more. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 85c5852378d2..54bd9fc3ef7c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1132,7 +1132,6 @@ static int rbd_do_request(struct request *rq, u64 bno; struct timespec mtime = CURRENT_TIME; struct rbd_request *rbd_req; - struct ceph_osd_request_head *reqhead; struct ceph_osd_client *osdc; rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO); @@ -1165,9 +1164,6 @@ static int rbd_do_request(struct request *rq, osd_req->r_priv = rbd_req; - reqhead = osd_req->r_request->front.iov_base; - reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); - strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid)); osd_req->r_oid_len = strlen(osd_req->r_oid); From c885837f7d4f8c4f5cb2a744cc6929bc078e9dc0 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: [PATCH 34/63] libceph: always allow trail in osd request An osd request structure contains an optional trail portion, which if present will contain data to be passed in the payload portion of the message containing the request. The trail field is a ceph_pagelist pointer, and if null it indicates there is no trail. A ceph_pagelist structure contains a length field, and it can legitimately hold value 0. Make use of this to change the interpretation of the "trail" of an osd request so that every osd request has trailing data, it just might have length 0. This means we change the r_trail field in a ceph_osd_request structure from a pointer to a structure that is always initialized. Note that in ceph_osdc_start_request(), the trail pointer (or now address of that structure) is assigned to a ceph message's trail field. Here's why that's still OK (looking at net/ceph/messenger.c): - What would have resulted in a null pointer previously will now refer to a 0-length page list. That message trail pointer is used in two functions, write_partial_msg_pages() and out_msg_pos_next(). - In write_partial_msg_pages(), a null page list pointer is handled the same as a message with 0-length trail, and both result in a "in_trail" variable set to false. The trail pointer is only used if in_trail is true. - The only other place the message trail pointer is used is out_msg_pos_next(). That function is only called by write_partial_msg_pages() and only touches the trail pointer if the in_trail value it is passed is true. Therefore a null ceph_msg->trail pointer is equivalent to a non-null pointer referring to a 0-length page list structure. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 4 +-- net/ceph/osd_client.c | 43 +++++++++------------------------ 2 files changed, 14 insertions(+), 33 deletions(-) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index f2e5d2cdca06..61562c792855 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -10,6 +10,7 @@ #include #include #include +#include /* * Maximum object name size @@ -22,7 +23,6 @@ struct ceph_snap_context; struct ceph_osd_request; struct ceph_osd_client; struct ceph_authorizer; -struct ceph_pagelist; /* * completion callback for async writepages @@ -95,7 +95,7 @@ struct ceph_osd_request { struct bio *r_bio; /* instead of pages */ #endif - struct ceph_pagelist *r_trail; /* trailing part of the data */ + struct ceph_pagelist r_trail; /* trailing part of the data */ }; struct ceph_osd_event { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 7d38327a8e89..2be50d82ccbc 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -171,10 +171,7 @@ void ceph_osdc_release_request(struct kref *kref) bio_put(req->r_bio); #endif ceph_put_snap_context(req->r_snapc); - if (req->r_trail) { - ceph_pagelist_release(req->r_trail); - kfree(req->r_trail); - } + ceph_pagelist_release(&req->r_trail); if (req->r_mempool) mempool_free(req, req->r_osdc->req_mempool); else @@ -208,8 +205,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, { struct ceph_osd_request *req; struct ceph_msg *msg; - int needs_trail; - int num_op = get_num_ops(ops, &needs_trail); + int num_op = get_num_ops(ops, NULL); size_t msg_size = sizeof(struct ceph_osd_request_head); msg_size += num_op*sizeof(struct ceph_osd_op); @@ -252,15 +248,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } req->r_reply = msg; - /* allocate space for the trailing data */ - if (needs_trail) { - req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags); - if (!req->r_trail) { - ceph_osdc_put_request(req); - return NULL; - } - ceph_pagelist_init(req->r_trail); - } + ceph_pagelist_init(&req->r_trail); /* create request message; allow space for oid */ msg_size += MAX_OBJ_NAME_SIZE; @@ -312,29 +300,25 @@ static void osd_req_encode_op(struct ceph_osd_request *req, case CEPH_OSD_OP_GETXATTR: case CEPH_OSD_OP_SETXATTR: case CEPH_OSD_OP_CMPXATTR: - BUG_ON(!req->r_trail); - dst->xattr.name_len = cpu_to_le32(src->xattr.name_len); dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); dst->xattr.cmp_op = src->xattr.cmp_op; dst->xattr.cmp_mode = src->xattr.cmp_mode; - ceph_pagelist_append(req->r_trail, src->xattr.name, + ceph_pagelist_append(&req->r_trail, src->xattr.name, src->xattr.name_len); - ceph_pagelist_append(req->r_trail, src->xattr.val, + ceph_pagelist_append(&req->r_trail, src->xattr.val, src->xattr.value_len); break; case CEPH_OSD_OP_CALL: - BUG_ON(!req->r_trail); - dst->cls.class_len = src->cls.class_len; dst->cls.method_len = src->cls.method_len; dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); - ceph_pagelist_append(req->r_trail, src->cls.class_name, + ceph_pagelist_append(&req->r_trail, src->cls.class_name, src->cls.class_len); - ceph_pagelist_append(req->r_trail, src->cls.method_name, + ceph_pagelist_append(&req->r_trail, src->cls.method_name, src->cls.method_len); - ceph_pagelist_append(req->r_trail, src->cls.indata, + ceph_pagelist_append(&req->r_trail, src->cls.indata, src->cls.indata_len); break; case CEPH_OSD_OP_ROLLBACK: @@ -347,11 +331,9 @@ static void osd_req_encode_op(struct ceph_osd_request *req, __le32 prot_ver = cpu_to_le32(src->watch.prot_ver); __le32 timeout = cpu_to_le32(src->watch.timeout); - BUG_ON(!req->r_trail); - - ceph_pagelist_append(req->r_trail, + ceph_pagelist_append(&req->r_trail, &prot_ver, sizeof(prot_ver)); - ceph_pagelist_append(req->r_trail, + ceph_pagelist_append(&req->r_trail, &timeout, sizeof(timeout)); } case CEPH_OSD_OP_NOTIFY_ACK: @@ -414,8 +396,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, op++; } - if (req->r_trail) - data_len += req->r_trail->length; + data_len += req->r_trail.length; if (snapc) { head->snap_seq = cpu_to_le64(snapc->seq); @@ -1715,7 +1696,7 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, #ifdef CONFIG_BLOCK req->r_request->bio = req->r_bio; #endif - req->r_request->trail = req->r_trail; + req->r_request->trail = &req->r_trail; register_request(osdc, req); From 5b9d1b1cd46aa6c8abf891f25c15aee31538da7e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: [PATCH 35/63] libceph: kill op_needs_trail() Since every osd message is now prepared to include trailing data, there's no need to check ahead of time whether any operations will make use of the trail portion of the message. We can drop the second argument to get_num_ops(), and as a result we can also get rid of op_needs_trail() which is no longer used. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- net/ceph/osd_client.c | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2be50d82ccbc..37d43d5b828c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -32,20 +32,6 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc, static void __send_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req); -static int op_needs_trail(int op) -{ - switch (op) { - case CEPH_OSD_OP_GETXATTR: - case CEPH_OSD_OP_SETXATTR: - case CEPH_OSD_OP_CMPXATTR: - case CEPH_OSD_OP_CALL: - case CEPH_OSD_OP_NOTIFY: - return 1; - default: - return 0; - } -} - static int op_has_extent(int op) { return (op == CEPH_OSD_OP_READ || @@ -179,17 +165,12 @@ void ceph_osdc_release_request(struct kref *kref) } EXPORT_SYMBOL(ceph_osdc_release_request); -static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail) +static int get_num_ops(struct ceph_osd_req_op *ops) { int i = 0; - if (needs_trail) - *needs_trail = 0; - while (ops[i].op) { - if (needs_trail && op_needs_trail(ops[i].op)) - *needs_trail = 1; + while (ops[i].op) i++; - } return i; } @@ -205,7 +186,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, { struct ceph_osd_request *req; struct ceph_msg *msg; - int num_op = get_num_ops(ops, NULL); + int num_op = get_num_ops(ops); size_t msg_size = sizeof(struct ceph_osd_request_head); msg_size += num_op*sizeof(struct ceph_osd_op); @@ -365,7 +346,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, struct ceph_osd_req_op *src_op; struct ceph_osd_op *op; void *p; - int num_op = get_num_ops(src_ops, NULL); + int num_op = get_num_ops(src_ops); size_t msg_size = sizeof(*head) + num_op*sizeof(*op); int flags = req->r_flags; u64 data_len = 0; From 0120be3c60d46d6d55f4bf7a3d654cc705eb0c54 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 14 Nov 2012 09:38:19 -0600 Subject: [PATCH 36/63] libceph: pass length to ceph_osdc_build_request() The len argument to ceph_osdc_build_request() is set up to be passed by address, but that function never updates its value so there's no need to do this. Tighten up the interface by passing the length directly. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 2 +- include/linux/ceph/osd_client.h | 2 +- net/ceph/osd_client.c | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 54bd9fc3ef7c..c1b135b6cb97 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1172,7 +1172,7 @@ static int rbd_do_request(struct request *rq, snapid, ofs, &len, &bno, osd_req, ops); rbd_assert(ret == 0); - ceph_osdc_build_request(osd_req, ofs, &len, ops, snapc, &mtime); + ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, &mtime); if (linger_req) { ceph_osdc_set_request_linger(osdc, osd_req); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 61562c792855..4bfb4582439a 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -224,7 +224,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * struct bio *bio); extern void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, u64 *plen, + u64 off, u64 len, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, struct timespec *mtime); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 37d43d5b828c..e29a3ed92958 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -336,7 +336,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req, * */ void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, u64 *plen, + u64 off, u64 len, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, struct timespec *mtime) @@ -390,7 +390,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, if (flags & CEPH_OSD_FLAG_WRITE) { req->r_request->hdr.data_off = cpu_to_le16(off); - req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len); + req->r_request->hdr.data_len = cpu_to_le32(len + data_len); } else if (data_len) { req->r_request->hdr.data_off = 0; req->r_request->hdr.data_len = cpu_to_le32(data_len); @@ -464,7 +464,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req->r_num_pages = calc_pages_for(page_align, *plen); req->r_page_alignment = page_align; - ceph_osdc_build_request(req, off, plen, ops, + ceph_osdc_build_request(req, off, *plen, ops, snapc, mtime); From e8afad656cbcd06d02a7bacd4b318fa0e2907de0 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 14 Nov 2012 09:38:19 -0600 Subject: [PATCH 37/63] libceph: pass length to ceph_calc_file_object_mapping() ceph_calc_file_object_mapping() takes (among other things) a "file" offset and length, and based on the layout, determines the object number ("bno") backing the affected portion of the file's data and the offset into that object where the desired range begins. It also computes the size that should be used for the request--either the amount requested or something less if that would exceed the end of the object. This patch changes the input length parameter in this function so it is used only for input. That is, the argument will be passed by value rather than by address, so the value provided won't get updated by the function. The value would only get updated if the length would surpass the current object, and in that case the value it got updated to would be exactly that returned in *oxlen. Only one of the two callers is affected by this change. Update ceph_calc_raw_layout() so it records any updated value. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- fs/ceph/ioctl.c | 2 +- include/linux/ceph/osdmap.h | 2 +- net/ceph/osd_client.c | 6 ++++-- net/ceph/osdmap.c | 9 ++++----- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 36549a46e311..3b22150d3e19 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -194,7 +194,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) return -EFAULT; down_read(&osdc->map_sem); - r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, + r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, &dl.object_no, &dl.object_offset, &olen); if (r < 0) diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 5ea57ba69320..1f653e2ff5cc 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -110,7 +110,7 @@ extern void ceph_osdmap_destroy(struct ceph_osdmap *map); /* calculate mapping of a file extent to an object */ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, - u64 off, u64 *plen, + u64 off, u64 len, u64 *bno, u64 *oxoff, u64 *oxlen); /* calculate mapping of object to a placement group */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index e29a3ed92958..47e5f5b1f94c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -53,13 +53,15 @@ int ceph_calc_raw_layout(struct ceph_osd_client *osdc, reqhead->snapid = cpu_to_le64(snapid); /* object extent? */ - r = ceph_calc_file_object_mapping(layout, off, plen, bno, + r = ceph_calc_file_object_mapping(layout, off, orig_len, bno, &objoff, &objlen); if (r < 0) return r; - if (*plen < orig_len) + if (objlen < orig_len) { + *plen = objlen; dout(" skipping last %llu, final file extent %llu~%llu\n", orig_len - *plen, off, *plen); + } if (op_has_extent(op->op)) { u32 osize = le32_to_cpu(layout->fl_object_size); diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index ca05871635bc..369f03ba9ee5 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1016,7 +1016,7 @@ bad: * pass a stride back to the caller. */ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, - u64 off, u64 *plen, + u64 off, u64 len, u64 *ono, u64 *oxoff, u64 *oxlen) { @@ -1027,7 +1027,7 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u32 su_per_object; u64 t, su_offset; - dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, + dout("mapping %llu~%llu osize %u fl_su %u\n", off, len, osize, su); if (su == 0 || sc == 0) goto invalid; @@ -1060,11 +1060,10 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, /* * Calculate the length of the extent being written to the selected - * object. This is the minimum of the full length requested (plen) or + * object. This is the minimum of the full length requested (len) or * the remainder of the current stripe being written to. */ - *oxlen = min_t(u64, *plen, su - su_offset); - *plen = *oxlen; + *oxlen = min_t(u64, len, su - su_offset); dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); return 0; From 4d6b250bf18d44571d69a0f4afec4b6a1969729f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: [PATCH 38/63] libceph: drop snapid in ceph_calc_raw_layout() A snapshot id must be provided to ceph_calc_raw_layout() even though it is not needed at all for calculating the layout. Where the snapshot id *is* needed is when building the request message for an osd operation. Drop the snapid parameter from ceph_calc_raw_layout() and pass that value instead in ceph_osdc_build_request(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 4 ++-- include/linux/ceph/osd_client.h | 2 +- net/ceph/osd_client.c | 14 ++++---------- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c1b135b6cb97..fa371868e9b0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1169,10 +1169,10 @@ static int rbd_do_request(struct request *rq, rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); ret = ceph_calc_raw_layout(osdc, &osd_req->r_file_layout, - snapid, ofs, &len, &bno, osd_req, ops); + ofs, &len, &bno, osd_req, ops); rbd_assert(ret == 0); - ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, &mtime); + ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, snapid, &mtime); if (linger_req) { ceph_osdc_set_request_linger(osdc, osd_req); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 4bfb4582439a..0e82a0a967ef 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -209,7 +209,6 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, - u64 snapid, u64 off, u64 *plen, u64 *bno, struct ceph_osd_request *req, struct ceph_osd_req_op *op); @@ -227,6 +226,7 @@ extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 len, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, + u64 snap_id, struct timespec *mtime); extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 47e5f5b1f94c..b5a4b2875e8a 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -40,18 +40,14 @@ static int op_has_extent(int op) int ceph_calc_raw_layout(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, - u64 snapid, u64 off, u64 *plen, u64 *bno, struct ceph_osd_request *req, struct ceph_osd_req_op *op) { - struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; u64 orig_len = *plen; u64 objoff, objlen; /* extent in object */ int r; - reqhead->snapid = cpu_to_le64(snapid); - /* object extent? */ r = ceph_calc_file_object_mapping(layout, off, orig_len, bno, &objoff, &objlen); @@ -121,8 +117,7 @@ static int calc_layout(struct ceph_osd_client *osdc, u64 bno; int r; - r = ceph_calc_raw_layout(osdc, layout, vino.snap, off, - plen, &bno, req, op); + r = ceph_calc_raw_layout(osdc, layout, off, plen, &bno, req, op); if (r < 0) return r; @@ -340,7 +335,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req, void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 len, struct ceph_osd_req_op *src_ops, - struct ceph_snap_context *snapc, + struct ceph_snap_context *snapc, u64 snap_id, struct timespec *mtime) { struct ceph_msg *msg = req->r_request; @@ -355,6 +350,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, int i; head = msg->front.iov_base; + head->snapid = cpu_to_le64(snap_id); op = (void *)(head + 1); p = (void *)(op + num_op); @@ -466,9 +462,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req->r_num_pages = calc_pages_for(page_align, *plen); req->r_page_alignment = page_align; - ceph_osdc_build_request(req, off, *plen, ops, - snapc, - mtime); + ceph_osdc_build_request(req, off, *plen, ops, snapc, vino.snap, mtime); return req; } From e75b45cf36565fd8ba206a9d80f670a86e61ba2f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:14 -0600 Subject: [PATCH 39/63] libceph: drop osdc from ceph_calc_raw_layout() The osdc parameter to ceph_calc_raw_layout() is not used, so get rid of it. Consequently, the corresponding parameter in calc_layout() becomes unused, so get rid of that as well. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 2 +- include/linux/ceph/osd_client.h | 3 +-- net/ceph/osd_client.c | 10 ++++------ 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index fa371868e9b0..ac8fd3856509 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1168,7 +1168,7 @@ static int rbd_do_request(struct request *rq, osd_req->r_oid_len = strlen(osd_req->r_oid); rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); - ret = ceph_calc_raw_layout(osdc, &osd_req->r_file_layout, + ret = ceph_calc_raw_layout(&osd_req->r_file_layout, ofs, &len, &bno, osd_req, ops); rbd_assert(ret == 0); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 0e82a0a967ef..fe3a6e8db1f9 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -207,8 +207,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); -extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc, - struct ceph_file_layout *layout, +extern int ceph_calc_raw_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, u64 *bno, struct ceph_osd_request *req, struct ceph_osd_req_op *op); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index b5a4b2875e8a..cd9c28387de3 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -38,8 +38,7 @@ static int op_has_extent(int op) op == CEPH_OSD_OP_WRITE); } -int ceph_calc_raw_layout(struct ceph_osd_client *osdc, - struct ceph_file_layout *layout, +int ceph_calc_raw_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, u64 *bno, struct ceph_osd_request *req, struct ceph_osd_req_op *op) @@ -107,8 +106,7 @@ EXPORT_SYMBOL(ceph_calc_raw_layout); * * fill osd op in request message. */ -static int calc_layout(struct ceph_osd_client *osdc, - struct ceph_vino vino, +static int calc_layout(struct ceph_vino vino, struct ceph_file_layout *layout, u64 off, u64 *plen, struct ceph_osd_request *req, @@ -117,7 +115,7 @@ static int calc_layout(struct ceph_osd_client *osdc, u64 bno; int r; - r = ceph_calc_raw_layout(osdc, layout, off, plen, &bno, req, op); + r = ceph_calc_raw_layout(layout, off, plen, &bno, req, op); if (r < 0) return r; @@ -452,7 +450,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, return ERR_PTR(-ENOMEM); /* calculate max write size */ - r = calc_layout(osdc, vino, layout, off, plen, req, ops); + r = calc_layout(vino, layout, off, plen, req, ops); if (r < 0) return ERR_PTR(r); req->r_file_layout = *layout; /* keep a copy */ From d178a9e74006e80f568d87e29f2a68f14fc7cbb1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: [PATCH 40/63] libceph: don't set flags in ceph_osdc_alloc_request() The only thing ceph_osdc_alloc_request() really does with the flags value it is passed is assign it to the newly-created osd request structure. Do that in the caller instead. Both callers subsequently call ceph_osdc_build_request(), so have that function (instead of ceph_osdc_alloc_request()) issue a warning if a request comes through with neither the read nor write flags set. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 3 ++- include/linux/ceph/osd_client.h | 1 - net/ceph/osd_client.c | 11 ++++------- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ac8fd3856509..bdbaa4cfd9d3 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1148,13 +1148,14 @@ static int rbd_do_request(struct request *rq, (unsigned long long) len, coll, coll_index); osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, + osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, false, GFP_NOIO, pages, bio); if (!osd_req) { ret = -ENOMEM; goto done_pages; } + osd_req->r_flags = flags; osd_req->r_callback = rbd_cb; rbd_req->rq = rq; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index fe3a6e8db1f9..6ddda5bbd1a6 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -213,7 +213,6 @@ extern int ceph_calc_raw_layout(struct ceph_file_layout *layout, struct ceph_osd_req_op *op); extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, - int flags, struct ceph_snap_context *snapc, struct ceph_osd_req_op *ops, bool use_mempool, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index cd9c28387de3..77ce1edaa07d 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -171,7 +171,6 @@ static int get_num_ops(struct ceph_osd_req_op *ops) } struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, - int flags, struct ceph_snap_context *snapc, struct ceph_osd_req_op *ops, bool use_mempool, @@ -208,10 +207,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, INIT_LIST_HEAD(&req->r_req_lru_item); INIT_LIST_HEAD(&req->r_osd_item); - req->r_flags = flags; - - WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); - /* create reply message */ if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); @@ -347,6 +342,8 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 data_len = 0; int i; + WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); + head = msg->front.iov_base; head->snapid = cpu_to_le64(snap_id); op = (void *)(head + 1); @@ -442,12 +439,12 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, } else ops[1].op = 0; - req = ceph_osdc_alloc_request(osdc, flags, - snapc, ops, + req = ceph_osdc_alloc_request(osdc, snapc, ops, use_mempool, GFP_NOFS, NULL, NULL); if (!req) return ERR_PTR(-ENOMEM); + req->r_flags = flags; /* calculate max write size */ r = calc_layout(vino, layout, off, plen, req, ops); From 54a5400721da7fa5a16cea151aade5bdfee74111 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: [PATCH 41/63] libceph: don't set pages or bio in ceph_osdc_alloc_request() Only one of the two callers of ceph_osdc_alloc_request() provides page or bio data for its payload. And essentially all that function was doing with those arguments was assigning them to fields in the osd request structure. Simplify ceph_osdc_alloc_request() by having the caller take care of making those assignments Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 8 ++++++-- include/linux/ceph/osd_client.h | 4 +--- net/ceph/osd_client.c | 15 ++------------- 3 files changed, 9 insertions(+), 18 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index bdbaa4cfd9d3..d1445df6398a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1148,14 +1148,18 @@ static int rbd_do_request(struct request *rq, (unsigned long long) len, coll, coll_index); osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, - false, GFP_NOIO, pages, bio); + osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, false, GFP_NOIO); if (!osd_req) { ret = -ENOMEM; goto done_pages; } osd_req->r_flags = flags; + osd_req->r_pages = pages; + if (bio) { + osd_req->r_bio = bio; + bio_get(osd_req->r_bio); + } osd_req->r_callback = rbd_cb; rbd_req->rq = rq; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 6ddda5bbd1a6..75f56d372d44 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -216,9 +216,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * struct ceph_snap_context *snapc, struct ceph_osd_req_op *ops, bool use_mempool, - gfp_t gfp_flags, - struct page **pages, - struct bio *bio); + gfp_t gfp_flags); extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 len, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 77ce1edaa07d..bdc3bb12bfd5 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -174,9 +174,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, struct ceph_osd_req_op *ops, bool use_mempool, - gfp_t gfp_flags, - struct page **pages, - struct bio *bio) + gfp_t gfp_flags) { struct ceph_osd_request *req; struct ceph_msg *msg; @@ -237,13 +235,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, memset(msg->front.iov_base, 0, msg->front.iov_len); req->r_request = msg; - req->r_pages = pages; -#ifdef CONFIG_BLOCK - if (bio) { - req->r_bio = bio; - bio_get(req->r_bio); - } -#endif return req; } @@ -439,9 +430,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, } else ops[1].op = 0; - req = ceph_osdc_alloc_request(osdc, snapc, ops, - use_mempool, - GFP_NOFS, NULL, NULL); + req = ceph_osdc_alloc_request(osdc, snapc, ops, use_mempool, GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); req->r_flags = flags; From d07c09589f533db9ab500ac38151bc9f3a4d0648 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: [PATCH 42/63] rbd: pass num_op with ops array Add a num_op parameter to rbd_do_request() and rbd_req_sync_op() to indicate the number of entries in the array. The callers of these functions always know how many entries are in the array, so just pass that information down. This is in anticipation of eliminating the extra zero-filled entry in these ops arrays. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d1445df6398a..b6872d3cb04c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1119,6 +1119,7 @@ static int rbd_do_request(struct request *rq, struct page **pages, int num_pages, int flags, + unsigned int num_op, struct ceph_osd_req_op *ops, struct rbd_req_coll *coll, int coll_index, @@ -1259,6 +1260,7 @@ static void rbd_simple_req_cb(struct ceph_osd_request *osd_req, */ static int rbd_req_sync_op(struct rbd_device *rbd_dev, int flags, + unsigned int num_op, struct ceph_osd_req_op *ops, const char *object_name, u64 ofs, u64 inbound_size, @@ -1281,7 +1283,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, object_name, ofs, inbound_size, NULL, pages, num_pages, flags, - ops, + num_op, ops, NULL, 0, NULL, linger_req, ver); @@ -1351,7 +1353,7 @@ static int rbd_do_op(struct request *rq, bio, NULL, 0, flags, - ops, + 1, ops, coll, coll_index, rbd_req_cb, 0, NULL); if (ret < 0) @@ -1380,7 +1382,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, return -ENOMEM; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, - ops, object_name, ofs, len, buf, NULL, ver); + 1, ops, object_name, ofs, len, buf, NULL, ver); rbd_destroy_ops(ops); return ret; @@ -1408,7 +1410,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, rbd_dev->header_name, 0, 0, NULL, NULL, 0, CEPH_OSD_FLAG_READ, - ops, + 1, ops, NULL, 0, rbd_simple_req_cb, 0, NULL); @@ -1460,7 +1462,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev) ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - ops, + 1, ops, rbd_dev->header_name, 0, 0, NULL, &rbd_dev->watch_request, NULL); @@ -1497,7 +1499,7 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - ops, + 1, ops, rbd_dev->header_name, 0, 0, NULL, NULL, NULL); @@ -1548,7 +1550,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, ops[0].cls.indata = outbound; ops[0].cls.indata_len = outbound_size; - ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, ops, + ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1, ops, object_name, 0, inbound_size, inbound, NULL, ver); From ae7ca4a35b1f5df86e2c32b2cfc01a8d528c7b8c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: [PATCH 43/63] libceph: pass num_op with ops Both ceph_osdc_alloc_request() and ceph_osdc_build_request() are provided an array of ceph osd request operations. Rather than just passing the number of operations in the array, the caller is required append an additional zeroed operation structure to signal the end of the array. All callers know the number of operations at the time these functions are called, so drop the silly zero entry and supply that number directly. As a result, get_num_ops() is no longer needed. This also means that ceph_osdc_alloc_request() never uses its ops argument, so that can be dropped. Also rbd_create_rw_ops() no longer needs to add one to reserve room for the additional op. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 9 ++++--- include/linux/ceph/osd_client.h | 3 ++- net/ceph/osd_client.c | 43 ++++++++++++--------------------- 3 files changed, 22 insertions(+), 33 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b6872d3cb04c..88de8ccb29bd 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1026,12 +1026,12 @@ out_err: /* * helpers for osd request op vectors. */ -static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops, +static struct ceph_osd_req_op *rbd_create_rw_ops(int num_op, int opcode, u32 payload_len) { struct ceph_osd_req_op *ops; - ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO); + ops = kzalloc(num_op * sizeof (*ops), GFP_NOIO); if (!ops) return NULL; @@ -1149,7 +1149,7 @@ static int rbd_do_request(struct request *rq, (unsigned long long) len, coll, coll_index); osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, false, GFP_NOIO); + osd_req = ceph_osdc_alloc_request(osdc, snapc, num_op, false, GFP_NOIO); if (!osd_req) { ret = -ENOMEM; goto done_pages; @@ -1178,7 +1178,8 @@ static int rbd_do_request(struct request *rq, ofs, &len, &bno, osd_req, ops); rbd_assert(ret == 0); - ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, snapid, &mtime); + ceph_osdc_build_request(osd_req, ofs, len, num_op, ops, + snapc, snapid, &mtime); if (linger_req) { ceph_osdc_set_request_linger(osdc, osd_req); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 75f56d372d44..2b04d054e09d 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -214,12 +214,13 @@ extern int ceph_calc_raw_layout(struct ceph_file_layout *layout, extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, - struct ceph_osd_req_op *ops, + unsigned int num_op, bool use_mempool, gfp_t gfp_flags); extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 len, + unsigned int num_op, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, u64 snap_id, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index bdc3bb12bfd5..500ae8b49321 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -160,25 +160,14 @@ void ceph_osdc_release_request(struct kref *kref) } EXPORT_SYMBOL(ceph_osdc_release_request); -static int get_num_ops(struct ceph_osd_req_op *ops) -{ - int i = 0; - - while (ops[i].op) - i++; - - return i; -} - struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, - struct ceph_osd_req_op *ops, + unsigned int num_op, bool use_mempool, gfp_t gfp_flags) { struct ceph_osd_request *req; struct ceph_msg *msg; - int num_op = get_num_ops(ops); size_t msg_size = sizeof(struct ceph_osd_request_head); msg_size += num_op*sizeof(struct ceph_osd_op); @@ -317,7 +306,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req, * */ void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, u64 len, + u64 off, u64 len, unsigned int num_op, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, u64 snap_id, struct timespec *mtime) @@ -327,7 +316,6 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, struct ceph_osd_req_op *src_op; struct ceph_osd_op *op; void *p; - int num_op = get_num_ops(src_ops); size_t msg_size = sizeof(*head) + num_op*sizeof(*op); int flags = req->r_flags; u64 data_len = 0; @@ -346,20 +334,17 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, head->flags = cpu_to_le32(flags); if (flags & CEPH_OSD_FLAG_WRITE) ceph_encode_timespec(&head->mtime, mtime); + BUG_ON(num_op > (unsigned int) ((u16) -1)); head->num_ops = cpu_to_le16(num_op); - /* fill in oid */ head->object_len = cpu_to_le32(req->r_oid_len); memcpy(p, req->r_oid, req->r_oid_len); p += req->r_oid_len; src_op = src_ops; - while (src_op->op) { - osd_req_encode_op(req, op, src_op); - src_op++; - op++; - } + while (num_op--) + osd_req_encode_op(req, op++, src_op++); data_len += req->r_trail.length; @@ -414,23 +399,24 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, bool use_mempool, int num_reply, int page_align) { - struct ceph_osd_req_op ops[3]; + struct ceph_osd_req_op ops[2]; struct ceph_osd_request *req; + unsigned int num_op = 1; int r; + memset(&ops, 0, sizeof ops); + ops[0].op = opcode; ops[0].extent.truncate_seq = truncate_seq; ops[0].extent.truncate_size = truncate_size; - ops[0].payload_len = 0; if (do_sync) { ops[1].op = CEPH_OSD_OP_STARTSYNC; - ops[1].payload_len = 0; - ops[2].op = 0; - } else - ops[1].op = 0; + num_op++; + } - req = ceph_osdc_alloc_request(osdc, snapc, ops, use_mempool, GFP_NOFS); + req = ceph_osdc_alloc_request(osdc, snapc, num_op, use_mempool, + GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); req->r_flags = flags; @@ -446,7 +432,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req->r_num_pages = calc_pages_for(page_align, *plen); req->r_page_alignment = page_align; - ceph_osdc_build_request(req, off, *plen, ops, snapc, vino.snap, mtime); + ceph_osdc_build_request(req, off, *plen, num_op, ops, + snapc, vino.snap, mtime); return req; } From 139b4318ad93ae4370d88882ff89b42dcbfaaab1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: [PATCH 44/63] rbd: there is really only one op Throughout the rbd code there are spots where it appears we can handle an osd request containing more than one osd request op. But that is only the way it appears. In fact, currently only one operation at a time can be supported, and supporting more than one will require much more than fleshing out the support that's there now. This patch changes names to make it perfectly clear that anywhere we're dealing with a block of ops, we're in fact dealing with exactly one of them. We'll be able to simplify some things as a result. When multiple op support is implemented, we can update things again accordingly. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 118 +++++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 62 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 88de8ccb29bd..cc8924d8f263 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1023,32 +1023,26 @@ out_err: return NULL; } -/* - * helpers for osd request op vectors. - */ -static struct ceph_osd_req_op *rbd_create_rw_ops(int num_op, - int opcode, u32 payload_len) +static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u32 payload_len) { - struct ceph_osd_req_op *ops; + struct ceph_osd_req_op *op; - ops = kzalloc(num_op * sizeof (*ops), GFP_NOIO); - if (!ops) + op = kzalloc(sizeof (*op), GFP_NOIO); + if (!op) return NULL; - - ops[0].op = opcode; - /* * op extent offset and length will be set later on * in calc_raw_layout() */ - ops[0].payload_len = payload_len; + op->op = opcode; + op->payload_len = payload_len; - return ops; + return op; } -static void rbd_destroy_ops(struct ceph_osd_req_op *ops) +static void rbd_destroy_op(struct ceph_osd_req_op *op) { - kfree(ops); + kfree(op); } static void rbd_coll_end_req_index(struct request *rq, @@ -1314,7 +1308,7 @@ static int rbd_do_op(struct request *rq, u64 seg_ofs; u64 seg_len; int ret; - struct ceph_osd_req_op *ops; + struct ceph_osd_req_op *op; u32 payload_len; int opcode; int flags; @@ -1340,8 +1334,8 @@ static int rbd_do_op(struct request *rq, } ret = -ENOMEM; - ops = rbd_create_rw_ops(1, opcode, payload_len); - if (!ops) + op = rbd_create_rw_op(opcode, payload_len); + if (!op) goto done; /* we've taken care of segment sizes earlier when we @@ -1354,13 +1348,13 @@ static int rbd_do_op(struct request *rq, bio, NULL, 0, flags, - 1, ops, + 1, op, coll, coll_index, rbd_req_cb, 0, NULL); if (ret < 0) rbd_coll_end_req_index(rq, coll, coll_index, (s32)ret, seg_len); - rbd_destroy_ops(ops); + rbd_destroy_op(op); done: kfree(seg_name); return ret; @@ -1375,16 +1369,16 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, char *buf, u64 *ver) { - struct ceph_osd_req_op *ops; + struct ceph_osd_req_op *op; int ret; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0); - if (!ops) + op = rbd_create_rw_op(CEPH_OSD_OP_READ, 0); + if (!op) return -ENOMEM; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, - 1, ops, object_name, ofs, len, buf, NULL, ver); - rbd_destroy_ops(ops); + 1, op, object_name, ofs, len, buf, NULL, ver); + rbd_destroy_op(op); return ret; } @@ -1396,26 +1390,26 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, u64 ver, u64 notify_id) { - struct ceph_osd_req_op *ops; + struct ceph_osd_req_op *op; int ret; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0); - if (!ops) + op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0); + if (!op) return -ENOMEM; - ops[0].watch.ver = cpu_to_le64(ver); - ops[0].watch.cookie = notify_id; - ops[0].watch.flag = 0; + op->watch.ver = cpu_to_le64(ver); + op->watch.cookie = notify_id; + op->watch.flag = 0; ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, rbd_dev->header_name, 0, 0, NULL, NULL, 0, CEPH_OSD_FLAG_READ, - 1, ops, + 1, op, NULL, 0, rbd_simple_req_cb, 0, NULL); - rbd_destroy_ops(ops); + rbd_destroy_op(op); return ret; } @@ -1444,12 +1438,12 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) */ static int rbd_req_sync_watch(struct rbd_device *rbd_dev) { - struct ceph_osd_req_op *ops; + struct ceph_osd_req_op *op; struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; int ret; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); - if (!ops) + op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0); + if (!op) return -ENOMEM; ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, @@ -1457,13 +1451,13 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev) if (ret < 0) goto fail; - ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version); - ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); - ops[0].watch.flag = 1; + op->watch.ver = cpu_to_le64(rbd_dev->header.obj_version); + op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); + op->watch.flag = 1; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - 1, ops, + 1, op, rbd_dev->header_name, 0, 0, NULL, &rbd_dev->watch_request, NULL); @@ -1471,14 +1465,14 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev) if (ret < 0) goto fail_event; - rbd_destroy_ops(ops); + rbd_destroy_op(op); return 0; fail_event: ceph_osdc_cancel_event(rbd_dev->watch_event); rbd_dev->watch_event = NULL; fail: - rbd_destroy_ops(ops); + rbd_destroy_op(op); return ret; } @@ -1487,25 +1481,25 @@ fail: */ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) { - struct ceph_osd_req_op *ops; + struct ceph_osd_req_op *op; int ret; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); - if (!ops) + op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0); + if (!op) return -ENOMEM; - ops[0].watch.ver = 0; - ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); - ops[0].watch.flag = 0; + op->watch.ver = 0; + op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); + op->watch.flag = 0; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - 1, ops, + 1, op, rbd_dev->header_name, 0, 0, NULL, NULL, NULL); - rbd_destroy_ops(ops); + rbd_destroy_op(op); ceph_osdc_cancel_event(rbd_dev->watch_event); rbd_dev->watch_event = NULL; return ret; @@ -1524,7 +1518,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, size_t inbound_size, u64 *ver) { - struct ceph_osd_req_op *ops; + struct ceph_osd_req_op *op; int class_name_len = strlen(class_name); int method_name_len = strlen(method_name); int payload_size; @@ -1539,23 +1533,23 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, * operation. */ payload_size = class_name_len + method_name_len + outbound_size; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size); - if (!ops) + op = rbd_create_rw_op(CEPH_OSD_OP_CALL, payload_size); + if (!op) return -ENOMEM; - ops[0].cls.class_name = class_name; - ops[0].cls.class_len = (__u8) class_name_len; - ops[0].cls.method_name = method_name; - ops[0].cls.method_len = (__u8) method_name_len; - ops[0].cls.argc = 0; - ops[0].cls.indata = outbound; - ops[0].cls.indata_len = outbound_size; + op->cls.class_name = class_name; + op->cls.class_len = (__u8) class_name_len; + op->cls.method_name = method_name; + op->cls.method_len = (__u8) method_name_len; + op->cls.argc = 0; + op->cls.indata = outbound; + op->cls.indata_len = outbound_size; - ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1, ops, + ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1, op, object_name, 0, inbound_size, inbound, NULL, ver); - rbd_destroy_ops(ops); + rbd_destroy_op(op); dout("cls_exec returned %d\n", ret); return ret; From 30573d680355ca0de4db2113b9080cd078ac726f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: [PATCH 45/63] rbd: assume single op in a request We now know that every of rbd_req_sync_op() passes an array of exactly one operation, as evidenced by all callers passing 1 as its num_op argument. So get rid of that argument, assuming a single op. Similarly, we now know that all callers of rbd_do_request() pass 1 as the num_op value, so that parameter can be eliminated as well. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index cc8924d8f263..7ce178fd6fe5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1113,8 +1113,7 @@ static int rbd_do_request(struct request *rq, struct page **pages, int num_pages, int flags, - unsigned int num_op, - struct ceph_osd_req_op *ops, + struct ceph_osd_req_op *op, struct rbd_req_coll *coll, int coll_index, void (*rbd_cb)(struct ceph_osd_request *, @@ -1143,7 +1142,7 @@ static int rbd_do_request(struct request *rq, (unsigned long long) len, coll, coll_index); osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, num_op, false, GFP_NOIO); + osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO); if (!osd_req) { ret = -ENOMEM; goto done_pages; @@ -1169,10 +1168,10 @@ static int rbd_do_request(struct request *rq, rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); ret = ceph_calc_raw_layout(&osd_req->r_file_layout, - ofs, &len, &bno, osd_req, ops); + ofs, &len, &bno, osd_req, op); rbd_assert(ret == 0); - ceph_osdc_build_request(osd_req, ofs, len, num_op, ops, + ceph_osdc_build_request(osd_req, ofs, len, 1, op, snapc, snapid, &mtime); if (linger_req) { @@ -1255,8 +1254,7 @@ static void rbd_simple_req_cb(struct ceph_osd_request *osd_req, */ static int rbd_req_sync_op(struct rbd_device *rbd_dev, int flags, - unsigned int num_op, - struct ceph_osd_req_op *ops, + struct ceph_osd_req_op *op, const char *object_name, u64 ofs, u64 inbound_size, char *inbound, @@ -1267,7 +1265,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, struct page **pages; int num_pages; - rbd_assert(ops != NULL); + rbd_assert(op != NULL); num_pages = calc_pages_for(ofs, inbound_size); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); @@ -1278,7 +1276,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, object_name, ofs, inbound_size, NULL, pages, num_pages, flags, - num_op, ops, + op, NULL, 0, NULL, linger_req, ver); @@ -1348,7 +1346,7 @@ static int rbd_do_op(struct request *rq, bio, NULL, 0, flags, - 1, op, + op, coll, coll_index, rbd_req_cb, 0, NULL); if (ret < 0) @@ -1377,7 +1375,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, return -ENOMEM; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, - 1, op, object_name, ofs, len, buf, NULL, ver); + op, object_name, ofs, len, buf, NULL, ver); rbd_destroy_op(op); return ret; @@ -1405,7 +1403,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, rbd_dev->header_name, 0, 0, NULL, NULL, 0, CEPH_OSD_FLAG_READ, - 1, op, + op, NULL, 0, rbd_simple_req_cb, 0, NULL); @@ -1457,7 +1455,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev) ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - 1, op, + op, rbd_dev->header_name, 0, 0, NULL, &rbd_dev->watch_request, NULL); @@ -1494,7 +1492,7 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - 1, op, + op, rbd_dev->header_name, 0, 0, NULL, NULL, NULL); @@ -1545,7 +1543,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, op->cls.indata = outbound; op->cls.indata_len = outbound_size; - ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1, op, + ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op, object_name, 0, inbound_size, inbound, NULL, ver); From 2b5fc648af5eec2f4fe984cb6b926214e02c5cf4 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 14 Nov 2012 09:38:20 -0600 Subject: [PATCH 46/63] rbd: kill ceph_osd_req_op->flags The flags field of struct ceph_osd_req_op is never used, so just get rid of it. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 2b04d054e09d..69287ccfe68a 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -157,7 +157,6 @@ struct ceph_osd_client { struct ceph_osd_req_op { u16 op; /* CEPH_OSD_OP_* */ - u32 flags; /* CEPH_OSD_FLAG_* */ union { struct { u64 offset, length; From 0829661863fb5c8031c1c5c119693ea157517783 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 14 Nov 2012 12:25:18 -0600 Subject: [PATCH 47/63] rbd: pull in ceph_calc_raw_layout() This is the first in a series of patches aimed at eliminating the use of ceph_calc_raw_layout() by rbd. It simply pulls in a copy of that function and renames it rbd_calc_raw_layout(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 7ce178fd6fe5..eee0d3649dcf 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1101,6 +1101,40 @@ static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id) layout->fl_pg_pool = cpu_to_le32((u32) pool_id); } +int rbd_calc_raw_layout(struct ceph_file_layout *layout, + u64 off, u64 *plen, u64 *bno, + struct ceph_osd_request *req, + struct ceph_osd_req_op *op) +{ + u64 orig_len = *plen; + u64 objoff, objlen; /* extent in object */ + int r; + + /* object extent? */ + r = ceph_calc_file_object_mapping(layout, off, orig_len, bno, + &objoff, &objlen); + if (r < 0) + return r; + if (objlen < orig_len) { + *plen = objlen; + dout(" skipping last %llu, final file extent %llu~%llu\n", + orig_len - *plen, off, *plen); + } + + if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { + op->extent.offset = objoff; + op->extent.length = objlen; + } + req->r_num_pages = calc_pages_for(off, *plen); + req->r_page_alignment = off & ~PAGE_MASK; + if (op->op == CEPH_OSD_OP_WRITE) + op->payload_len = *plen; + + dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", + *bno, objoff, objlen, req->r_num_pages); + return 0; +} + /* * Send ceph osd request */ @@ -1167,7 +1201,7 @@ static int rbd_do_request(struct request *rq, osd_req->r_oid_len = strlen(osd_req->r_oid); rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); - ret = ceph_calc_raw_layout(&osd_req->r_file_layout, + ret = rbd_calc_raw_layout(&osd_req->r_file_layout, ofs, &len, &bno, osd_req, op); rbd_assert(ret == 0); From e01e79273b251dbb35ff2522a688229b09481923 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 14 Nov 2012 12:25:18 -0600 Subject: [PATCH 48/63] rbd: open code rbd_calc_raw_layout() This patch gets rid of rbd_calc_raw_layout() by simply open coding it in its one caller. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 55 +++++++++++++++------------------------------ 1 file changed, 18 insertions(+), 37 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index eee0d3649dcf..f5f4e4a8fc87 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1032,7 +1032,7 @@ static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u32 payload_len) return NULL; /* * op extent offset and length will be set later on - * in calc_raw_layout() + * after ceph_calc_file_object_mapping(). */ op->op = opcode; op->payload_len = payload_len; @@ -1101,40 +1101,6 @@ static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id) layout->fl_pg_pool = cpu_to_le32((u32) pool_id); } -int rbd_calc_raw_layout(struct ceph_file_layout *layout, - u64 off, u64 *plen, u64 *bno, - struct ceph_osd_request *req, - struct ceph_osd_req_op *op) -{ - u64 orig_len = *plen; - u64 objoff, objlen; /* extent in object */ - int r; - - /* object extent? */ - r = ceph_calc_file_object_mapping(layout, off, orig_len, bno, - &objoff, &objlen); - if (r < 0) - return r; - if (objlen < orig_len) { - *plen = objlen; - dout(" skipping last %llu, final file extent %llu~%llu\n", - orig_len - *plen, off, *plen); - } - - if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { - op->extent.offset = objoff; - op->extent.length = objlen; - } - req->r_num_pages = calc_pages_for(off, *plen); - req->r_page_alignment = off & ~PAGE_MASK; - if (op->op == CEPH_OSD_OP_WRITE) - op->payload_len = *plen; - - dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", - *bno, objoff, objlen, req->r_num_pages); - return 0; -} - /* * Send ceph osd request */ @@ -1158,6 +1124,8 @@ static int rbd_do_request(struct request *rq, struct ceph_osd_request *osd_req; int ret; u64 bno; + u64 obj_off = 0; + u64 obj_len = 0; struct timespec mtime = CURRENT_TIME; struct rbd_request *rbd_req; struct ceph_osd_client *osdc; @@ -1201,9 +1169,22 @@ static int rbd_do_request(struct request *rq, osd_req->r_oid_len = strlen(osd_req->r_oid); rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); - ret = rbd_calc_raw_layout(&osd_req->r_file_layout, - ofs, &len, &bno, osd_req, op); + ret = ceph_calc_file_object_mapping(&osd_req->r_file_layout, ofs, len, + &bno, &obj_off, &obj_len); rbd_assert(ret == 0); + if (obj_len < len) { + dout(" skipping last %llu, final file extent %llu~%llu\n", + len - obj_len, ofs, obj_len); + len = obj_len; + } + if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { + op->extent.offset = obj_off; + op->extent.length = obj_len; + if (op->op == CEPH_OSD_OP_WRITE) + op->payload_len = obj_len; + } + osd_req->r_num_pages = calc_pages_for(ofs, len); + osd_req->r_page_alignment = ofs & ~PAGE_MASK; ceph_osdc_build_request(osd_req, ofs, len, 1, op, snapc, snapid, &mtime); From 47dba7ba2623b088cbbe1ac0aaa1a034f3249b6d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 14 Nov 2012 12:25:19 -0600 Subject: [PATCH 49/63] rbd: don't bother calculating file mapping When rbd_do_request() has a request to process it initializes a ceph file layout structure and uses it to compute offsets and limits for the range of the request using ceph_calc_file_object_mapping(). The layout used is fixed, and is based on RBD_MAX_OBJ_ORDER (30). It sets the layout's object size and stripe unit to be 1 GB (2^30), and sets the stripe count to be 1. The job of ceph_calc_file_object_mapping() is to determine which of a sequence of objects will contain data covered by range, and within that object, at what offset the range starts. It also truncates the length of the range at the end of the selected object if necessary. This is needed for ceph fs, but for rbd it really serves no purpose. It does its own blocking of images into objects, echo of which is (1 << obj_order) in size, and as a result it ignores the "bno" value returned by ceph_calc_file_object_mapping(). In addition, by the point a request has reached this function, it is already destined for a single rbd object, and its length will not exceed that object's extent. Because of this, and because the mapping will result in blocking up the range using an integer multiple of the image's object order, ceph_calc_file_object_mapping() will never change the offset or length values defined by the request. In other words, this call is a big no-op for rbd data requests. There is one exception. We read the header object using this function, and in that case we will not have already limited the request size. However, the header is a single object (not a file or rbd image), and should not be broken into pieces anyway. So in fact we should *not* be calling ceph_calc_file_object_mapping() when operating on the header object. So... Don't call ceph_calc_file_object_mapping() in rbd_do_request(), because useless for image data and incorrect to do sofor the image header. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index f5f4e4a8fc87..1c0192c3cf47 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1123,9 +1123,6 @@ static int rbd_do_request(struct request *rq, { struct ceph_osd_request *osd_req; int ret; - u64 bno; - u64 obj_off = 0; - u64 obj_len = 0; struct timespec mtime = CURRENT_TIME; struct rbd_request *rbd_req; struct ceph_osd_client *osdc; @@ -1169,19 +1166,12 @@ static int rbd_do_request(struct request *rq, osd_req->r_oid_len = strlen(osd_req->r_oid); rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); - ret = ceph_calc_file_object_mapping(&osd_req->r_file_layout, ofs, len, - &bno, &obj_off, &obj_len); - rbd_assert(ret == 0); - if (obj_len < len) { - dout(" skipping last %llu, final file extent %llu~%llu\n", - len - obj_len, ofs, obj_len); - len = obj_len; - } + if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { - op->extent.offset = obj_off; - op->extent.length = obj_len; + op->extent.offset = ofs; + op->extent.length = len; if (op->op == CEPH_OSD_OP_WRITE) - op->payload_len = obj_len; + op->payload_len = len; } osd_req->r_num_pages = calc_pages_for(ofs, len); osd_req->r_page_alignment = ofs & ~PAGE_MASK; From 0903e875caa93e1fb231dd66c69b118dbdad25cb Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 14 Nov 2012 12:25:19 -0600 Subject: [PATCH 50/63] rbd: use a common layout for each device Each osd message includes a layout structure, and for rbd it is always the same (at least for osd's in a given pool). Initialize a layout structure when an rbd_dev gets created and just copy that into osd requests for the rbd image. Replace an assertion that was done when initializing the layout structures with code that catches and handles anything that would trigger the assertion as soon as it is identified. This precludes that (bad) condition from ever occurring. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 1c0192c3cf47..d2a6e9589fab 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -235,6 +235,8 @@ struct rbd_device { char *header_name; + struct ceph_file_layout layout; + struct ceph_osd_event *watch_event; struct ceph_osd_request *watch_request; @@ -1091,16 +1093,6 @@ static void rbd_coll_end_req(struct rbd_request *rbd_req, ret, len); } -static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id) -{ - memset(layout, 0, sizeof (*layout)); - layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - layout->fl_stripe_count = cpu_to_le32(1); - layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - rbd_assert(pool_id <= (u64) U32_MAX); - layout->fl_pg_pool = cpu_to_le32((u32) pool_id); -} - /* * Send ceph osd request */ @@ -1165,7 +1157,7 @@ static int rbd_do_request(struct request *rq, strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid)); osd_req->r_oid_len = strlen(osd_req->r_oid); - rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); + osd_req->r_file_layout = rbd_dev->layout; /* struct */ if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { op->extent.offset = ofs; @@ -2297,6 +2289,13 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, rbd_dev->spec = spec; rbd_dev->rbd_client = rbdc; + /* Initialize the layout used for all rbd requests */ + + rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); + rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); + return rbd_dev; } @@ -2551,6 +2550,12 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) if (parent_spec->pool_id == CEPH_NOPOOL) goto out; /* No parent? No problem. */ + /* The ceph file layout needs to fit pool id in 32 bits */ + + ret = -EIO; + if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX)) + goto out; + image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); if (IS_ERR(image_id)) { ret = PTR_ERR(image_id); @@ -3680,6 +3685,13 @@ static ssize_t rbd_add(struct bus_type *bus, goto err_out_client; spec->pool_id = (u64) rc; + /* The ceph file layout needs to fit pool id in 32 bits */ + + if (WARN_ON(spec->pool_id > (u64) U32_MAX)) { + rc = -EIO; + goto err_out_client; + } + rbd_dev = rbd_dev_create(rbdc, spec); if (!rbd_dev) goto err_out_client; From 907703d050df92979b3848ee42f88d5c9c6c13fe Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 13 Nov 2012 21:11:15 -0600 Subject: [PATCH 51/63] rbd: combine rbd sync watch/unwatch functions The rbd_req_sync_watch() and rbd_req_sync_unwatch() functions are nearly identical. Combine them into a single function with a flag indicating whether a watch is to be initiated or torn down. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 81 +++++++++++++++------------------------------ 1 file changed, 27 insertions(+), 54 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d2a6e9589fab..9d21fcd7e188 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1429,74 +1429,48 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) } /* - * Request sync osd watch + * Request sync osd watch/unwatch. The value of "start" determines + * whether a watch request is being initiated or torn down. */ -static int rbd_req_sync_watch(struct rbd_device *rbd_dev) +static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) { struct ceph_osd_req_op *op; - struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct ceph_osd_request **linger_req = NULL; + __le64 version = 0; int ret; op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0); if (!op) return -ENOMEM; - ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, - (void *)rbd_dev, &rbd_dev->watch_event); - if (ret < 0) - goto fail; + if (start) { + struct ceph_osd_client *osdc; - op->watch.ver = cpu_to_le64(rbd_dev->header.obj_version); + osdc = &rbd_dev->rbd_client->client->osdc; + ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev, + &rbd_dev->watch_event); + if (ret < 0) + goto done; + version = cpu_to_le64(rbd_dev->header.obj_version); + linger_req = &rbd_dev->watch_request; + } + + op->watch.ver = version; op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); - op->watch.flag = 1; + op->watch.flag = (u8) start ? 1 : 0; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - op, - rbd_dev->header_name, - 0, 0, NULL, - &rbd_dev->watch_request, NULL); - - if (ret < 0) - goto fail_event; + op, rbd_dev->header_name, + 0, 0, NULL, linger_req, NULL); + if (!start || ret < 0) { + ceph_osdc_cancel_event(rbd_dev->watch_event); + rbd_dev->watch_event = NULL; + } +done: rbd_destroy_op(op); - return 0; -fail_event: - ceph_osdc_cancel_event(rbd_dev->watch_event); - rbd_dev->watch_event = NULL; -fail: - rbd_destroy_op(op); - return ret; -} - -/* - * Request sync osd unwatch - */ -static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) -{ - struct ceph_osd_req_op *op; - int ret; - - op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0); - if (!op) - return -ENOMEM; - - op->watch.ver = 0; - op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); - op->watch.flag = 0; - - ret = rbd_req_sync_op(rbd_dev, - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - op, - rbd_dev->header_name, - 0, 0, NULL, NULL, NULL); - - - rbd_destroy_op(op); - ceph_osdc_cancel_event(rbd_dev->watch_event); - rbd_dev->watch_event = NULL; return ret; } @@ -3033,7 +3007,7 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev) int ret, rc; do { - ret = rbd_req_sync_watch(rbd_dev); + ret = rbd_req_sync_watch(rbd_dev, 1); if (ret == -ERANGE) { rc = rbd_dev_refresh(rbd_dev, NULL); if (rc < 0) @@ -3752,8 +3726,7 @@ static void rbd_dev_release(struct device *dev) rbd_dev->watch_request); } if (rbd_dev->watch_event) - rbd_req_sync_unwatch(rbd_dev); - + rbd_req_sync_watch(rbd_dev, 0); /* clean up and free blkdev */ rbd_free_disk(rbd_dev); From 2e53c6c379b65372df21f4d6019f6eb63af81384 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 30 Nov 2012 09:59:47 -0600 Subject: [PATCH 52/63] rbd: don't leak rbd_req on synchronous requests When rbd_do_request() is called it allocates and populates an rbd_req structure to hold information about the osd request to be sent. This is done for the benefit of the callback function (in particular, rbd_req_cb()), which uses this in processing when the request completes. Synchronous requests provide no callback function, in which case rbd_do_request() waits for the request to complete before returning. This case is not handling the needed free of the rbd_req structure like it should, so it is getting leaked. Note however that the synchronous case has no need for the rbd_req structure at all. So rather than simply freeing this structure for synchronous requests, just don't allocate it to begin with. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 52 ++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 9d21fcd7e188..28b62367ff93 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1113,20 +1113,11 @@ static int rbd_do_request(struct request *rq, struct ceph_osd_request **linger_req, u64 *ver) { - struct ceph_osd_request *osd_req; - int ret; - struct timespec mtime = CURRENT_TIME; - struct rbd_request *rbd_req; struct ceph_osd_client *osdc; - - rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO); - if (!rbd_req) - return -ENOMEM; - - if (coll) { - rbd_req->coll = coll; - rbd_req->coll_index = coll_index; - } + struct ceph_osd_request *osd_req; + struct rbd_request *rbd_req = NULL; + struct timespec mtime = CURRENT_TIME; + int ret; dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n", object_name, (unsigned long long) ofs, @@ -1134,10 +1125,8 @@ static int rbd_do_request(struct request *rq, osdc = &rbd_dev->rbd_client->client->osdc; osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO); - if (!osd_req) { - ret = -ENOMEM; - goto done_pages; - } + if (!osd_req) + return -ENOMEM; osd_req->r_flags = flags; osd_req->r_pages = pages; @@ -1145,13 +1134,22 @@ static int rbd_do_request(struct request *rq, osd_req->r_bio = bio; bio_get(osd_req->r_bio); } + + if (rbd_cb) { + ret = -ENOMEM; + rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO); + if (!rbd_req) + goto done_osd_req; + + rbd_req->rq = rq; + rbd_req->bio = bio; + rbd_req->pages = pages; + rbd_req->len = len; + rbd_req->coll = coll; + rbd_req->coll_index = coll ? coll_index : 0; + } + osd_req->r_callback = rbd_cb; - - rbd_req->rq = rq; - rbd_req->bio = bio; - rbd_req->pages = pages; - rbd_req->len = len; - osd_req->r_priv = rbd_req; strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid)); @@ -1193,10 +1191,12 @@ static int rbd_do_request(struct request *rq, return ret; done_err: - bio_chain_put(rbd_req->bio); - ceph_osdc_put_request(osd_req); -done_pages: + if (bio) + bio_chain_put(osd_req->r_bio); kfree(rbd_req); +done_osd_req: + ceph_osdc_put_request(osd_req); + return ret; } From 1821665749a3d7e26ad470c63fc2c46990dc6041 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 30 Nov 2012 09:59:47 -0600 Subject: [PATCH 53/63] rbd: don't leak rbd_req for rbd_req_sync_notify_ack() When rbd_req_sync_notify_ack() calls rbd_do_request() it supplies rbd_simple_req_cb() as its callback function. Because the callback is supplied, an rbd_req structure gets allocated and populated so it can be used by the callback. However rbd_simple_req_cb() is not freeing (or even using) the rbd_req structure, so it's getting leaked. Since rbd_simple_req_cb() has no need for the rbd_req structure, just avoid allocating one for this case. Of the three calls to rbd_do_request(), only the one from rbd_do_op() needs the rbd_req structure, and that call can be distinguished from the other two because it supplies a non-null rbd_collection pointer. So fix this leak by only allocating the rbd_req structure if a non-null "coll" value is provided to rbd_do_request(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 28b62367ff93..619d680960b1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1135,7 +1135,7 @@ static int rbd_do_request(struct request *rq, bio_get(osd_req->r_bio); } - if (rbd_cb) { + if (coll) { ret = -ENOMEM; rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO); if (!rbd_req) @@ -1146,7 +1146,7 @@ static int rbd_do_request(struct request *rq, rbd_req->pages = pages; rbd_req->len = len; rbd_req->coll = coll; - rbd_req->coll_index = coll ? coll_index : 0; + rbd_req->coll_index = coll_index; } osd_req->r_callback = rbd_cb; From c561191813e232aa52022532751855ff5c9fa319 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 19 Nov 2012 22:55:21 -0600 Subject: [PATCH 54/63] rbd: don't assign extent info in rbd_do_request() In rbd_do_request() there's a sort of last-minute assignment of the extent offset and length and payload length for read and write operations. Move those assignments into the caller (in those spots that might initiate read or write operations) Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 619d680960b1..c6917b11800b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1156,13 +1156,6 @@ static int rbd_do_request(struct request *rq, osd_req->r_oid_len = strlen(osd_req->r_oid); osd_req->r_file_layout = rbd_dev->layout; /* struct */ - - if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { - op->extent.offset = ofs; - op->extent.length = len; - if (op->op == CEPH_OSD_OP_WRITE) - op->payload_len = len; - } osd_req->r_num_pages = calc_pages_for(ofs, len); osd_req->r_page_alignment = ofs & ~PAGE_MASK; @@ -1269,6 +1262,13 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, if (IS_ERR(pages)) return PTR_ERR(pages); + if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { + op->extent.offset = ofs; + op->extent.length = inbound_size; + if (op->op == CEPH_OSD_OP_WRITE) + op->payload_len = inbound_size; + } + ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, object_name, ofs, inbound_size, NULL, pages, num_pages, @@ -1332,6 +1332,9 @@ static int rbd_do_op(struct request *rq, op = rbd_create_rw_op(opcode, payload_len); if (!op) goto done; + op->extent.offset = seg_ofs; + op->extent.length = seg_len; + op->payload_len = payload_len; /* we've taken care of segment sizes earlier when we cloned the bios. We should never have a segment From 8d23bf29095e5fab84535035e7a27c4920812c44 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 19 Nov 2012 22:55:21 -0600 Subject: [PATCH 55/63] rbd: don't assign extent info in rbd_req_sync_op() Move the assignment of the extent offset and length and payload length out of rbd_req_sync_op() and into its caller in the one spot where a read (and note--no write) operation might be initiated. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 72 +++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c6917b11800b..235cda083137 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1025,19 +1025,15 @@ out_err: return NULL; } -static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u32 payload_len) +static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u64 ofs, u64 len) { struct ceph_osd_req_op *op; op = kzalloc(sizeof (*op), GFP_NOIO); if (!op) return NULL; - /* - * op extent offset and length will be set later on - * after ceph_calc_file_object_mapping(). - */ + op->op = opcode; - op->payload_len = payload_len; return op; } @@ -1047,6 +1043,42 @@ static void rbd_destroy_op(struct ceph_osd_req_op *op) kfree(op); } +struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) +{ + struct ceph_osd_req_op *op; + va_list args; + + op = kzalloc(sizeof (*op), GFP_NOIO); + if (!op) + return NULL; + op->op = opcode; + va_start(args, opcode); + switch (opcode) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_WRITE: + /* rbd_osd_req_op_create(READ, offset, length) */ + /* rbd_osd_req_op_create(WRITE, offset, length) */ + op->extent.offset = va_arg(args, u64); + op->extent.length = va_arg(args, u64); + if (opcode == CEPH_OSD_OP_WRITE) + op->payload_len = op->extent.length; + break; + default: + rbd_warn(NULL, "unsupported opcode %hu\n", opcode); + kfree(op); + op = NULL; + break; + } + va_end(args); + + return op; +} + +static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op) +{ + kfree(op); +} + static void rbd_coll_end_req_index(struct request *rq, struct rbd_req_coll *coll, int index, @@ -1262,13 +1294,6 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, if (IS_ERR(pages)) return PTR_ERR(pages); - if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { - op->extent.offset = ofs; - op->extent.length = inbound_size; - if (op->op == CEPH_OSD_OP_WRITE) - op->payload_len = inbound_size; - } - ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, object_name, ofs, inbound_size, NULL, pages, num_pages, @@ -1304,7 +1329,6 @@ static int rbd_do_op(struct request *rq, u64 seg_len; int ret; struct ceph_osd_req_op *op; - u32 payload_len; int opcode; int flags; u64 snapid; @@ -1319,22 +1343,17 @@ static int rbd_do_op(struct request *rq, opcode = CEPH_OSD_OP_WRITE; flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK; snapid = CEPH_NOSNAP; - payload_len = seg_len; } else { opcode = CEPH_OSD_OP_READ; flags = CEPH_OSD_FLAG_READ; rbd_assert(!snapc); snapid = rbd_dev->spec->snap_id; - payload_len = 0; } ret = -ENOMEM; - op = rbd_create_rw_op(opcode, payload_len); + op = rbd_osd_req_op_create(opcode, seg_ofs, seg_len); if (!op) goto done; - op->extent.offset = seg_ofs; - op->extent.length = seg_len; - op->payload_len = payload_len; /* we've taken care of segment sizes earlier when we cloned the bios. We should never have a segment @@ -1352,7 +1371,7 @@ static int rbd_do_op(struct request *rq, if (ret < 0) rbd_coll_end_req_index(rq, coll, coll_index, (s32)ret, seg_len); - rbd_destroy_op(op); + rbd_osd_req_op_destroy(op); done: kfree(seg_name); return ret; @@ -1370,13 +1389,13 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, struct ceph_osd_req_op *op; int ret; - op = rbd_create_rw_op(CEPH_OSD_OP_READ, 0); + op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len); if (!op) return -ENOMEM; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op, object_name, ofs, len, buf, NULL, ver); - rbd_destroy_op(op); + rbd_osd_req_op_destroy(op); return ret; } @@ -1391,7 +1410,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, struct ceph_osd_req_op *op; int ret; - op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0); + op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0, 0); if (!op) return -ENOMEM; @@ -1442,7 +1461,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) __le64 version = 0; int ret; - op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0); + op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0, 0); if (!op) return -ENOMEM; @@ -1505,9 +1524,10 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, * operation. */ payload_size = class_name_len + method_name_len + outbound_size; - op = rbd_create_rw_op(CEPH_OSD_OP_CALL, payload_size); + op = rbd_create_rw_op(CEPH_OSD_OP_CALL, 0, 0); if (!op) return -ENOMEM; + op->payload_len = payload_size; op->cls.class_name = class_name; op->cls.class_len = (__u8) class_name_len; From 2647ba38100765298fc67ce1ec5d32e80d9fe046 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 19 Nov 2012 22:55:21 -0600 Subject: [PATCH 56/63] rbd: move call osd op setup into rbd_osd_req_op_create() Move the initialization of the CEPH_OSD_OP_CALL operation into rbd_osd_req_op_create(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 48 ++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 235cda083137..760be82548d9 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -52,10 +52,12 @@ #define SECTOR_SHIFT 9 #define SECTOR_SIZE (1ULL << SECTOR_SHIFT) -/* It might be useful to have this defined elsewhere too */ +/* It might be useful to have these defined elsewhere */ -#define U32_MAX ((u32) (~0U)) -#define U64_MAX ((u64) (~0ULL)) +#define U8_MAX ((u8) (~0U)) +#define U16_MAX ((u16) (~0U)) +#define U32_MAX ((u32) (~0U)) +#define U64_MAX ((u64) (~0ULL)) #define RBD_DRV_NAME "rbd" #define RBD_DRV_NAME_LONG "rbd (rados block device)" @@ -1047,6 +1049,7 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) { struct ceph_osd_req_op *op; va_list args; + size_t size; op = kzalloc(sizeof (*op), GFP_NOIO); if (!op) @@ -1063,6 +1066,27 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) if (opcode == CEPH_OSD_OP_WRITE) op->payload_len = op->extent.length; break; + case CEPH_OSD_OP_CALL: + /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */ + op->cls.class_name = va_arg(args, char *); + size = strlen(op->cls.class_name); + rbd_assert(size <= (size_t) U8_MAX); + op->cls.class_len = size; + op->payload_len = size; + + op->cls.method_name = va_arg(args, char *); + size = strlen(op->cls.method_name); + rbd_assert(size <= (size_t) U8_MAX); + op->cls.method_len = size; + op->payload_len += size; + + op->cls.argc = 0; + op->cls.indata = va_arg(args, void *); + size = va_arg(args, size_t); + rbd_assert(size <= (size_t) U32_MAX); + op->cls.indata_len = (u32) size; + op->payload_len += size; + break; default: rbd_warn(NULL, "unsupported opcode %hu\n", opcode); kfree(op); @@ -1510,9 +1534,6 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, u64 *ver) { struct ceph_osd_req_op *op; - int class_name_len = strlen(class_name); - int method_name_len = strlen(method_name); - int payload_size; int ret; /* @@ -1523,25 +1544,16 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, * the perspective of the server side) in the OSD request * operation. */ - payload_size = class_name_len + method_name_len + outbound_size; - op = rbd_create_rw_op(CEPH_OSD_OP_CALL, 0, 0); + op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name, + method_name, outbound, outbound_size); if (!op) return -ENOMEM; - op->payload_len = payload_size; - - op->cls.class_name = class_name; - op->cls.class_len = (__u8) class_name_len; - op->cls.method_name = method_name; - op->cls.method_len = (__u8) method_name_len; - op->cls.argc = 0; - op->cls.indata = outbound; - op->cls.indata_len = outbound_size; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op, object_name, 0, inbound_size, inbound, NULL, ver); - rbd_destroy_op(op); + rbd_osd_req_op_destroy(op); dout("cls_exec returned %d\n", ret); return ret; From 5efea49a98d1a3b3a7301d3a17f826ad4c31b290 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 19 Nov 2012 22:55:21 -0600 Subject: [PATCH 57/63] rbd: move remaining osd op setup into rbd_osd_req_op_create() The two remaining osd ops used by rbd are CEPH_OSD_OP_WATCH and CEPH_OSD_OP_NOTIFY_ACK. Move the setup of those operations into rbd_osd_req_op_create(), and get rid of rbd_create_rw_op() and rbd_destroy_op(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 68 ++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 41 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 760be82548d9..3802a7857280 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1027,24 +1027,6 @@ out_err: return NULL; } -static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u64 ofs, u64 len) -{ - struct ceph_osd_req_op *op; - - op = kzalloc(sizeof (*op), GFP_NOIO); - if (!op) - return NULL; - - op->op = opcode; - - return op; -} - -static void rbd_destroy_op(struct ceph_osd_req_op *op) -{ - kfree(op); -} - struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) { struct ceph_osd_req_op *op; @@ -1087,6 +1069,16 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) op->cls.indata_len = (u32) size; op->payload_len += size; break; + case CEPH_OSD_OP_NOTIFY_ACK: + case CEPH_OSD_OP_WATCH: + /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */ + /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */ + op->watch.cookie = va_arg(args, u64); + op->watch.ver = va_arg(args, u64); + op->watch.ver = cpu_to_le64(op->watch.ver); + if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int)) + op->watch.flag = (u8) 1; + break; default: rbd_warn(NULL, "unsupported opcode %hu\n", opcode); kfree(op); @@ -1434,14 +1426,10 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, struct ceph_osd_req_op *op; int ret; - op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0, 0); + op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver); if (!op) return -ENOMEM; - op->watch.ver = cpu_to_le64(ver); - op->watch.cookie = notify_id; - op->watch.flag = 0; - ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, rbd_dev->header_name, 0, 0, NULL, NULL, 0, @@ -1450,7 +1438,8 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, NULL, 0, rbd_simple_req_cb, 0, NULL); - rbd_destroy_op(op); + rbd_osd_req_op_destroy(op); + return ret; } @@ -1480,14 +1469,9 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) */ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) { - struct ceph_osd_req_op *op; struct ceph_osd_request **linger_req = NULL; - __le64 version = 0; - int ret; - - op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0, 0); - if (!op) - return -ENOMEM; + struct ceph_osd_req_op *op; + int ret = 0; if (start) { struct ceph_osd_client *osdc; @@ -1496,26 +1480,28 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev, &rbd_dev->watch_event); if (ret < 0) - goto done; - version = cpu_to_le64(rbd_dev->header.obj_version); + return ret; linger_req = &rbd_dev->watch_request; + } else { + rbd_assert(rbd_dev->watch_request != NULL); } - op->watch.ver = version; - op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); - op->watch.flag = (u8) start ? 1 : 0; - - ret = rbd_req_sync_op(rbd_dev, + op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH, + rbd_dev->watch_event->cookie, + rbd_dev->header.obj_version, start); + if (op) + ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, op, rbd_dev->header_name, 0, 0, NULL, linger_req, NULL); - if (!start || ret < 0) { + /* Cancel the event if we're tearing down, or on error */ + + if (!start || !op || ret < 0) { ceph_osdc_cancel_event(rbd_dev->watch_event); rbd_dev->watch_event = NULL; } -done: - rbd_destroy_op(op); + rbd_osd_req_op_destroy(op); return ret; } From 8b84de7940b69fd7326946ba244621aa5fc412e0 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 20 Nov 2012 14:17:17 -0600 Subject: [PATCH 58/63] rbd: assign watch request more directly Both rbd_req_sync_op() and rbd_do_request() have a "linger" parameter, which is the address of a pointer that should refer to the osd request structure used to issue a request to an osd. Only one case ever supplies a non-null "linger" argument: an CEPH_OSD_OP_WATCH start. And in that one case it is assigned &rbd_dev->watch_request. Within rbd_do_request() (where the assignment ultimately gets made) we know the rbd_dev and therefore its watch_request field. We also know whether the op being sent is CEPH_OSD_OP_WATCH start. Stop opaquely passing down the "linger" pointer, and instead just assign the value directly inside rbd_do_request() when it's needed. This makes it unnecessary for rbd_req_sync_watch() to make arrangements to hold a value that's not available until a bit later. This more clearly separates setting up a watch request from submitting it. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 3802a7857280..a3a11c3c1cac 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1158,7 +1158,6 @@ static int rbd_do_request(struct request *rq, int coll_index, void (*rbd_cb)(struct ceph_osd_request *, struct ceph_msg *), - struct ceph_osd_request **linger_req, u64 *ver) { struct ceph_osd_client *osdc; @@ -1210,9 +1209,9 @@ static int rbd_do_request(struct request *rq, ceph_osdc_build_request(osd_req, ofs, len, 1, op, snapc, snapid, &mtime); - if (linger_req) { + if (op->op == CEPH_OSD_OP_WATCH && op->watch.flag) { ceph_osdc_set_request_linger(osdc, osd_req); - *linger_req = osd_req; + rbd_dev->watch_request = osd_req; } ret = ceph_osdc_start_request(osdc, osd_req, false); @@ -1296,7 +1295,6 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, const char *object_name, u64 ofs, u64 inbound_size, char *inbound, - struct ceph_osd_request **linger_req, u64 *ver) { int ret; @@ -1317,7 +1315,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, op, NULL, 0, NULL, - linger_req, ver); + ver); if (ret < 0) goto done; @@ -1383,7 +1381,7 @@ static int rbd_do_op(struct request *rq, flags, op, coll, coll_index, - rbd_req_cb, 0, NULL); + rbd_req_cb, NULL); if (ret < 0) rbd_coll_end_req_index(rq, coll, coll_index, (s32)ret, seg_len); @@ -1410,7 +1408,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, return -ENOMEM; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, - op, object_name, ofs, len, buf, NULL, ver); + op, object_name, ofs, len, buf, ver); rbd_osd_req_op_destroy(op); return ret; @@ -1436,7 +1434,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, CEPH_OSD_FLAG_READ, op, NULL, 0, - rbd_simple_req_cb, 0, NULL); + rbd_simple_req_cb, NULL); rbd_osd_req_op_destroy(op); @@ -1469,7 +1467,6 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) */ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) { - struct ceph_osd_request **linger_req = NULL; struct ceph_osd_req_op *op; int ret = 0; @@ -1481,7 +1478,6 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) &rbd_dev->watch_event); if (ret < 0) return ret; - linger_req = &rbd_dev->watch_request; } else { rbd_assert(rbd_dev->watch_request != NULL); } @@ -1493,7 +1489,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, op, rbd_dev->header_name, - 0, 0, NULL, linger_req, NULL); + 0, 0, NULL, NULL); /* Cancel the event if we're tearing down, or on error */ @@ -1537,7 +1533,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op, object_name, 0, inbound_size, inbound, - NULL, ver); + ver); rbd_osd_req_op_destroy(op); From e0b49868d3629708eda593b6739cb78f33ab238a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 9 Jan 2013 14:44:18 -0600 Subject: [PATCH 59/63] rbd: fix type of snap_id in rbd_dev_v2_snap_info() The type of the snap_id local variable is defined with the wrong byte order. Fix that. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a3a11c3c1cac..007b726ea0eb 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2802,7 +2802,7 @@ out: static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which, u64 *snap_size, u64 *snap_features) { - __le64 snap_id; + u64 snap_id; u8 order; int ret; From 98571b5aa776d4a69eadd7d4e5c9d4e69365ab9a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 20 Jan 2013 14:44:42 -0600 Subject: [PATCH 60/63] rbd: small changes A few very minor changes to the rbd code: - RBD_MAX_OPT_LEN is unused, so get rid of it - Consolidate rbd options definitions - Make rbd_segment_name() return pointer to const char Signed-off-by: Alex Elder Reviewed-by: Dan Mick Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 007b726ea0eb..4ed074138f8d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -69,7 +69,6 @@ (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ -#define RBD_MAX_OPT_LEN 1024 #define RBD_SNAP_HEAD_NAME "-" @@ -96,8 +95,6 @@ #define DEV_NAME_LEN 32 #define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) -#define RBD_READ_ONLY_DEFAULT false - /* * block device image metadata (in-memory version) */ @@ -156,10 +153,6 @@ struct rbd_spec { struct kref kref; }; -struct rbd_options { - bool read_only; -}; - /* * an instance of the client. multiple devices may share an rbd client. */ @@ -475,6 +468,12 @@ static match_table_t rbd_opts_tokens = { {-1, NULL} }; +struct rbd_options { + bool read_only; +}; + +#define RBD_READ_ONLY_DEFAULT false + static int parse_rbd_opts_token(char *c, void *private) { struct rbd_options *rbd_opts = private; @@ -773,7 +772,7 @@ static void rbd_header_free(struct rbd_image_header *header) header->snapc = NULL; } -static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) +static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) { char *name; u64 segment; @@ -1338,7 +1337,7 @@ static int rbd_do_op(struct request *rq, struct rbd_req_coll *coll, int coll_index) { - char *seg_name; + const char *seg_name; u64 seg_ofs; u64 seg_len; int ret; From 38901e0f240b634467fb31743365af80a006be33 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 10 Jan 2013 12:56:58 -0600 Subject: [PATCH 61/63] rbd: check for overflow in rbd_get_num_segments() The return type of rbd_get_num_segments() is int, but the values it operates on are u64. Although it's not likely, there's no guarantee the result won't exceed what can be respresented in an int. The function is already designed to return -ERANGE on error, so just add this possible overflow as another reason to return that. Signed-off-by: Alex Elder Reviewed-by: Dan Mick Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4ed074138f8d..58d01e3a0fce 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -820,6 +820,7 @@ static int rbd_get_num_segments(struct rbd_image_header *header, { u64 start_seg; u64 end_seg; + u64 result; if (!len) return 0; @@ -829,7 +830,11 @@ static int rbd_get_num_segments(struct rbd_image_header *header, start_seg = ofs >> header->obj_order; end_seg = (ofs + len - 1) >> header->obj_order; - return end_seg - start_seg + 1; + result = end_seg - start_seg + 1; + if (result > (u64) INT_MAX) + return -ERANGE; + + return (int) result; } /* From c04306471ad93f1daf60771a0373316d4c3494ae Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 18 Jan 2013 12:31:09 -0600 Subject: [PATCH 62/63] rbd: don't retry setting up header watch When an rbd image is initially mapped a watch event is registered so we can do something if the header object changes. The code that does this currently loops if initiating the watch request results in an ERANGE error. The osds will never return ERANGE, so there's no reason to do this loop, so get rid of it. This resolves: http://tracker.newdream.net/issues/3860 Note that the problem this loop was intended to solve is a race between collecting image header information and setting up the watch on the header object. The real fix for that problem is described here: http://tracker.newdream.net/issues/3871 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 58d01e3a0fce..668936381ab0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1474,6 +1474,9 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) struct ceph_osd_req_op *op; int ret = 0; + rbd_assert(start ^ !!rbd_dev->watch_event); + rbd_assert(start ^ !!rbd_dev->watch_request); + if (start) { struct ceph_osd_client *osdc; @@ -1482,8 +1485,6 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) &rbd_dev->watch_event); if (ret < 0) return ret; - } else { - rbd_assert(rbd_dev->watch_request != NULL); } op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH, @@ -3023,22 +3024,6 @@ static void rbd_bus_del_dev(struct rbd_device *rbd_dev) device_unregister(&rbd_dev->dev); } -static int rbd_init_watch_dev(struct rbd_device *rbd_dev) -{ - int ret, rc; - - do { - ret = rbd_req_sync_watch(rbd_dev, 1); - if (ret == -ERANGE) { - rc = rbd_dev_refresh(rbd_dev, NULL); - if (rc < 0) - return rc; - } - } while (ret == -ERANGE); - - return ret; -} - static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0); /* @@ -3584,7 +3569,7 @@ static int rbd_dev_probe_finish(struct rbd_device *rbd_dev) if (ret) goto err_out_bus; - ret = rbd_init_watch_dev(rbd_dev); + ret = rbd_req_sync_watch(rbd_dev, 1); if (ret) goto err_out_bus; From 1ec3911dbd19076bcdfe5540096ff67f91a6ec02 Mon Sep 17 00:00:00 2001 From: Cong Ding Date: Fri, 25 Jan 2013 17:48:59 -0600 Subject: [PATCH 63/63] libceph: fix undefined behavior when using snprintf() The variable "str" is used as both the source and destination in function snprintf(), which is undefined behavior based on C11. The original description in C11 is: "If copying takes place between objects that overlap, the behavior is undefined." And, the function of ceph_osdmap_state_str() is to return the osdmap state, so it should return "doesn't exist" when all the conditions are not satisfied. I fix it in this patch. [elder@inktank.com: shortened the commit message] Signed-off-by: Cong Ding Reviewed-by: Alex Elder --- net/ceph/osdmap.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 369f03ba9ee5..3c61e21611d3 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -13,26 +13,18 @@ char *ceph_osdmap_state_str(char *str, int len, int state) { - int flag = 0; - if (!len) - goto done; + return str; - *str = '\0'; - if (state) { - if (state & CEPH_OSD_EXISTS) { - snprintf(str, len, "exists"); - flag = 1; - } - if (state & CEPH_OSD_UP) { - snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""), - "up"); - flag = 1; - } - } else { + if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP)) + snprintf(str, len, "exists, up"); + else if (state & CEPH_OSD_EXISTS) + snprintf(str, len, "exists"); + else if (state & CEPH_OSD_UP) + snprintf(str, len, "up"); + else snprintf(str, len, "doesn't exist"); - } -done: + return str; }