From 8f97514b423a0983e4c600099882a9c6613142d2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 26 Oct 2016 07:26:40 -0400 Subject: [PATCH 01/19] nfsd: more robust allocation failure handling in nfsd_reply_cache_init Currently, we try to allocate the cache as a single, large chunk, which can fail if no big chunks of memory are available. We _do_ try to size it according to the amount of memory in the box, but if the server is started well after boot time, then the allocation can fail due to memory fragmentation. Fall back to doing a vzalloc if the kcalloc fails, and switch the shutdown code to do a kvfree to handle freeing correctly. Reported-by: Olaf Hering Cc: Linus Torvalds Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfscache.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 54cde9a5864e..d6b97b424ad1 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -174,8 +175,12 @@ int nfsd_reply_cache_init(void) goto out_nomem; drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL); - if (!drc_hashtbl) - goto out_nomem; + if (!drc_hashtbl) { + drc_hashtbl = vzalloc(hashsize * sizeof(*drc_hashtbl)); + if (!drc_hashtbl) + goto out_nomem; + } + for (i = 0; i < hashsize; i++) { INIT_LIST_HEAD(&drc_hashtbl[i].lru_head); spin_lock_init(&drc_hashtbl[i].cache_lock); @@ -204,7 +209,7 @@ void nfsd_reply_cache_shutdown(void) } } - kfree (drc_hashtbl); + kvfree(drc_hashtbl); drc_hashtbl = NULL; drc_hashsize = 0; From 851238a22f3be7323feed6a62a6882c284b2f0a5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 20 Oct 2016 12:21:34 -0400 Subject: [PATCH 02/19] nfsd: fix error handling for clients that fail to return the layout Currently, when the client continually returns NFS4ERR_DELAY on a CB_LAYOUTRECALL, we'll give up trying to retransmit after two lease periods, but leave the layout in place. What we really need to do here is fence the client in this case. Have it fall through to that code in that case instead of into the NFS4ERR_NOMATCHING_LAYOUT case. Signed-off-by: Jeff Layton Reviewed-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4layouts.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 42aace4fc4c8..596205d939a1 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -686,10 +686,6 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) return 0; } /* Fallthrough */ - case -NFS4ERR_NOMATCHING_LAYOUT: - trace_layout_recall_done(&ls->ls_stid.sc_stateid); - task->tk_status = 0; - return 1; default: /* * Unknown error or non-responding client, we'll need to fence. @@ -702,6 +698,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) else nfsd4_cb_layout_fail(ls); return -1; + case -NFS4ERR_NOMATCHING_LAYOUT: + trace_layout_recall_done(&ls->ls_stid.sc_stateid); + task->tk_status = 0; + return 1; } } From 916d2d844afd09dc8cf144e0e9dc98daa9dfc34a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 18 Oct 2016 14:18:40 -0400 Subject: [PATCH 03/19] nfsd: clean up supported attribute handling Minor cleanup, no change in behavior. Provide helpers for some common attribute bitmap operations. Drop some comments that just echo the code. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 43 +++++++++++-------------------------------- fs/nfsd/nfs4xdr.c | 38 ++++++++++++++++++++++++-------------- fs/nfsd/nfsd.h | 23 ++++++++--------------- 3 files changed, 43 insertions(+), 61 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index abb09b580389..e901cf124e9f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -96,33 +96,12 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct dentry *dentry = cstate->current_fh.fh_dentry; - /* - * Check about attributes are supported by the NFSv4 server or not. - * According to spec, unsupported attributes return ERR_ATTRNOTSUPP. - */ - if ((bmval[0] & ~nfsd_suppattrs0(cstate->minorversion)) || - (bmval[1] & ~nfsd_suppattrs1(cstate->minorversion)) || - (bmval[2] & ~nfsd_suppattrs2(cstate->minorversion))) + if (!nfsd_attrs_supported(cstate->minorversion, bmval)) return nfserr_attrnotsupp; - - /* - * Check FATTR4_WORD0_ACL can be supported - * in current environment or not. - */ - if (bmval[0] & FATTR4_WORD0_ACL) { - if (!IS_POSIXACL(d_inode(dentry))) - return nfserr_attrnotsupp; - } - - /* - * According to spec, read-only attributes return ERR_INVAL. - */ - if (writable) { - if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) || - (bmval[2] & ~writable[2])) - return nfserr_inval; - } - + if ((bmval[0] & FATTR4_WORD0_ACL) && !IS_POSIXACL(d_inode(dentry))) + return nfserr_attrnotsupp; + if (writable && !bmval_is_subset(bmval, writable)) + return nfserr_inval; return nfs_ok; } @@ -695,9 +674,9 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) return nfserr_inval; - getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion); - getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); - getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); + getattr->ga_bmval[0] &= nfsd_suppattrs[cstate->minorversion][0]; + getattr->ga_bmval[1] &= nfsd_suppattrs[cstate->minorversion][1]; + getattr->ga_bmval[2] &= nfsd_suppattrs[cstate->minorversion][2]; getattr->ga_fhp = &cstate->current_fh; return nfs_ok; @@ -799,9 +778,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) return nfserr_inval; - readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion); - readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); - readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); + readdir->rd_bmval[0] &= nfsd_suppattrs[cstate->minorversion][0]; + readdir->rd_bmval[1] &= nfsd_suppattrs[cstate->minorversion][1]; + readdir->rd_bmval[2] &= nfsd_suppattrs[cstate->minorversion][2]; if ((cookie == 1) || (cookie == 2) || (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c2d2895a1ec1..a99bbb88c6e1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -57,6 +57,20 @@ #define NFSDDBG_FACILITY NFSDDBG_XDR +u32 nfsd_suppattrs[3][3] = { + {NFSD4_SUPPORTED_ATTRS_WORD0, + NFSD4_SUPPORTED_ATTRS_WORD1, + NFSD4_SUPPORTED_ATTRS_WORD2}, + + {NFSD4_1_SUPPORTED_ATTRS_WORD0, + NFSD4_1_SUPPORTED_ATTRS_WORD1, + NFSD4_1_SUPPORTED_ATTRS_WORD2}, + + {NFSD4_1_SUPPORTED_ATTRS_WORD0, + NFSD4_1_SUPPORTED_ATTRS_WORD1, + NFSD4_2_SUPPORTED_ATTRS_WORD2}, +}; + /* * As per referral draft, the fsid for a referral MUST be different from the fsid of the containing * directory in order to indicate to the client that a filesystem boundary is present @@ -2340,9 +2354,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); - BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion)); - BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion)); - BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion)); + BUG_ON(!nfsd_attrs_supported(minorversion, bmval)); if (exp->ex_fslocs.migrated) { status = fattr_handle_absent_fs(&bmval0, &bmval1, &bmval2, &rdattr_err); @@ -2409,29 +2421,27 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, p++; /* to be backfilled later */ if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { - u32 word0 = nfsd_suppattrs0(minorversion); - u32 word1 = nfsd_suppattrs1(minorversion); - u32 word2 = nfsd_suppattrs2(minorversion); + u32 *supp = nfsd_suppattrs[minorversion]; if (!IS_POSIXACL(dentry->d_inode)) - word0 &= ~FATTR4_WORD0_ACL; + supp[0] &= ~FATTR4_WORD0_ACL; if (!contextsupport) - word2 &= ~FATTR4_WORD2_SECURITY_LABEL; - if (!word2) { + supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + if (!supp[2]) { p = xdr_reserve_space(xdr, 12); if (!p) goto out_resource; *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(word0); - *p++ = cpu_to_be32(word1); + *p++ = cpu_to_be32(supp[0]); + *p++ = cpu_to_be32(supp[1]); } else { p = xdr_reserve_space(xdr, 16); if (!p) goto out_resource; *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(word0); - *p++ = cpu_to_be32(word1); - *p++ = cpu_to_be32(word2); + *p++ = cpu_to_be32(supp[0]); + *p++ = cpu_to_be32(supp[1]); + *p++ = cpu_to_be32(supp[2]); } } if (bmval0 & FATTR4_WORD0_TYPE) { diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 9446849888d5..a72d4163273a 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -361,25 +361,18 @@ void nfsd_lockd_shutdown(void); (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ NFSD4_2_SECURITY_ATTRS) -static inline u32 nfsd_suppattrs0(u32 minorversion) +extern u32 nfsd_suppattrs[3][3]; + +static inline bool bmval_is_subset(u32 *bm1, u32 *bm2) { - return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0 - : NFSD4_SUPPORTED_ATTRS_WORD0; + return !((bm1[0] & ~bm2[0]) || + (bm1[1] & ~bm2[1]) || + (bm1[2] & ~bm2[2])); } -static inline u32 nfsd_suppattrs1(u32 minorversion) +static inline bool nfsd_attrs_supported(u32 minorversion, u32 *bmval) { - return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD1 - : NFSD4_SUPPORTED_ATTRS_WORD1; -} - -static inline u32 nfsd_suppattrs2(u32 minorversion) -{ - switch (minorversion) { - default: return NFSD4_2_SUPPORTED_ATTRS_WORD2; - case 1: return NFSD4_1_SUPPORTED_ATTRS_WORD2; - case 0: return NFSD4_SUPPORTED_ATTRS_WORD2; - } + return bmval_is_subset(bmval, nfsd_suppattrs[minorversion]); } /* These will return ERR_INVAL if specified in GETATTR or READDIR. */ From e864c189e1d63f2f6a052e296f0da0616d88b625 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 10 Jun 2016 16:56:05 -0400 Subject: [PATCH 04/19] nfsd: catch errors in decode_fattr earlier 3c8e03166ae2 "NFSv4: do exact check about attribute specified" fixed some handling of unsupported-attribute errors, but it also delayed checking for unwriteable attributes till after we decode them. This could lead to odd behavior in the case a client attemps to set an attribute we don't know about followed by one we try to parse. In that case the parser for the known attribute will attempt to parse the unknown attribute. It should fail in some safe way, but the error might at least be incorrect (probably bad_xdr instead of inval). So, it's better to do that check at the start. As far as I know this doesn't cause any problems with current clients but it might be a minor issue e.g. if we encounter a future client that supports a new attribute that we currently don't. Cc: Yu Zhiguo Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 15 +++++++++------ fs/nfsd/nfsd.h | 6 +++++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index a99bbb88c6e1..281739e1d477 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -310,6 +310,14 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if ((status = nfsd4_decode_bitmap(argp, bmval))) return status; + if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0 + || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1 + || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2) { + if (nfsd_attrs_supported(argp->minorversion, bmval)) + return nfserr_inval; + return nfserr_attrnotsupp; + } + READ_BUF(4); expected_len = be32_to_cpup(p++); @@ -449,12 +457,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, return nfserr_jukebox; } #endif - - if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0 - || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1 - || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2) - READ_BUF(expected_len - len); - else if (len != expected_len) + if (len != expected_len) goto xdr_error; DECODE_TAIL; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index a72d4163273a..7155239b2908 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -379,7 +379,11 @@ static inline bool nfsd_attrs_supported(u32 minorversion, u32 *bmval) #define NFSD_WRITEONLY_ATTRS_WORD1 \ (FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) -/* These are the only attrs allowed in CREATE/OPEN/SETATTR. */ +/* + * These are the only attrs allowed in CREATE/OPEN/SETATTR. Don't add + * a writeable attribute here without also adding code to parse it to + * nfsd4_decode_fattr(). + */ #define NFSD_WRITEABLE_ATTRS_WORD0 \ (FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL) #define NFSD_WRITEABLE_ATTRS_WORD1 \ From 56094edd17971372c7fea078ab41315ebf6214d0 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 26 Oct 2016 16:03:00 -0400 Subject: [PATCH 05/19] sunrpc: GFP_KERNEL should be GFP_NOFS in crypto code Writes may depend on the auth_gss crypto code, so we shouldn't be allocating with GFP_KERNEL there. This still leaves some crypto_alloc_* calls which end up doing GFP_KERNEL allocations in the crypto code. Those could probably done at crypto import time. Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/gss_krb5_crypto.c | 12 ++++++------ net/sunrpc/auth_gss/gss_krb5_mech.c | 3 +-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 90115ceefd49..fb39284ec174 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -200,7 +200,7 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, if (IS_ERR(hmac_md5)) goto out_free_md5; - req = ahash_request_alloc(md5, GFP_KERNEL); + req = ahash_request_alloc(md5, GFP_NOFS); if (!req) goto out_free_hmac_md5; @@ -230,7 +230,7 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, goto out; ahash_request_free(req); - req = ahash_request_alloc(hmac_md5, GFP_KERNEL); + req = ahash_request_alloc(hmac_md5, GFP_NOFS); if (!req) goto out_free_hmac_md5; @@ -299,7 +299,7 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, if (IS_ERR(tfm)) goto out_free_cksum; - req = ahash_request_alloc(tfm, GFP_KERNEL); + req = ahash_request_alloc(tfm, GFP_NOFS); if (!req) goto out_free_ahash; @@ -397,7 +397,7 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen, goto out_free_cksum; checksumlen = crypto_ahash_digestsize(tfm); - req = ahash_request_alloc(tfm, GFP_KERNEL); + req = ahash_request_alloc(tfm, GFP_NOFS); if (!req) goto out_free_ahash; @@ -963,7 +963,7 @@ krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher, } desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), - GFP_KERNEL); + GFP_NOFS); if (!desc) { dprintk("%s: failed to allocate shash descriptor for '%s'\n", __func__, kctx->gk5e->cksum_name); @@ -1030,7 +1030,7 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher, } desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), - GFP_KERNEL); + GFP_NOFS); if (!desc) { dprintk("%s: failed to allocate shash descriptor for '%s'\n", __func__, kctx->gk5e->cksum_name); diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 60595835317a..7bb2514aadd9 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -451,8 +451,7 @@ context_derive_keys_rc4(struct krb5_ctx *ctx) goto out_err_free_hmac; - desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), - GFP_KERNEL); + desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac), GFP_NOFS); if (!desc) { dprintk("%s: failed to allocate hash descriptor for '%s'\n", __func__, ctx->gk5e->cksum_name); From 88382036674770173128417e4c09e9e549f82d54 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 14 Nov 2016 11:13:43 -0500 Subject: [PATCH 06/19] nfsd: update workqueue creation No real change in functionality, but the old interface seems to be deprecated. We don't actually care about ordering necessarily, but we do depend on running at most one work item at a time: nfsd4_process_cb_update() assumes that no other thread is running it, and that no new callbacks are starting while it's running. Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 211dc2aed8e1..eb78109d666c 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1061,7 +1061,7 @@ static const struct rpc_call_ops nfsd4_cb_ops = { int nfsd4_create_callback_queue(void) { - callback_wq = create_singlethread_workqueue("nfsd4_callbacks"); + callback_wq = alloc_ordered_workqueue("nfsd4_callbacks", 0); if (!callback_wq) return -ENOMEM; return 0; From 7ba630f54ca6404351cebea03fbd3c95760b9b02 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 28 Aug 2016 22:36:55 +0200 Subject: [PATCH 07/19] nfsd: constify reply_cache_stats_operations structure reply_cache_stats_operations, of type struct file_operations, is never modified, so declare it as const. Done with the help of Coccinelle. Signed-off-by: Julia Lawall Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 36b2af931e06..be13063bcd66 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -217,7 +217,7 @@ static const struct file_operations pool_stats_operations = { .release = nfsd_pool_stats_release, }; -static struct file_operations reply_cache_stats_operations = { +static const struct file_operations reply_cache_stats_operations = { .open = nfsd_reply_cache_stats_open, .read = seq_read, .llseek = seq_lseek, From 1b9f700b8cfc31089e2dfa5d0905c52fd4529b50 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Nov 2016 11:04:26 -0500 Subject: [PATCH 08/19] svcrdma: Clear xpt_bc_xps in xprt_setup_rdma_bc() error exit arm Logic copied from xs_setup_bc_tcp(). Fixes: 39a9beab5acb ('rpc: share one xps between all backchannels') Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 20027f8de129..6035c5a380a6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -359,6 +359,7 @@ xprt_setup_rdma_bc(struct xprt_create *args) out_fail: xprt_rdma_free_addresses(xprt); args->bc_xprt->xpt_bc_xprt = NULL; + args->bc_xprt->xpt_bc_xps = NULL; xprt_put(xprt); xprt_free(xprt); return ERR_PTR(-EINVAL); From 4d712ef1db05c3aa5c3b690a50c37ebad584c53f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Nov 2016 11:04:34 -0500 Subject: [PATCH 09/19] svcauth_gss: Close connection when dropping an incoming message S5.3.3.1 of RFC 2203 requires that an incoming GSS-wrapped message whose sequence number lies outside the current window is dropped. The rationale is: The reason for discarding requests silently is that the server is unable to determine if the duplicate or out of range request was due to a sequencing problem in the client, network, or the operating system, or due to some quirk in routing, or a replay attack by an intruder. Discarding the request allows the client to recover after timing out, if indeed the duplication was unintentional or well intended. However, clients may rely on the server dropping the connection to indicate that a retransmit is needed. Without a connection reset, a client can wait forever without retransmitting, and the workload just stops dead. I've reproduced this behavior by running xfstests generic/323 on an NFSv4.0 mount with proto=rdma and sec=krb5i. To address this issue, have the server close the connection when it silently discards an incoming message due to a GSS sequence number problem. There are a few other places where the server will never reply. Change those spots in a similar fashion. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/svcauth_gss.c | 2 +- net/sunrpc/svc.c | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 45662d7f0943..886e9d381771 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1548,7 +1548,7 @@ complete: ret = SVC_COMPLETE; goto out; drop: - ret = SVC_DROP; + ret = SVC_CLOSE; out: if (rsci) cache_put(&rsci->h, sn->rsc_cache); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 7c8070ec93c8..75f290bddca1 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1155,8 +1155,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) case SVC_DENIED: goto err_bad_auth; case SVC_CLOSE: - if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags)) - svc_close_xprt(rqstp->rq_xprt); + goto close; case SVC_DROP: goto dropit; case SVC_COMPLETE: @@ -1246,7 +1245,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) sendit: if (svc_authorise(rqstp)) - goto dropit; + goto close; return 1; /* Caller can now send it */ dropit: @@ -1254,11 +1253,16 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) dprintk("svc: svc_process dropit\n"); return 0; + close: + if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags)) + svc_close_xprt(rqstp->rq_xprt); + dprintk("svc: svc_process close\n"); + return 0; + err_short_len: svc_printk(rqstp, "short len %Zd, dropping request\n", argv->iov_len); - - goto dropit; /* drop request */ + goto close; err_bad_rpc: serv->sv_stats->rpcbadfmt++; From 5fdca6531434c1c1b2d584873afdda52e5ad448c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Nov 2016 11:04:42 -0500 Subject: [PATCH 10/19] svcrdma: Renovate sendto chunk list parsing The current sendto code appears to support clients that provide only one of a Read list, a Write list, or a Reply chunk. My reading of that code is that it doesn't support the following cases: - Read list + Write list - Read list + Reply chunk - Write list + Reply chunk - Read list + Write list + Reply chunk The protocol allows more than one Read or Write chunk in those lists. Some clients do send a Read list and Reply chunk simultaneously. NFSv4 WRITE uses a Read list for the data payload, and a Reply chunk because the GETATTR result in the reply can contain a large object like an ACL. Generalize one of the sendto code paths needed to support all of the above cases, and attempt to ensure that only one pass is done through the RPC Call's transport header to gather chunk list information for building the reply. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 2 - net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 14 ++++ net/sunrpc/xprtrdma/svc_rdma_sendto.c | 92 +++++++------------------ 3 files changed, 39 insertions(+), 69 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index cc3ae16eac68..6aef63b9a669 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -236,8 +236,6 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, struct svc_rdma_req_map *, bool); extern int svc_rdma_sendto(struct svc_rqst *); -extern struct rpcrdma_read_chunk * - svc_rdma_get_read_chunk(struct rpcrdma_msg *); extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, int); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index ad1df979b3f0..873c2a938d35 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -415,6 +415,20 @@ done: return 1; } +/* Returns the address of the first read chunk or if no read chunk + * is present + */ +static struct rpcrdma_read_chunk * +svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp) +{ + struct rpcrdma_read_chunk *ch = + (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + + if (ch->rc_discrim == xdr_zero) + return NULL; + return ch; +} + static int rdma_read_chunks(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp, diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index f5a91edcd233..0a58d4062f2f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -153,76 +153,35 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, return dma_addr; } -/* Returns the address of the first read chunk or if no read chunk - * is present +/* Parse the RPC Call's transport header. */ -struct rpcrdma_read_chunk * -svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp) +static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp, + struct rpcrdma_write_array **write, + struct rpcrdma_write_array **reply) { - struct rpcrdma_read_chunk *ch = - (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + __be32 *p; - if (ch->rc_discrim == xdr_zero) - return NULL; - return ch; -} + p = (__be32 *)&rmsgp->rm_body.rm_chunks[0]; -/* Returns the address of the first read write array element or - * if no write array list is present - */ -static struct rpcrdma_write_array * -svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp) -{ - if (rmsgp->rm_body.rm_chunks[0] != xdr_zero || - rmsgp->rm_body.rm_chunks[1] == xdr_zero) - return NULL; - return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1]; -} + /* Read list */ + while (*p++ != xdr_zero) + p += 5; -/* Returns the address of the first reply array element or if no - * reply array is present - */ -static struct rpcrdma_write_array * -svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp, - struct rpcrdma_write_array *wr_ary) -{ - struct rpcrdma_read_chunk *rch; - struct rpcrdma_write_array *rp_ary; - - /* XXX: Need to fix when reply chunk may occur with read list - * and/or write list. - */ - if (rmsgp->rm_body.rm_chunks[0] != xdr_zero || - rmsgp->rm_body.rm_chunks[1] != xdr_zero) - return NULL; - - rch = svc_rdma_get_read_chunk(rmsgp); - if (rch) { - while (rch->rc_discrim != xdr_zero) - rch++; - - /* The reply chunk follows an empty write array located - * at 'rc_position' here. The reply array is at rc_target. - */ - rp_ary = (struct rpcrdma_write_array *)&rch->rc_target; - goto found_it; + /* Write list */ + if (*p != xdr_zero) { + *write = (struct rpcrdma_write_array *)p; + while (*p++ != xdr_zero) + p += 1 + be32_to_cpu(*p) * 4; + } else { + *write = NULL; + p++; } - if (wr_ary) { - int chunk = be32_to_cpu(wr_ary->wc_nchunks); - - rp_ary = (struct rpcrdma_write_array *) - &wr_ary->wc_array[chunk].wc_target.rs_length; - goto found_it; - } - - /* No read list, no write list */ - rp_ary = (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[2]; - - found_it: - if (rp_ary->wc_discrim == xdr_zero) - return NULL; - return rp_ary; + /* Reply chunk */ + if (*p != xdr_zero) + *reply = (struct rpcrdma_write_array *)p; + else + *reply = NULL; } /* RPC-over-RDMA Version One private extension: Remote Invalidation. @@ -244,8 +203,8 @@ static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp, inv_rkey = 0; - rd_ary = svc_rdma_get_read_chunk(rdma_argp); - if (rd_ary) { + rd_ary = (struct rpcrdma_read_chunk *)&rdma_argp->rm_body.rm_chunks[0]; + if (rd_ary->rc_discrim != xdr_zero) { inv_rkey = be32_to_cpu(rd_ary->rc_target.rs_handle); goto out; } @@ -622,8 +581,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) * places this at the start of page 0. */ rdma_argp = page_address(rqstp->rq_pages[0]); - wr_ary = svc_rdma_get_write_array(rdma_argp); - rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary); + svc_rdma_get_write_arrays(rdma_argp, &wr_ary, &rp_ary); inv_rkey = 0; if (rdma->sc_snd_w_inv) From e4eb42cecc6dc546aac888ee4913d59121e886ee Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Nov 2016 11:04:50 -0500 Subject: [PATCH 11/19] svcrdma: Remove BH-disabled spin locking in svc_rdma_send() svcrdma's current SQ accounting algorithm takes sc_lock and disables bottom-halves while posting all RDMA Read, Write, and Send WRs. This is relatively heavyweight serialization. And note that Write and Send are already fully serialized by the xpt_mutex. Using a single atomic_t should be all that is necessary to guarantee that ib_post_send() is called only when there is enough space on the send queue. This is what the other RDMA-enabled storage targets do. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 2 +- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 7 ++++++- net/sunrpc/xprtrdma/svc_rdma_transport.c | 25 ++++++++++-------------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 6aef63b9a669..601cb07aa746 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -139,7 +139,7 @@ struct svcxprt_rdma { int sc_max_sge_rd; /* max sge for read target */ bool sc_snd_w_inv; /* OK to use Send With Invalidate */ - atomic_t sc_sq_count; /* Number of SQ WR on queue */ + atomic_t sc_sq_avail; /* SQEs ready to be consumed */ unsigned int sc_sq_depth; /* Depth of SQ */ unsigned int sc_rq_depth; /* Depth of RQ */ u32 sc_max_requests; /* Forward credits */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 0a58d4062f2f..30eeab527bd0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -594,7 +594,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) goto err0; inline_bytes = rqstp->rq_res.len; - /* Create the RDMA response header */ + /* Create the RDMA response header. xprt->xpt_mutex, + * acquired in svc_send(), serializes RPC replies. The + * code path below that inserts the credit grant value + * into each transport header runs only inside this + * critical section. + */ ret = -ENOMEM; res_page = alloc_page(GFP_KERNEL); if (!res_page) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 6864fb967038..da990d7f8b20 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -434,7 +434,7 @@ static void svc_rdma_send_wc_common(struct svcxprt_rdma *xprt, goto err; out: - atomic_dec(&xprt->sc_sq_count); + atomic_inc(&xprt->sc_sq_avail); wake_up(&xprt->sc_send_wait); return; @@ -1008,6 +1008,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests; newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth; + atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); if (!svc_rdma_prealloc_ctxts(newxprt)) goto errout; @@ -1333,15 +1334,13 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) /* If the SQ is full, wait until an SQ entry is available */ while (1) { - spin_lock_bh(&xprt->sc_lock); - if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) { - spin_unlock_bh(&xprt->sc_lock); + if ((atomic_sub_return(wr_count, &xprt->sc_sq_avail) < 0)) { atomic_inc(&rdma_stat_sq_starve); /* Wait until SQ WR available if SQ still full */ + atomic_add(wr_count, &xprt->sc_sq_avail); wait_event(xprt->sc_send_wait, - atomic_read(&xprt->sc_sq_count) < - xprt->sc_sq_depth); + atomic_read(&xprt->sc_sq_avail) > wr_count); if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) return -ENOTCONN; continue; @@ -1351,21 +1350,17 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) svc_xprt_get(&xprt->sc_xprt); /* Bump used SQ WR count and post */ - atomic_add(wr_count, &xprt->sc_sq_count); ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); if (ret) { set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - atomic_sub(wr_count, &xprt->sc_sq_count); for (i = 0; i < wr_count; i ++) svc_xprt_put(&xprt->sc_xprt); - dprintk("svcrdma: failed to post SQ WR rc=%d, " - "sc_sq_count=%d, sc_sq_depth=%d\n", - ret, atomic_read(&xprt->sc_sq_count), - xprt->sc_sq_depth); - } - spin_unlock_bh(&xprt->sc_lock); - if (ret) + dprintk("svcrdma: failed to post SQ WR rc=%d\n", ret); + dprintk(" sc_sq_avail=%d, sc_sq_depth=%d\n", + atomic_read(&xprt->sc_sq_avail), + xprt->sc_sq_depth); wake_up(&xprt->sc_send_wait); + } break; } return ret; From dd6fd213b05e7a1f590b470500343dd97c3a32c1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Nov 2016 11:04:58 -0500 Subject: [PATCH 12/19] svcrdma: Remove DMA map accounting Clean up: sc_dma_used is not required for correct operation. It is simply a debugging tool to report when svcrdma has leaked DMA maps. However, manipulating an atomic has a measurable CPU cost, and DMA map accounting specific to svcrdma will be meaningless once svcrdma is converted to use the new generic r/w API. A similar kind of debug accounting can be done simply by enabling the IOMMU or by using CONFIG_DMA_API_DEBUG, CONFIG_IOMMU_DEBUG, and CONFIG_IOMMU_LEAK. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 2 -- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 1 - net/sunrpc/xprtrdma/svc_rdma_transport.c | 13 +++---------- 3 files changed, 3 insertions(+), 13 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 601cb07aa746..43d7c709d117 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -148,7 +148,6 @@ struct svcxprt_rdma { struct ib_pd *sc_pd; - atomic_t sc_dma_used; spinlock_t sc_ctxt_lock; struct list_head sc_ctxts; int sc_ctxt_used; @@ -200,7 +199,6 @@ static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma, struct svc_rdma_op_ctxt *ctxt) { ctxt->mapped_sges++; - atomic_inc(&rdma->sc_dma_used); } /* svc_rdma_backchannel.c */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 873c2a938d35..283246e0afc2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -279,7 +279,6 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, frmr->sg); return -ENOMEM; } - atomic_inc(&xprt->sc_dma_used); n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE); if (unlikely(n != frmr->sg_nents)) { diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index da990d7f8b20..a613ebd1dd85 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -224,25 +224,22 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) struct svcxprt_rdma *xprt = ctxt->xprt; struct ib_device *device = xprt->sc_cm_id->device; u32 lkey = xprt->sc_pd->local_dma_lkey; - unsigned int i, count; + unsigned int i; - for (count = 0, i = 0; i < ctxt->mapped_sges; i++) { + for (i = 0; i < ctxt->mapped_sges; i++) { /* * Unmap the DMA addr in the SGE if the lkey matches * the local_dma_lkey, otherwise, ignore it since it is * an FRMR lkey and will be unmapped later when the * last WR that uses it completes. */ - if (ctxt->sge[i].lkey == lkey) { - count++; + if (ctxt->sge[i].lkey == lkey) ib_dma_unmap_page(device, ctxt->sge[i].addr, ctxt->sge[i].length, ctxt->direction); - } } ctxt->mapped_sges = 0; - atomic_sub(count, &xprt->sc_dma_used); } void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) @@ -944,7 +941,6 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma, if (frmr) { ib_dma_unmap_sg(rdma->sc_cm_id->device, frmr->sg, frmr->sg_nents, frmr->direction); - atomic_dec(&rdma->sc_dma_used); spin_lock_bh(&rdma->sc_frmr_q_lock); WARN_ON_ONCE(!list_empty(&frmr->frmr_list)); list_add(&frmr->frmr_list, &rdma->sc_frmr_q); @@ -1256,9 +1252,6 @@ static void __svc_rdma_free(struct work_struct *work) if (rdma->sc_ctxt_used != 0) pr_err("svcrdma: ctxt still in use? (%d)\n", rdma->sc_ctxt_used); - if (atomic_read(&rdma->sc_dma_used) != 0) - pr_err("svcrdma: dma still in use? (%d)\n", - atomic_read(&rdma->sc_dma_used)); /* Final put of backchannel client transport */ if (xprt->xpt_bc_xprt) { From 96a58f9c1921f28fab5ed008be791adacb540cc6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Nov 2016 11:05:07 -0500 Subject: [PATCH 13/19] svcrdma: Remove svc_rdma_op_ctxt::wc_status Clean up: Completion status is already reported in the individual completion handlers. Save a few bytes in struct svc_rdma_op_ctxt. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 - net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 4 ++-- net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 43d7c709d117..757fb963696c 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -79,7 +79,6 @@ struct svc_rdma_op_ctxt { struct ib_cqe reg_cqe; struct ib_cqe inv_cqe; struct list_head dto_q; - enum ib_wc_status wc_status; u32 byte_len; u32 position; struct svcxprt_rdma *xprt; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 283246e0afc2..428ab4ef77f3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -640,8 +640,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto defer; goto out; } - dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n", - ctxt, rdma_xprt, rqstp, ctxt->wc_status); + dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p\n", + ctxt, rdma_xprt, rqstp); atomic_inc(&rdma_stat_recv); /* Build up the XDR from the receive buffers. */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index a613ebd1dd85..f9d961c00dd2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -393,7 +393,6 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wc->wr_cqe and wc->status are reliable */ ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); - ctxt->wc_status = wc->status; svc_rdma_unmap_dma(ctxt); if (wc->status != IB_WC_SUCCESS) From 9a16a34d2146e1bcfad6c6ae6d214d645b424265 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Nov 2016 11:05:15 -0500 Subject: [PATCH 14/19] svcrdma: Remove unused variables in xprt_rdma_bc_allocate() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up. /linux-2.6/net/sunrpc/xprtrdma/svc_rdma_backchannel.c: In function ‘xprt_rdma_bc_allocate’: linux-2.6/net/sunrpc/xprtrdma/svc_rdma_backchannel.c:169:23: warning: variable ‘rdma’ set but not used [-Wunused-but-set-variable] struct svcxprt_rdma *rdma; ^ Fixes: 5d252f90a800 ("svcrdma: Add class for RDMA backwards ...") Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 6035c5a380a6..288e35c2d8f4 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -164,13 +164,9 @@ static int xprt_rdma_bc_allocate(struct rpc_task *task) { struct rpc_rqst *rqst = task->tk_rqstp; - struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; size_t size = rqst->rq_callsize; - struct svcxprt_rdma *rdma; struct page *page; - rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); - if (size > PAGE_SIZE) { WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n", size); From f5426d37f654f59fb83e6ff3207cc2b157742670 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Nov 2016 11:05:23 -0500 Subject: [PATCH 15/19] svcrdma: Remove unused variable in rdma_copy_tail() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up. linux-2.6/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c: In function ‘rdma_copy_tail’: linux-2.6/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c:376:6: warning: variable ‘ret’ set but not used [-Wunused-but-set-variable] int ret; ^ Fixes: a97c331f9aa9 ("svcrdma: Handle additional inline content") Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 428ab4ef77f3..57d35fbb1c28 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -373,9 +373,7 @@ rdma_copy_tail(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head, u32 position, u32 byte_count, u32 page_offset, int page_no) { char *srcp, *destp; - int ret; - ret = 0; srcp = head->arg.head[0].iov_base + position; byte_count = head->arg.head[0].iov_len - position; if (byte_count > PAGE_SIZE) { From 07257450208b82b75103abcca34c6678f5ce51ad Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Nov 2016 11:05:31 -0500 Subject: [PATCH 16/19] svcrdma: Break up dprintk format in svc_rdma_accept() The current code results in: Nov 7 14:50:19 klimt kernel: svcrdma: newxprt->sc_cm_id=ffff88085590c800, newxprt->sc_pd=ffff880852a7ce00#012 cm_id->device=ffff88084dd20000, sc_pd->device=ffff88084dd20000#012 cap.max_send_wr = 272#012 cap.max_recv_wr = 34#012 cap.max_send_sge = 32#012 cap.max_recv_sge = 32 Nov 7 14:50:19 klimt kernel: svcrdma: new connection ffff880855908000 accepted with the following attributes:#012 local_ip : 10.0.0.5#012 local_port#011 : 20049#012 remote_ip : 10.0.0.2#012 remote_port : 59909#012 max_sge : 32#012 max_sge_rd : 30#012 sq_depth : 272#012 max_requests : 32#012 ord : 16 Split up the output over multiple dprintks and take the opportunity to fix the display of IPv6 addresses. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 55 ++++++++---------------- 1 file changed, 18 insertions(+), 37 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index f9d961c00dd2..bdbc248e1c14 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -41,6 +41,7 @@ */ #include +#include #include #include #include @@ -966,6 +967,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct rpcrdma_connect_private pmsg; struct ib_qp_init_attr qp_attr; struct ib_device *dev; + struct sockaddr *sap; unsigned int i; int ret = 0; @@ -1046,18 +1048,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.qp_type = IB_QPT_RC; qp_attr.send_cq = newxprt->sc_sq_cq; qp_attr.recv_cq = newxprt->sc_rq_cq; - dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n" - " cm_id->device=%p, sc_pd->device=%p\n" - " cap.max_send_wr = %d\n" - " cap.max_recv_wr = %d\n" - " cap.max_send_sge = %d\n" - " cap.max_recv_sge = %d\n", - newxprt->sc_cm_id, newxprt->sc_pd, - dev, newxprt->sc_pd->device, - qp_attr.cap.max_send_wr, - qp_attr.cap.max_recv_wr, - qp_attr.cap.max_send_sge, - qp_attr.cap.max_recv_sge); + dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n", + newxprt->sc_cm_id, newxprt->sc_pd); + dprintk(" cap.max_send_wr = %d, cap.max_recv_wr = %d\n", + qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr); + dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n", + qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge); ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); if (ret) { @@ -1140,31 +1136,16 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) goto errout; } - dprintk("svcrdma: new connection %p accepted with the following " - "attributes:\n" - " local_ip : %pI4\n" - " local_port : %d\n" - " remote_ip : %pI4\n" - " remote_port : %d\n" - " max_sge : %d\n" - " max_sge_rd : %d\n" - " sq_depth : %d\n" - " max_requests : %d\n" - " ord : %d\n", - newxprt, - &((struct sockaddr_in *)&newxprt->sc_cm_id-> - route.addr.src_addr)->sin_addr.s_addr, - ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> - route.addr.src_addr)->sin_port), - &((struct sockaddr_in *)&newxprt->sc_cm_id-> - route.addr.dst_addr)->sin_addr.s_addr, - ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> - route.addr.dst_addr)->sin_port), - newxprt->sc_max_sge, - newxprt->sc_max_sge_rd, - newxprt->sc_sq_depth, - newxprt->sc_max_requests, - newxprt->sc_ord); + dprintk("svcrdma: new connection %p accepted:\n", newxprt); + sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; + dprintk(" local address : %pIS:%u\n", sap, rpc_get_port(sap)); + sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; + dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap)); + dprintk(" max_sge : %d\n", newxprt->sc_max_sge); + dprintk(" max_sge_rd : %d\n", newxprt->sc_max_sge_rd); + dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth); + dprintk(" max_requests : %d\n", newxprt->sc_max_requests); + dprintk(" ord : %d\n", newxprt->sc_ord); return &newxprt->sc_xprt; From fafedf81700d7d694b3716a14bc55bf4227344c7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Nov 2016 11:05:39 -0500 Subject: [PATCH 17/19] svcrdma: Further clean-up of svc_rdma_get_inv_rkey() No longer any need for the dprintk(). Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 30eeab527bd0..ad4d286a83c5 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -199,31 +199,22 @@ static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp, { struct rpcrdma_read_chunk *rd_ary; struct rpcrdma_segment *arg_ch; - u32 inv_rkey; - - inv_rkey = 0; rd_ary = (struct rpcrdma_read_chunk *)&rdma_argp->rm_body.rm_chunks[0]; - if (rd_ary->rc_discrim != xdr_zero) { - inv_rkey = be32_to_cpu(rd_ary->rc_target.rs_handle); - goto out; - } + if (rd_ary->rc_discrim != xdr_zero) + return be32_to_cpu(rd_ary->rc_target.rs_handle); if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) { arg_ch = &wr_ary->wc_array[0].wc_target; - inv_rkey = be32_to_cpu(arg_ch->rs_handle); - goto out; + return be32_to_cpu(arg_ch->rs_handle); } if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) { arg_ch = &rp_ary->wc_array[0].wc_target; - inv_rkey = be32_to_cpu(arg_ch->rs_handle); - goto out; + return be32_to_cpu(arg_ch->rs_handle); } -out: - dprintk("svcrdma: Send With Invalidate rkey=%08x\n", inv_rkey); - return inv_rkey; + return 0; } /* Assumptions: From 3eb15f2828464791f68b341ce87183012c509fc6 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Sun, 4 Dec 2016 13:45:28 +0100 Subject: [PATCH 18/19] sunrpc: use DEFINE_SPINLOCK() Signed-off-by: Fabian Frederick Signed-off-by: J. Bruce Fields --- net/sunrpc/svcauth.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 69841db1f533..e112da8005b5 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -124,8 +124,7 @@ EXPORT_SYMBOL_GPL(svc_auth_unregister); #define DN_HASHMAX (1< Date: Tue, 12 Jan 2016 20:24:14 +0100 Subject: [PATCH 19/19] nfsd: add support for the umask attribute Clients can set the umask attribute when creating files to cause the server to apply it always except when inheriting permissions from the parent directory. That way, the new files will end up with the same permissions as files created locally. See https://tools.ietf.org/html/draft-ietf-nfsv4-umask-02 for more details. Signed-off-by: Andreas Gruenbacher Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 3 +++ fs/nfsd/nfs4xdr.c | 26 +++++++++++++++++++++----- fs/nfsd/nfsd.h | 9 +++++++-- fs/nfsd/nfssvc.c | 4 ++-- include/linux/nfs4.h | 1 + 5 files changed, 34 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index e901cf124e9f..74a6e573e061 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -102,6 +102,9 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_attrnotsupp; if (writable && !bmval_is_subset(bmval, writable)) return nfserr_inval; + if (writable && (bmval[2] & FATTR4_WORD2_MODE_UMASK) && + (bmval[1] & FATTR4_WORD1_MODE)) + return nfserr_inval; return nfs_ok; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 281739e1d477..79edde4577b2 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -33,6 +33,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include #include #include #include @@ -299,7 +300,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) static __be32 nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr, struct nfs4_acl **acl, - struct xdr_netobj *label) + struct xdr_netobj *label, int *umask) { int expected_len, len = 0; u32 dummy32; @@ -457,6 +458,17 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, return nfserr_jukebox; } #endif + if (bmval[2] & FATTR4_WORD2_MODE_UMASK) { + if (!umask) + goto xdr_error; + READ_BUF(8); + len += 8; + dummy32 = be32_to_cpup(p++); + iattr->ia_mode = dummy32 & (S_IFMT | S_IALLUGO); + dummy32 = be32_to_cpup(p++); + *umask = dummy32 & S_IRWXUGO; + iattr->ia_valid |= ATTR_MODE; + } if (len != expected_len) goto xdr_error; @@ -651,7 +663,8 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create return status; status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, - &create->cr_acl, &create->cr_label); + &create->cr_acl, &create->cr_label, + ¤t->fs->umask); if (status) goto out; @@ -896,13 +909,15 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) case NFS4_OPEN_NOCREATE: break; case NFS4_OPEN_CREATE: + current->fs->umask = 0; READ_BUF(4); open->op_createmode = be32_to_cpup(p++); switch (open->op_createmode) { case NFS4_CREATE_UNCHECKED: case NFS4_CREATE_GUARDED: status = nfsd4_decode_fattr(argp, open->op_bmval, - &open->op_iattr, &open->op_acl, &open->op_label); + &open->op_iattr, &open->op_acl, &open->op_label, + ¤t->fs->umask); if (status) goto out; break; @@ -916,7 +931,8 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) READ_BUF(NFS4_VERIFIER_SIZE); COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); status = nfsd4_decode_fattr(argp, open->op_bmval, - &open->op_iattr, &open->op_acl, &open->op_label); + &open->op_iattr, &open->op_acl, &open->op_label, + ¤t->fs->umask); if (status) goto out; break; @@ -1153,7 +1169,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta if (status) return status; return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, - &setattr->sa_acl, &setattr->sa_label); + &setattr->sa_acl, &setattr->sa_label, NULL); } static __be32 diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 7155239b2908..d74c8c44dc35 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -359,6 +359,7 @@ void nfsd_lockd_shutdown(void); #define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ + FATTR4_WORD2_MODE_UMASK | \ NFSD4_2_SECURITY_ATTRS) extern u32 nfsd_suppattrs[3][3]; @@ -390,10 +391,14 @@ static inline bool nfsd_attrs_supported(u32 minorversion, u32 *bmval) (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) #ifdef CONFIG_NFSD_V4_SECURITY_LABEL -#define NFSD_WRITEABLE_ATTRS_WORD2 FATTR4_WORD2_SECURITY_LABEL +#define MAYBE_FATTR4_WORD2_SECURITY_LABEL \ + FATTR4_WORD2_SECURITY_LABEL #else -#define NFSD_WRITEABLE_ATTRS_WORD2 0 +#define MAYBE_FATTR4_WORD2_SECURITY_LABEL 0 #endif +#define NFSD_WRITEABLE_ATTRS_WORD2 \ + (FATTR4_WORD2_MODE_UMASK \ + | MAYBE_FATTR4_WORD2_SECURITY_LABEL) #define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ NFSD_WRITEABLE_ATTRS_WORD0 diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index a2b65fc56dd6..e6bfd96734c0 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -661,8 +661,8 @@ nfsd(void *vrqstp) mutex_lock(&nfsd_mutex); /* At this point, the thread shares current->fs - * with the init process. We need to create files with a - * umask of 0 instead of init's umask. */ + * with the init process. We need to create files with the + * umask as defined by the client instead of init's umask. */ if (unshare_fs_struct() < 0) { printk("Unable to start nfsd thread: out of memory\n"); goto out; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 9094faf0699d..bca536341d1a 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -440,6 +440,7 @@ enum lock_type4 { #define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4) #define FATTR4_WORD2_CLONE_BLKSIZE (1UL << 13) #define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) +#define FATTR4_WORD2_MODE_UMASK (1UL << 17) /* MDS threshold bitmap bits */ #define THRESHOLD_RD (1UL << 0)