mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-01 10:43:43 +00:00
SUNRPC: Handle TCP socket sends with kernel_sendpage() again
Daire Byrne reports a ~50% aggregrate throughput regression on his Linux NFS server after commitda1661b93b
("SUNRPC: Teach server to use xprt_sock_sendmsg for socket sends"), which replaced kernel_send_page() calls in NFSD's socket send path with calls to sock_sendmsg() using iov_iter. Investigation showed that tcp_sendmsg() was not using zero-copy to send the xdr_buf's bvec pages, but instead was relying on memcpy. This means copying every byte of a large NFS READ payload. It looks like TLS sockets do indeed support a ->sendpage method, so it's really not necessary to use xprt_sock_sendmsg() to support TLS fully on the server. A mechanical reversion ofda1661b93b
is not possible at this point, but we can re-implement the server's TCP socket sendmsg path using kernel_sendpage(). Reported-by: Daire Byrne <daire@dneg.com> BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=209439 Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
This commit is contained in:
parent
d6c9e4368c
commit
4a85a6a332
@ -1062,6 +1062,90 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
|
||||
return 0; /* record not complete */
|
||||
}
|
||||
|
||||
static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec,
|
||||
int flags)
|
||||
{
|
||||
return kernel_sendpage(sock, virt_to_page(vec->iov_base),
|
||||
offset_in_page(vec->iov_base),
|
||||
vec->iov_len, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* kernel_sendpage() is used exclusively to reduce the number of
|
||||
* copy operations in this path. Therefore the caller must ensure
|
||||
* that the pages backing @xdr are unchanging.
|
||||
*
|
||||
* In addition, the logic assumes that * .bv_len is never larger
|
||||
* than PAGE_SIZE.
|
||||
*/
|
||||
static int svc_tcp_sendmsg(struct socket *sock, struct msghdr *msg,
|
||||
struct xdr_buf *xdr, rpc_fraghdr marker,
|
||||
unsigned int *sentp)
|
||||
{
|
||||
const struct kvec *head = xdr->head;
|
||||
const struct kvec *tail = xdr->tail;
|
||||
struct kvec rm = {
|
||||
.iov_base = &marker,
|
||||
.iov_len = sizeof(marker),
|
||||
};
|
||||
int flags, ret;
|
||||
|
||||
*sentp = 0;
|
||||
xdr_alloc_bvec(xdr, GFP_KERNEL);
|
||||
|
||||
msg->msg_flags = MSG_MORE;
|
||||
ret = kernel_sendmsg(sock, msg, &rm, 1, rm.iov_len);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
*sentp += ret;
|
||||
if (ret != rm.iov_len)
|
||||
return -EAGAIN;
|
||||
|
||||
flags = head->iov_len < xdr->len ? MSG_MORE | MSG_SENDPAGE_NOTLAST : 0;
|
||||
ret = svc_tcp_send_kvec(sock, head, flags);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
*sentp += ret;
|
||||
if (ret != head->iov_len)
|
||||
goto out;
|
||||
|
||||
if (xdr->page_len) {
|
||||
unsigned int offset, len, remaining;
|
||||
struct bio_vec *bvec;
|
||||
|
||||
bvec = xdr->bvec;
|
||||
offset = xdr->page_base;
|
||||
remaining = xdr->page_len;
|
||||
flags = MSG_MORE | MSG_SENDPAGE_NOTLAST;
|
||||
while (remaining > 0) {
|
||||
if (remaining <= PAGE_SIZE && tail->iov_len == 0)
|
||||
flags = 0;
|
||||
len = min(remaining, bvec->bv_len);
|
||||
ret = kernel_sendpage(sock, bvec->bv_page,
|
||||
bvec->bv_offset + offset,
|
||||
len, flags);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
*sentp += ret;
|
||||
if (ret != len)
|
||||
goto out;
|
||||
remaining -= len;
|
||||
offset = 0;
|
||||
bvec++;
|
||||
}
|
||||
}
|
||||
|
||||
if (tail->iov_len) {
|
||||
ret = svc_tcp_send_kvec(sock, tail, 0);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
*sentp += ret;
|
||||
}
|
||||
|
||||
out:
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* svc_tcp_sendto - Send out a reply on a TCP socket
|
||||
* @rqstp: completed svc_rqst
|
||||
@ -1089,7 +1173,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp)
|
||||
mutex_lock(&xprt->xpt_mutex);
|
||||
if (svc_xprt_is_dead(xprt))
|
||||
goto out_notconn;
|
||||
err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent);
|
||||
err = svc_tcp_sendmsg(svsk->sk_sock, &msg, xdr, marker, &sent);
|
||||
xdr_free_bvec(xdr);
|
||||
trace_svcsock_tcp_send(xprt, err < 0 ? err : sent);
|
||||
if (err < 0 || sent != (xdr->len + sizeof(marker)))
|
||||
|
Loading…
Reference in New Issue
Block a user