RPC/RDMA: adhere to protocol for unpadded client trailing write chunks.
author: Tom Talpey <talpey@netapp.com>
Thu, 9 Oct 2008 19:01:11 +0000 (15:01 -0400)
committer: Trond Myklebust <Trond.Myklebust@netapp.com>
Fri, 10 Oct 2008 19:12:33 +0000 (15:12 -0400)
The RPC/RDMA protocol allows clients and servers to avoid RDMA
operations for data which is purely the result of XDR padding.
On the client, automatically insert the necessary padding for
such server replies, and optionally don't marshal such chunks.

Signed-off-by: Tom Talpey <talpey@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
net/sunrpc/xprtrdma/rpc_rdma.c
net/sunrpc/xprtrdma/transport.c
net/sunrpc/xprtrdma/xprt_rdma.h

index 721dae7..d245c0b 100644 (file)
@@ -118,6 +118,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
        }
 
        if (xdrbuf->tail[0].iov_len) {
+               /* the rpcrdma protocol allows us to omit any trailing
+                * xdr pad bytes, saving the server an RDMA operation. */
+               if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
+                       return n;
                if (n == nsegs)
                        return 0;
                seg[n].mr_page = NULL;
@@ -594,7 +598,7 @@ rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __b
  * Scatter inline received data back into provided iov's.
  */
 static void
-rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len)
+rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
 {
        int i, npages, curlen, olen;
        char *destp;
@@ -660,6 +664,13 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len)
        } else
                rqst->rq_rcv_buf.tail[0].iov_len = 0;
 
+       if (pad) {
+               /* implicit padding on terminal chunk */
+               unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
+               while (pad--)
+                       p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
+       }
+
        if (copy_len)
                dprintk("RPC:       %s: %d bytes in"
                        " %d extra segments (%d lost)\n",
@@ -794,14 +805,20 @@ repost:
                            ((unsigned char *)iptr - (unsigned char *)headerp);
                        status = rep->rr_len + rdmalen;
                        r_xprt->rx_stats.total_rdma_reply += rdmalen;
+                       /* special case - last chunk may omit padding */
+                       if (rdmalen &= 3) {
+                               rdmalen = 4 - rdmalen;
+                               status += rdmalen;
+                       }
                } else {
                        /* else ordinary inline */
+                       rdmalen = 0;
                        iptr = (__be32 *)((unsigned char *)headerp + 28);
                        rep->rr_len -= 28; /*sizeof *headerp;*/
                        status = rep->rr_len;
                }
                /* Fix up the rpc results for upper layer */
-               rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len);
+               rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
                break;
 
        case __constant_htonl(RDMA_NOMSG):
index ec6d1e7..c7d2380 100644 (file)
@@ -71,6 +71,7 @@ static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_inline_write_padding;
 static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
+                int xprt_rdma_pad_optimize = 0;
 
 #ifdef RPC_DEBUG
 
@@ -136,6 +137,14 @@ static ctl_table xr_tunables_table[] = {
                .extra2         = &max_memreg,
        },
        {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "rdma_pad_optimize",
+               .data           = &xprt_rdma_pad_optimize,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+       {
                .ctl_name = 0,
        },
 };
index 2db2344..fde6499 100644 (file)
@@ -280,6 +280,11 @@ struct rpcrdma_xprt {
 #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt)
 #define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
 
+/* Setting this to 0 ensures interoperability with early servers.
+ * Setting this to 1 enhances certain unaligned read/write performance.
+ * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
+extern int xprt_rdma_pad_optimize;
+
 /*
  * Interface Adapter calls - xprtrdma/verbs.c
  */