[safe/jmp/linux-2.6] net/sunrpc/xprtrdma/verbs.c @ 8ea283ecc522a8a5e8e033b12f6485fc5ef456c4
1 /*
2  * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the BSD-type
8  * license below:
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *
14  *      Redistributions of source code must retain the above copyright
15  *      notice, this list of conditions and the following disclaimer.
16  *
17  *      Redistributions in binary form must reproduce the above
18  *      copyright notice, this list of conditions and the following
19  *      disclaimer in the documentation and/or other materials provided
20  *      with the distribution.
21  *
22  *      Neither the name of the Network Appliance, Inc. nor the names of
23  *      its contributors may be used to endorse or promote products
24  *      derived from this software without specific prior written
25  *      permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38  */
39
40 /*
41  * verbs.c
42  *
43  * Encapsulates the major functions managing:
44  *  o adapters
45  *  o endpoints
46  *  o connections
47  *  o buffer memory
48  */
49
50 #include <linux/pci.h>  /* for Tavor hack below */
51
52 #include "xprt_rdma.h"
53
54 /*
55  * Globals/Macros
56  */
57
58 #ifdef RPC_DEBUG
59 # define RPCDBG_FACILITY        RPCDBG_TRANS
60 #endif
61
62 /*
63  * internal functions
64  */
65
66 /*
67  * Handle replies in tasklet context, using a single, global list.
68  * The rdma tasklet function simply turns around and calls the reply
69  * handler for each reply on the list.
70  */
71
72 static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
73 static LIST_HEAD(rpcrdma_tasklets_g);
74
75 static void
76 rpcrdma_run_tasklet(unsigned long data)
77 {
78         struct rpcrdma_rep *rep;
79         void (*func)(struct rpcrdma_rep *);
80         unsigned long flags;
81
82         data = data;
83         spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
84         while (!list_empty(&rpcrdma_tasklets_g)) {
85                 rep = list_entry(rpcrdma_tasklets_g.next,
86                                  struct rpcrdma_rep, rr_list);
87                 list_del(&rep->rr_list);
88                 func = rep->rr_func;
89                 rep->rr_func = NULL;
90                 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
91
92                 if (func)
93                         func(rep);
94                 else
95                         rpcrdma_recv_buffer_put(rep);
96
97                 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
98         }
99         spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
100 }
101
102 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
103
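/*
 * Queue a completed reply on the global tasklet list and schedule
 * the reply-processing tasklet.
 */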
104 static inline void
105 rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
106 {
107         unsigned long flags;
108
109         spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
110         list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
111         spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
112         tasklet_schedule(&rpcrdma_tasklet_g);
113 }
114
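/*
 * Handle a fatal QP asynchronous event (the CQ handler below behaves
 * the same): if the endpoint is currently connected, mark it failed
 * (-EIO), call the registered callback and wake connection waiters.
 */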
115 static void
116 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
117 {
118         struct rpcrdma_ep *ep = context;
119
120         dprintk("RPC:       %s: QP error %X on device %s ep %p\n",
121                 __func__, event->event, event->device->name, context);
122         if (ep->rep_connected == 1) {
123                 ep->rep_connected = -EIO;
124                 ep->rep_func(ep);
125                 wake_up_all(&ep->rep_connect_wait);
126         }
127 }
128
129 static void
130 rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
131 {
132         struct rpcrdma_ep *ep = context;
133
134         dprintk("RPC:       %s: CQ error %X on device %s ep %p\n",
135                 __func__, event->event, event->device->name, context);
136         if (ep->rep_connected == 1) {
137                 ep->rep_connected = -EIO;
138                 ep->rep_func(ep);
139                 wake_up_all(&ep->rep_connect_wait);
140         }
141 }
142
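/*
 * Process a single work completion. Completions with no wr_id are
 * ignored. Errors mark the reply (rr_len = ~0U) and hand it to the
 * tasklet. Successful receives record the length, sync the buffer
 * for the CPU and capture the credit count advertised in the
 * RPC/RDMA header before scheduling reply processing.
 */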
143 static inline
144 void rpcrdma_event_process(struct ib_wc *wc)
145 {
146         struct rpcrdma_rep *rep =
147                         (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
148
149         dprintk("RPC:       %s: event rep %p status %X opcode %X length %u\n",
150                 __func__, rep, wc->status, wc->opcode, wc->byte_len);
151
152         if (!rep) /* send or bind completion that we don't care about */
153                 return;
154
155         if (IB_WC_SUCCESS != wc->status) {
156                 dprintk("RPC:       %s: %s WC status %X, connection lost\n",
157                         __func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
158                          wc->status);
159                 rep->rr_len = ~0U;
160                 rpcrdma_schedule_tasklet(rep);
161                 return;
162         }
163
164         switch (wc->opcode) {
165         case IB_WC_RECV:
166                 rep->rr_len = wc->byte_len;
167                 ib_dma_sync_single_for_cpu(
168                         rdmab_to_ia(rep->rr_buffer)->ri_id->device,
169                         rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
170                 /* Keep (only) the most recent credits, after checking validity */
171                 if (rep->rr_len >= 16) {
172                         struct rpcrdma_msg *p =
173                                         (struct rpcrdma_msg *) rep->rr_base;
174                         unsigned int credits = ntohl(p->rm_credit);
175                         if (credits == 0) {
176                                 dprintk("RPC:       %s: server"
177                                         " dropped credits to 0!\n", __func__);
178                                 /* don't deadlock */
179                                 credits = 1;
180                         } else if (credits > rep->rr_buffer->rb_max_requests) {
181                                 dprintk("RPC:       %s: server"
182                                         " over-crediting: %d (%d)\n",
183                                         __func__, credits,
184                                         rep->rr_buffer->rb_max_requests);
185                                 credits = rep->rr_buffer->rb_max_requests;
186                         }
187                         atomic_set(&rep->rr_buffer->rb_credits, credits);
188                 }
189                 /* fall through */
190         case IB_WC_BIND_MW:
191                 rpcrdma_schedule_tasklet(rep);
192                 break;
193         default:
194                 dprintk("RPC:       %s: unexpected WC event %X\n",
195                         __func__, wc->opcode);
196                 break;
197         }
198 }
199
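/*
 * Poll the completion queue until it is empty, dispatching each
 * completion to rpcrdma_event_process. Returns 0, or the error
 * from ib_poll_cq.
 */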
200 static inline int
201 rpcrdma_cq_poll(struct ib_cq *cq)
202 {
203         struct ib_wc wc;
204         int rc;
205
206         for (;;) {
207                 rc = ib_poll_cq(cq, 1, &wc);
208                 if (rc < 0) {
209                         dprintk("RPC:       %s: ib_poll_cq failed %i\n",
210                                 __func__, rc);
211                         return rc;
212                 }
213                 if (rc == 0)
214                         break;
215
216                 rpcrdma_event_process(&wc);
217         }
218
219         return 0;
220 }
221
222 /*
223  * rpcrdma_cq_event_upcall
224  *
225  * This upcall handles recv, send, bind and unbind events.
226  * It is reentrant but processes single events in order to preserve
227  * the ordering of receives, which keeps the server credit accounting accurate.
228  *
229  * It is the responsibility of the scheduled tasklet to return
230  * recv buffers to the pool. NOTE: this affects synchronization of
231  * connection shutdown. That is, the structures required for
232  * the completion of the reply handler must remain intact until
233  * all memory has been reclaimed.
234  *
235  * Note that send events are suppressed and do not result in an upcall.
236  */
237 static void
238 rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
239 {
240         int rc;
241
242         rc = rpcrdma_cq_poll(cq);
243         if (rc)
244                 return;
245
246         rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
247         if (rc) {
248                 dprintk("RPC:       %s: ib_req_notify_cq failed %i\n",
249                         __func__, rc);
250                 return;
251         }
252
253         rpcrdma_cq_poll(cq);
254 }
255
256 #ifdef RPC_DEBUG
257 static const char * const conn[] = {
258         "address resolved",
259         "address error",
260         "route resolved",
261         "route error",
262         "connect request",
263         "connect response",
264         "connect error",
265         "unreachable",
266         "rejected",
267         "established",
268         "disconnected",
269         "device removal"
270 };
271 #endif
272
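/*
 * Connection manager event handler. Address and route resolution
 * results are returned through ia->ri_async_rc and ia->ri_done.
 * Connection events update ep->rep_connected, reset the advertised
 * credit count to one, and wake anyone waiting in rep_connect_wait.
 */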
273 static int
274 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
275 {
276         struct rpcrdma_xprt *xprt = id->context;
277         struct rpcrdma_ia *ia = &xprt->rx_ia;
278         struct rpcrdma_ep *ep = &xprt->rx_ep;
279         struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
280         struct ib_qp_attr attr;
281         struct ib_qp_init_attr iattr;
282         int connstate = 0;
283
284         switch (event->event) {
285         case RDMA_CM_EVENT_ADDR_RESOLVED:
286         case RDMA_CM_EVENT_ROUTE_RESOLVED:
287                 complete(&ia->ri_done);
288                 break;
289         case RDMA_CM_EVENT_ADDR_ERROR:
290                 ia->ri_async_rc = -EHOSTUNREACH;
291                 dprintk("RPC:       %s: CM address resolution error, ep 0x%p\n",
292                         __func__, ep);
293                 complete(&ia->ri_done);
294                 break;
295         case RDMA_CM_EVENT_ROUTE_ERROR:
296                 ia->ri_async_rc = -ENETUNREACH;
297                 dprintk("RPC:       %s: CM route resolution error, ep 0x%p\n",
298                         __func__, ep);
299                 complete(&ia->ri_done);
300                 break;
301         case RDMA_CM_EVENT_ESTABLISHED:
302                 connstate = 1;
303                 ib_query_qp(ia->ri_id->qp, &attr,
304                         IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
305                         &iattr);
306                 dprintk("RPC:       %s: %d responder resources"
307                         " (%d initiator)\n",
308                         __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
309                 goto connected;
310         case RDMA_CM_EVENT_CONNECT_ERROR:
311                 connstate = -ENOTCONN;
312                 goto connected;
313         case RDMA_CM_EVENT_UNREACHABLE:
314                 connstate = -ENETDOWN;
315                 goto connected;
316         case RDMA_CM_EVENT_REJECTED:
317                 connstate = -ECONNREFUSED;
318                 goto connected;
319         case RDMA_CM_EVENT_DISCONNECTED:
320                 connstate = -ECONNABORTED;
321                 goto connected;
322         case RDMA_CM_EVENT_DEVICE_REMOVAL:
323                 connstate = -ENODEV;
324 connected:
325                 dprintk("RPC:       %s: %s: %u.%u.%u.%u:%u"
326                         " (ep 0x%p event 0x%x)\n",
327                         __func__,
328                         (event->event <= 11) ? conn[event->event] :
329                                                 "unknown connection error",
330                         NIPQUAD(addr->sin_addr.s_addr),
331                         ntohs(addr->sin_port),
332                         ep, event->event);
333                 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
334                 dprintk("RPC:       %s: %sconnected\n",
335                                         __func__, connstate > 0 ? "" : "dis");
336                 ep->rep_connected = connstate;
337                 ep->rep_func(ep);
338                 wake_up_all(&ep->rep_connect_wait);
339                 break;
340         default:
341                 ia->ri_async_rc = -EINVAL;
342                 dprintk("RPC:       %s: unexpected CM event %X\n",
343                         __func__, event->event);
344                 complete(&ia->ri_done);
345                 break;
346         }
347
348         return 0;
349 }
350
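/*
 * Create an RDMA CM id for the transport and synchronously resolve
 * the server's address and route, collecting the asynchronous
 * results via ia->ri_done and ia->ri_async_rc. Returns the new id,
 * or an ERR_PTR on failure.
 */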
351 static struct rdma_cm_id *
352 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
353                         struct rpcrdma_ia *ia, struct sockaddr *addr)
354 {
355         struct rdma_cm_id *id;
356         int rc;
357
358         id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
359         if (IS_ERR(id)) {
360                 rc = PTR_ERR(id);
361                 dprintk("RPC:       %s: rdma_create_id() failed %i\n",
362                         __func__, rc);
363                 return id;
364         }
365
366         ia->ri_async_rc = 0;
367         rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
368         if (rc) {
369                 dprintk("RPC:       %s: rdma_resolve_addr() failed %i\n",
370                         __func__, rc);
371                 goto out;
372         }
373         wait_for_completion(&ia->ri_done);
374         rc = ia->ri_async_rc;
375         if (rc)
376                 goto out;
377
378         ia->ri_async_rc = 0;
379         rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
380         if (rc) {
381                 dprintk("RPC:       %s: rdma_resolve_route() failed %i\n",
382                         __func__, rc);
383                 goto out;
384         }
385         wait_for_completion(&ia->ri_done);
386         rc = ia->ri_async_rc;
387         if (rc)
388                 goto out;
389
390         return id;
391
392 out:
393         rdma_destroy_id(id);
394         return ERR_PTR(rc);
395 }
396
397 /*
398  * Drain any cq, prior to teardown.
399  */
400 static void
401 rpcrdma_clean_cq(struct ib_cq *cq)
402 {
403         struct ib_wc wc;
404         int count = 0;
405
406         while (1 == ib_poll_cq(cq, 1, &wc))
407                 ++count;
408
409         if (count)
410                 dprintk("RPC:       %s: flushed %d events (last 0x%x)\n",
411                         __func__, count, wc.opcode);
412 }
413
414 /*
415  * Exported functions.
416  */
417
418 /*
419  * Open and initialize an Interface Adapter.
420  *  o initializes fields of struct rpcrdma_ia, including
421  *    interface and provider attributes and protection zone.
422  */
423 int
424 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
425 {
426         int rc;
427         struct rpcrdma_ia *ia = &xprt->rx_ia;
428
429         init_completion(&ia->ri_done);
430
431         ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
432         if (IS_ERR(ia->ri_id)) {
433                 rc = PTR_ERR(ia->ri_id);
434                 goto out1;
435         }
436
437         ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
438         if (IS_ERR(ia->ri_pd)) {
439                 rc = PTR_ERR(ia->ri_pd);
440                 dprintk("RPC:       %s: ib_alloc_pd() failed %i\n",
441                         __func__, rc);
442                 goto out2;
443         }
444
445         /*
446          * Optionally obtain an underlying physical identity mapping in
447          * order to do a memory window-based bind. This base registration
448          * is protected from remote access - that is enabled only by binding
449          * for the specific bytes targeted during each RPC operation, and
450          * revoked after the corresponding completion similar to a storage
451          * adapter.
452          */
453         if (memreg > RPCRDMA_REGISTER) {
454                 int mem_priv = IB_ACCESS_LOCAL_WRITE;
455                 switch (memreg) {
456 #if RPCRDMA_PERSISTENT_REGISTRATION
457                 case RPCRDMA_ALLPHYSICAL:
458                         mem_priv |= IB_ACCESS_REMOTE_WRITE;
459                         mem_priv |= IB_ACCESS_REMOTE_READ;
460                         break;
461 #endif
462                 case RPCRDMA_MEMWINDOWS_ASYNC:
463                 case RPCRDMA_MEMWINDOWS:
464                         mem_priv |= IB_ACCESS_MW_BIND;
465                         break;
466                 default:
467                         break;
468                 }
469                 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
470                 if (IS_ERR(ia->ri_bind_mem)) {
471                         printk(KERN_ALERT "%s: ib_get_dma_mr for "
472                                 "phys register failed with %lX\n\t"
473                                 "Will continue with degraded performance\n",
474                                 __func__, PTR_ERR(ia->ri_bind_mem));
475                         memreg = RPCRDMA_REGISTER;
476                         ia->ri_bind_mem = NULL;
477                 }
478         }
479
480         /* Else will do memory reg/dereg for each chunk */
481         ia->ri_memreg_strategy = memreg;
482
483         return 0;
484 out2:
485         rdma_destroy_id(ia->ri_id);
486 out1:
487         return rc;
488 }
489
490 /*
491  * Clean up/close an IA.
492  *   o if event handles and PD have been initialized, free them.
493  *   o close the IA
494  */
495 void
496 rpcrdma_ia_close(struct rpcrdma_ia *ia)
497 {
498         int rc;
499
500         dprintk("RPC:       %s: entering\n", __func__);
501         if (ia->ri_bind_mem != NULL) {
502                 rc = ib_dereg_mr(ia->ri_bind_mem);
503                 dprintk("RPC:       %s: ib_dereg_mr returned %i\n",
504                         __func__, rc);
505         }
506         if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp)
507                 rdma_destroy_qp(ia->ri_id);
508         if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
509                 rc = ib_dealloc_pd(ia->ri_pd);
510                 dprintk("RPC:       %s: ib_dealloc_pd returned %i\n",
511                         __func__, rc);
512         }
513         if (ia->ri_id != NULL && !IS_ERR(ia->ri_id))
514                 rdma_destroy_id(ia->ri_id);
515 }
516
517 /*
518  * Create unconnected endpoint.
519  */
520 int
521 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
522                                 struct rpcrdma_create_data_internal *cdata)
523 {
524         struct ib_device_attr devattr;
525         int rc, err;
526
527         rc = ib_query_device(ia->ri_id->device, &devattr);
528         if (rc) {
529                 dprintk("RPC:       %s: ib_query_device failed %d\n",
530                         __func__, rc);
531                 return rc;
532         }
533
534         /* check provider's send/recv wr limits */
535         if (cdata->max_requests > devattr.max_qp_wr)
536                 cdata->max_requests = devattr.max_qp_wr;
537
538         ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
539         ep->rep_attr.qp_context = ep;
540         /* send_cq and recv_cq initialized below */
541         ep->rep_attr.srq = NULL;
542         ep->rep_attr.cap.max_send_wr = cdata->max_requests;
543         switch (ia->ri_memreg_strategy) {
544         case RPCRDMA_MEMWINDOWS_ASYNC:
545         case RPCRDMA_MEMWINDOWS:
546                 /* Add room for mw_binds+unbinds - overkill! */
547                 ep->rep_attr.cap.max_send_wr++;
548                 ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
549                 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
550                         return -EINVAL;
551                 break;
552         default:
553                 break;
554         }
555         ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
556         ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
557         ep->rep_attr.cap.max_recv_sge = 1;
558         ep->rep_attr.cap.max_inline_data = 0;
559         ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
560         ep->rep_attr.qp_type = IB_QPT_RC;
561         ep->rep_attr.port_num = ~0;
562
563         dprintk("RPC:       %s: requested max: dtos: send %d recv %d; "
564                 "iovs: send %d recv %d\n",
565                 __func__,
566                 ep->rep_attr.cap.max_send_wr,
567                 ep->rep_attr.cap.max_recv_wr,
568                 ep->rep_attr.cap.max_send_sge,
569                 ep->rep_attr.cap.max_recv_sge);
570
571         /* set trigger for requesting send completion */
572         ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /*  - 1*/;
573         switch (ia->ri_memreg_strategy) {
574         case RPCRDMA_MEMWINDOWS_ASYNC:
575         case RPCRDMA_MEMWINDOWS:
576                 ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
577                 break;
578         default:
579                 break;
580         }
581         if (ep->rep_cqinit <= 2)
582                 ep->rep_cqinit = 0;
583         INIT_CQCOUNT(ep);
584         ep->rep_ia = ia;
585         init_waitqueue_head(&ep->rep_connect_wait);
586
587         /*
588          * Create a single cq for receive dto and mw_bind (only ever
589          * care about unbind, really). Send completions are suppressed.
590          * Use single threaded tasklet upcalls to maintain ordering.
591          */
592         ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
593                                   rpcrdma_cq_async_error_upcall, NULL,
594                                   ep->rep_attr.cap.max_recv_wr +
595                                   ep->rep_attr.cap.max_send_wr + 1, 0);
596         if (IS_ERR(ep->rep_cq)) {
597                 rc = PTR_ERR(ep->rep_cq);
598                 dprintk("RPC:       %s: ib_create_cq failed: %i\n",
599                         __func__, rc);
600                 goto out1;
601         }
602
603         rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
604         if (rc) {
605                 dprintk("RPC:       %s: ib_req_notify_cq failed: %i\n",
606                         __func__, rc);
607                 goto out2;
608         }
609
610         ep->rep_attr.send_cq = ep->rep_cq;
611         ep->rep_attr.recv_cq = ep->rep_cq;
612
613         /* Initialize cma parameters */
614
615         /* RPC/RDMA does not use private data */
616         ep->rep_remote_cma.private_data = NULL;
617         ep->rep_remote_cma.private_data_len = 0;
618
619         /* Client offers RDMA Read but does not initiate */
620         switch (ia->ri_memreg_strategy) {
621         case RPCRDMA_BOUNCEBUFFERS:
622                 ep->rep_remote_cma.responder_resources = 0;
623                 break;
624         case RPCRDMA_MTHCAFMR:
625         case RPCRDMA_REGISTER:
626                 ep->rep_remote_cma.responder_resources = cdata->max_requests *
627                                 (RPCRDMA_MAX_DATA_SEGS / 8);
628                 break;
629         case RPCRDMA_MEMWINDOWS:
630         case RPCRDMA_MEMWINDOWS_ASYNC:
631 #if RPCRDMA_PERSISTENT_REGISTRATION
632         case RPCRDMA_ALLPHYSICAL:
633 #endif
634                 ep->rep_remote_cma.responder_resources = cdata->max_requests *
635                                 (RPCRDMA_MAX_DATA_SEGS / 2);
636                 break;
637         default:
638                 break;
639         }
640         if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom)
641                 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
642         ep->rep_remote_cma.initiator_depth = 0;
643
644         ep->rep_remote_cma.retry_count = 7;
645         ep->rep_remote_cma.flow_control = 0;
646         ep->rep_remote_cma.rnr_retry_count = 0;
647
648         return 0;
649
650 out2:
651         err = ib_destroy_cq(ep->rep_cq);
652         if (err)
653                 dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
654                         __func__, err);
655 out1:
656         return rc;
657 }
658
659 /*
660  * rpcrdma_ep_destroy
661  *
662  * Disconnect and destroy endpoint. After this, the only
663  * valid operations on the ep are to free it (if dynamically
664  * allocated) or re-create it.
665  *
666  * The caller's error handling must be sure to not leak the endpoint
667  * if this function fails.
668  */
669 int
670 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
671 {
672         int rc;
673
674         dprintk("RPC:       %s: entering, connected is %d\n",
675                 __func__, ep->rep_connected);
676
677         if (ia->ri_id->qp) {
678                 rc = rpcrdma_ep_disconnect(ep, ia);
679                 if (rc)
680                         dprintk("RPC:       %s: rpcrdma_ep_disconnect"
681                                 " returned %i\n", __func__, rc);
682         }
683
684         ep->rep_func = NULL;
685
686         /* padding - could be done in rpcrdma_buffer_destroy... */
687         if (ep->rep_pad_mr) {
688                 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
689                 ep->rep_pad_mr = NULL;
690         }
691
692         if (ia->ri_id->qp) {
693                 rdma_destroy_qp(ia->ri_id);
694                 ia->ri_id->qp = NULL;
695         }
696
697         rpcrdma_clean_cq(ep->rep_cq);
698         rc = ib_destroy_cq(ep->rep_cq);
699         if (rc)
700                 dprintk("RPC:       %s: ib_destroy_cq returned %i\n",
701                         __func__, rc);
702
703         return rc;
704 }
705
706 /*
707  * Connect unconnected endpoint.
708  */
709 int
710 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
711 {
712         struct rdma_cm_id *id;
713         int rc = 0;
714         int retry_count = 0;
715         int reconnect = (ep->rep_connected != 0);
716
717         if (reconnect) {
718                 struct rpcrdma_xprt *xprt;
719 retry:
720                 rc = rpcrdma_ep_disconnect(ep, ia);
721                 if (rc && rc != -ENOTCONN)
722                         dprintk("RPC:       %s: rpcrdma_ep_disconnect"
723                                 " status %i\n", __func__, rc);
724                 rpcrdma_clean_cq(ep->rep_cq);
725
726                 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
727                 id = rpcrdma_create_id(xprt, ia,
728                                 (struct sockaddr *)&xprt->rx_data.addr);
729                 if (IS_ERR(id)) {
730                         rc = PTR_ERR(id);
731                         goto out;
732                 }
733                 /* TEMP TEMP TEMP - fail if new device:
734                  * Deregister/remarshal *all* requests!
735                  * Close and recreate adapter, pd, etc!
736                  * Re-determine all attributes still sane!
737                  * More stuff I haven't thought of!
738                  * Rrrgh!
739                  */
740                 if (ia->ri_id->device != id->device) {
741                         printk("RPC:       %s: can't reconnect on "
742                                 "different device!\n", __func__);
743                         rdma_destroy_id(id);
744                         rc = -ENETDOWN;
745                         goto out;
746                 }
747                 /* END TEMP */
748                 rdma_destroy_id(ia->ri_id);
749                 ia->ri_id = id;
750         }
751
752         rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
753         if (rc) {
754                 dprintk("RPC:       %s: rdma_create_qp failed %i\n",
755                         __func__, rc);
756                 goto out;
757         }
758
759 /* XXX Tavor device performs badly with 2K MTU! */
760 if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
761         struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
762         if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
763             (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
764              pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
765                 struct ib_qp_attr attr = {
766                         .path_mtu = IB_MTU_1024
767                 };
768                 rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
769         }
770 }
771
772         /* Theoretically a client initiator_depth > 0 is not needed,
773          * but many peers fail to complete the connection unless
774          * initiator_depth == responder_resources! */
775         if (ep->rep_remote_cma.initiator_depth !=
776                                 ep->rep_remote_cma.responder_resources)
777                 ep->rep_remote_cma.initiator_depth =
778                         ep->rep_remote_cma.responder_resources;
779
780         ep->rep_connected = 0;
781
782         rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
783         if (rc) {
784                 dprintk("RPC:       %s: rdma_connect() failed with %i\n",
785                                 __func__, rc);
786                 goto out;
787         }
788
789         if (reconnect)
790                 return 0;
791
792         wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
793
794         /*
795          * Check state. A non-peer reject indicates no listener
796          * (ECONNREFUSED), which may be a transient state. All
797          * others indicate a transport condition which has already
798          * undergone best-effort handling.
799          */
800         if (ep->rep_connected == -ECONNREFUSED
801             && ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
802                 dprintk("RPC:       %s: non-peer_reject, retry\n", __func__);
803                 goto retry;
804         }
805         if (ep->rep_connected <= 0) {
806                 /* Sometimes, the only way to reliably connect to remote
807                  * CMs is to use the same nonzero values for ORD and IRD. */
808                 ep->rep_remote_cma.initiator_depth =
809                                         ep->rep_remote_cma.responder_resources;
810                 if (ep->rep_remote_cma.initiator_depth == 0)
811                         ++ep->rep_remote_cma.initiator_depth;
812                 if (ep->rep_remote_cma.responder_resources == 0)
813                         ++ep->rep_remote_cma.responder_resources;
814                 if (retry_count++ == 0)
815                         goto retry;
816                 rc = ep->rep_connected;
817         } else {
818                 dprintk("RPC:       %s: connected\n", __func__);
819         }
820
821 out:
822         if (rc)
823                 ep->rep_connected = rc;
824         return rc;
825 }
826
827 /*
828  * rpcrdma_ep_disconnect
829  *
830  * This is separate from destroy to facilitate the ability
831  * to reconnect without recreating the endpoint.
832  *
833  * This call is not reentrant, and must not be made in parallel
834  * on the same endpoint.
835  */
836 int
837 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
838 {
839         int rc;
840
841         rpcrdma_clean_cq(ep->rep_cq);
842         rc = rdma_disconnect(ia->ri_id);
843         if (!rc) {
844                 /* returns without wait if not connected */
845                 wait_event_interruptible(ep->rep_connect_wait,
846                                                         ep->rep_connected != 1);
847                 dprintk("RPC:       %s: after wait, %sconnected\n", __func__,
848                         (ep->rep_connected == 1) ? "still " : "dis");
849         } else {
850                 dprintk("RPC:       %s: rdma_disconnect %i\n", __func__, rc);
851                 ep->rep_connected = rc;
852         }
853         return rc;
854 }
855
856 /*
857  * Initialize buffer memory
858  */
859 int
860 rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
861         struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
862 {
863         char *p;
864         size_t len;
865         int i, rc;
866
867         buf->rb_max_requests = cdata->max_requests;
868         spin_lock_init(&buf->rb_lock);
869         atomic_set(&buf->rb_credits, 1);
870
871         /* Need to allocate:
872          *   1.  arrays for send and recv pointers
873          *   2.  arrays of struct rpcrdma_req to fill in pointers
874          *   3.  array of struct rpcrdma_rep for replies
875          *   4.  padding, if any
876          *   5.  mw's, if any
877          * Send/recv buffers in req/rep need to be registered
878          */
879
880         len = buf->rb_max_requests *
881                 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
882         len += cdata->padding;
883         switch (ia->ri_memreg_strategy) {
884         case RPCRDMA_MTHCAFMR:
885                 /* TBD we are perhaps overallocating here */
886                 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
887                                 sizeof(struct rpcrdma_mw);
888                 break;
889         case RPCRDMA_MEMWINDOWS_ASYNC:
890         case RPCRDMA_MEMWINDOWS:
891                 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
892                                 sizeof(struct rpcrdma_mw);
893                 break;
894         default:
895                 break;
896         }
897
898         /* allocate 1, 4 and 5 in one shot */
899         p = kzalloc(len, GFP_KERNEL);
900         if (p == NULL) {
901                 dprintk("RPC:       %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
902                         __func__, len);
903                 rc = -ENOMEM;
904                 goto out;
905         }
906         buf->rb_pool = p;       /* for freeing it later */
907
908         buf->rb_send_bufs = (struct rpcrdma_req **) p;
909         p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
910         buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
911         p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
912
913         /*
914          * Register the zeroed pad buffer, if any.
915          */
916         if (cdata->padding) {
917                 rc = rpcrdma_register_internal(ia, p, cdata->padding,
918                                             &ep->rep_pad_mr, &ep->rep_pad);
919                 if (rc)
920                         goto out;
921         }
922         p += cdata->padding;
923
924         /*
925          * Allocate the fmr's, or mw's for mw_bind chunk registration.
926          * We "cycle" the mw's in order to minimize rkey reuse,
927          * and also reduce unbind-to-bind collision.
928          */
929         INIT_LIST_HEAD(&buf->rb_mws);
930         switch (ia->ri_memreg_strategy) {
931         case RPCRDMA_MTHCAFMR:
932                 {
933                 struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
934                 struct ib_fmr_attr fa = {
935                         RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT
936                 };
937                 /* TBD we are perhaps overallocating here */
938                 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
939                         r->r.fmr = ib_alloc_fmr(ia->ri_pd,
940                                 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
941                                 &fa);
942                         if (IS_ERR(r->r.fmr)) {
943                                 rc = PTR_ERR(r->r.fmr);
944                                 dprintk("RPC:       %s: ib_alloc_fmr"
945                                         " failed %i\n", __func__, rc);
946                                 goto out;
947                         }
948                         list_add(&r->mw_list, &buf->rb_mws);
949                         ++r;
950                 }
951                 }
952                 break;
953         case RPCRDMA_MEMWINDOWS_ASYNC:
954         case RPCRDMA_MEMWINDOWS:
955                 {
956                 struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
957                 /* Allocate one extra request's worth, for full cycling */
958                 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
959                         r->r.mw = ib_alloc_mw(ia->ri_pd);
960                         if (IS_ERR(r->r.mw)) {
961                                 rc = PTR_ERR(r->r.mw);
962                                 dprintk("RPC:       %s: ib_alloc_mw"
963                                         " failed %i\n", __func__, rc);
964                                 goto out;
965                         }
966                         list_add(&r->mw_list, &buf->rb_mws);
967                         ++r;
968                 }
969                 }
970                 break;
971         default:
972                 break;
973         }
974
975         /*
976          * Allocate/init the request/reply buffers. Doing this
977          * using kmalloc for now -- one for each buf.
978          */
979         for (i = 0; i < buf->rb_max_requests; i++) {
980                 struct rpcrdma_req *req;
981                 struct rpcrdma_rep *rep;
982
983                 len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
984                 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
985                 /* Typical ~2400b, so rounding up saves work later */
986                 if (len < 4096)
987                         len = 4096;
988                 req = kmalloc(len, GFP_KERNEL);
989                 if (req == NULL) {
990                         dprintk("RPC:       %s: request buffer %d alloc"
991                                 " failed\n", __func__, i);
992                         rc = -ENOMEM;
993                         goto out;
994                 }
995                 memset(req, 0, sizeof(struct rpcrdma_req));
996                 buf->rb_send_bufs[i] = req;
997                 buf->rb_send_bufs[i]->rl_buffer = buf;
998
999                 rc = rpcrdma_register_internal(ia, req->rl_base,
1000                                 len - offsetof(struct rpcrdma_req, rl_base),
1001                                 &buf->rb_send_bufs[i]->rl_handle,
1002                                 &buf->rb_send_bufs[i]->rl_iov);
1003                 if (rc)
1004                         goto out;
1005
1006                 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1007
1008                 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1009                 rep = kmalloc(len, GFP_KERNEL);
1010                 if (rep == NULL) {
1011                         dprintk("RPC:       %s: reply buffer %d alloc failed\n",
1012                                 __func__, i);
1013                         rc = -ENOMEM;
1014                         goto out;
1015                 }
1016                 memset(rep, 0, sizeof(struct rpcrdma_rep));
1017                 buf->rb_recv_bufs[i] = rep;
1018                 buf->rb_recv_bufs[i]->rr_buffer = buf;
1019                 init_waitqueue_head(&rep->rr_unbind);
1020
1021                 rc = rpcrdma_register_internal(ia, rep->rr_base,
1022                                 len - offsetof(struct rpcrdma_rep, rr_base),
1023                                 &buf->rb_recv_bufs[i]->rr_handle,
1024                                 &buf->rb_recv_bufs[i]->rr_iov);
1025                 if (rc)
1026                         goto out;
1027
1028         }
1029         dprintk("RPC:       %s: max_requests %d\n",
1030                 __func__, buf->rb_max_requests);
1031         /* done */
1032         return 0;
1033 out:
1034         rpcrdma_buffer_destroy(buf);
1035         return rc;
1036 }
1037
1038 /*
1039  * Unregister and destroy buffer memory. Need to deal with
1040  * partial initialization, so it's callable from failed create.
1041  * Must be called before destroying endpoint, as registrations
1042  * reference it.
1043  */
1044 void
1045 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1046 {
1047         int rc, i;
1048         struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1049
1050         /* clean up in reverse order from create
1051          *   1.  recv mr memory (mr free, then kfree)
1052          *   1a. bind mw memory
1053          *   2.  send mr memory (mr free, then kfree)
1054          *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
1055          *   4.  arrays
1056          */
1057         dprintk("RPC:       %s: entering\n", __func__);
1058
1059         for (i = 0; i < buf->rb_max_requests; i++) {
1060                 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1061                         rpcrdma_deregister_internal(ia,
1062                                         buf->rb_recv_bufs[i]->rr_handle,
1063                                         &buf->rb_recv_bufs[i]->rr_iov);
1064                         kfree(buf->rb_recv_bufs[i]);
1065                 }
1066                 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1067                         while (!list_empty(&buf->rb_mws)) {
1068                                 struct rpcrdma_mw *r;
1069                                 r = list_entry(buf->rb_mws.next,
1070                                         struct rpcrdma_mw, mw_list);
1071                                 list_del(&r->mw_list);
1072                                 switch (ia->ri_memreg_strategy) {
1073                                 case RPCRDMA_MTHCAFMR:
1074                                         rc = ib_dealloc_fmr(r->r.fmr);
1075                                         if (rc)
1076                                                 dprintk("RPC:       %s:"
1077                                                         " ib_dealloc_fmr"
1078                                                         " failed %i\n",
1079                                                         __func__, rc);
1080                                         break;
1081                                 case RPCRDMA_MEMWINDOWS_ASYNC:
1082                                 case RPCRDMA_MEMWINDOWS:
1083                                         rc = ib_dealloc_mw(r->r.mw);
1084                                         if (rc)
1085                                                 dprintk("RPC:       %s:"
1086                                                         " ib_dealloc_mw"
1087                                                         " failed %i\n",
1088                                                         __func__, rc);
1089                                         break;
1090                                 default:
1091                                         break;
1092                                 }
1093                         }
1094                         rpcrdma_deregister_internal(ia,
1095                                         buf->rb_send_bufs[i]->rl_handle,
1096                                         &buf->rb_send_bufs[i]->rl_iov);
1097                         kfree(buf->rb_send_bufs[i]);
1098                 }
1099         }
1100
1101         kfree(buf->rb_pool);
1102 }
1103
1104 /*
1105  * Get a set of request/reply buffers.
1106  *
1107  * Reply buffer (if needed) is attached to send buffer upon return.
1108  * Rule:
1109  *    rb_send_index and rb_recv_index MUST always be pointing to the
1110  *    *next* available buffer (non-NULL). They are incremented after
1111  *    removing buffers, and decremented *before* returning them.
1112  */
1113 struct rpcrdma_req *
1114 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1115 {
1116         struct rpcrdma_req *req;
1117         unsigned long flags;
1118
1119         spin_lock_irqsave(&buffers->rb_lock, flags);
1120         if (buffers->rb_send_index == buffers->rb_max_requests) {
1121                 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1122                 dprintk("RPC:       %s: out of request buffers\n", __func__);
1123                 return ((struct rpcrdma_req *)NULL);
1124         }
1125
1126         req = buffers->rb_send_bufs[buffers->rb_send_index];
1127         if (buffers->rb_send_index < buffers->rb_recv_index) {
1128                 dprintk("RPC:       %s: %d extra receives outstanding (ok)\n",
1129                         __func__,
1130                         buffers->rb_recv_index - buffers->rb_send_index);
1131                 req->rl_reply = NULL;
1132         } else {
1133                 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1134                 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1135         }
1136         buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1137         if (!list_empty(&buffers->rb_mws)) {
1138                 int i = RPCRDMA_MAX_SEGS - 1;
1139                 do {
1140                         struct rpcrdma_mw *r;
1141                         r = list_entry(buffers->rb_mws.next,
1142                                         struct rpcrdma_mw, mw_list);
1143                         list_del(&r->mw_list);
1144                         req->rl_segments[i].mr_chunk.rl_mw = r;
1145                 } while (--i >= 0);
1146         }
1147         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1148         return req;
1149 }
1150
1151 /*
1152  * Put request/reply buffers back into pool.
1153  * Pre-decrement counter/array index.
1154  */
1155 void
1156 rpcrdma_buffer_put(struct rpcrdma_req *req)
1157 {
1158         struct rpcrdma_buffer *buffers = req->rl_buffer;
1159         struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1160         int i;
1161         unsigned long flags;
1162
1163         BUG_ON(req->rl_nchunks != 0);
1164         spin_lock_irqsave(&buffers->rb_lock, flags);
1165         buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1166         req->rl_niovs = 0;
1167         if (req->rl_reply) {
1168                 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1169                 init_waitqueue_head(&req->rl_reply->rr_unbind);
1170                 req->rl_reply->rr_func = NULL;
1171                 req->rl_reply = NULL;
1172         }
1173         switch (ia->ri_memreg_strategy) {
1174         case RPCRDMA_MTHCAFMR:
1175         case RPCRDMA_MEMWINDOWS_ASYNC:
1176         case RPCRDMA_MEMWINDOWS:
1177                 /*
1178                  * Cycle mw's back in reverse order, and "spin" them.
1179                  * This delays and scrambles reuse as much as possible.
1180                  */
1181                 i = 1;
1182                 do {
1183                         struct rpcrdma_mw **mw;
1184                         mw = &req->rl_segments[i].mr_chunk.rl_mw;
1185                         list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1186                         *mw = NULL;
1187                 } while (++i < RPCRDMA_MAX_SEGS);
1188                 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1189                                         &buffers->rb_mws);
1190                 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1191                 break;
1192         default:
1193                 break;
1194         }
1195         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1196 }
1197
1198 /*
1199  * Recover reply buffers from pool.
1200  * This happens when recovering from error conditions.
1201  * Post-increment counter/array index.
1202  */
1203 void
1204 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1205 {
1206         struct rpcrdma_buffer *buffers = req->rl_buffer;
1207         unsigned long flags;
1208
1209         if (req->rl_iov.length == 0)    /* special case xprt_rdma_allocate() */
1210                 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1211         spin_lock_irqsave(&buffers->rb_lock, flags);
1212         if (buffers->rb_recv_index < buffers->rb_max_requests) {
1213                 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1214                 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1215         }
1216         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1217 }
1218
1219 /*
1220  * Put reply buffers back into pool when not attached to
1221  * request. This happens in error conditions, and when
1222  * aborting unbinds. Pre-decrement counter/array index.
1223  */
1224 void
1225 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1226 {
1227         struct rpcrdma_buffer *buffers = rep->rr_buffer;
1228         unsigned long flags;
1229
1230         rep->rr_func = NULL;
1231         spin_lock_irqsave(&buffers->rb_lock, flags);
1232         buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1233         spin_unlock_irqrestore(&buffers->rb_lock, flags);
1234 }
1235
1236 /*
1237  * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1238  */
1239
1240 int
1241 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1242                                 struct ib_mr **mrp, struct ib_sge *iov)
1243 {
1244         struct ib_phys_buf ipb;
1245         struct ib_mr *mr;
1246         int rc;
1247
1248         /*
1249          * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1250          */
1251         iov->addr = ib_dma_map_single(ia->ri_id->device,
1252                         va, len, DMA_BIDIRECTIONAL);
1253         iov->length = len;
1254
1255         if (ia->ri_bind_mem != NULL) {
1256                 *mrp = NULL;
1257                 iov->lkey = ia->ri_bind_mem->lkey;
1258                 return 0;
1259         }
1260
1261         ipb.addr = iov->addr;
1262         ipb.size = iov->length;
1263         mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1264                         IB_ACCESS_LOCAL_WRITE, &iov->addr);
1265
1266         dprintk("RPC:       %s: phys convert: 0x%llx "
1267                         "registered 0x%llx length %d\n",
1268                         __func__, (unsigned long long)ipb.addr,
1269                         (unsigned long long)iov->addr, len);
1270
1271         if (IS_ERR(mr)) {
1272                 *mrp = NULL;
1273                 rc = PTR_ERR(mr);
1274                 dprintk("RPC:       %s: failed with %i\n", __func__, rc);
1275         } else {
1276                 *mrp = mr;
1277                 iov->lkey = mr->lkey;
1278                 rc = 0;
1279         }
1280
1281         return rc;
1282 }
1283
1284 int
1285 rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1286                                 struct ib_mr *mr, struct ib_sge *iov)
1287 {
1288         int rc;
1289
1290         ib_dma_unmap_single(ia->ri_id->device,
1291                         iov->addr, iov->length, DMA_BIDIRECTIONAL);
1292
1293         if (NULL == mr)
1294                 return 0;
1295
1296         rc = ib_dereg_mr(mr);
1297         if (rc)
1298                 dprintk("RPC:       %s: ib_dereg_mr failed %i\n", __func__, rc);
1299         return rc;
1300 }
1301
1302 /*
1303  * Wrappers for chunk registration, shared by read/write chunk code.
1304  */
1305
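/*
 * DMA-map one segment, using the page or virtual-address form as
 * appropriate. The "writing" flag selects the DMA direction.
 */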
1306 static void
1307 rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1308 {
1309         seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1310         seg->mr_dmalen = seg->mr_len;
1311         if (seg->mr_page)
1312                 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1313                                 seg->mr_page, offset_in_page(seg->mr_offset),
1314                                 seg->mr_dmalen, seg->mr_dir);
1315         else
1316                 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1317                                 seg->mr_offset,
1318                                 seg->mr_dmalen, seg->mr_dir);
1319 }
1320
1321 static void
1322 rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1323 {
1324         if (seg->mr_page)
1325                 ib_dma_unmap_page(ia->ri_id->device,
1326                                 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1327         else
1328                 ib_dma_unmap_single(ia->ri_id->device,
1329                                 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1330 }
1331
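/*
 * Register a chunk of up to nsegs segments for remote access, using
 * the configured memory registration strategy (ia->ri_memreg_strategy).
 * Returns the number of segments covered by the registration, or -1
 * on error.
 */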
1332 int
1333 rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1334                         int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1335 {
1336         struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1337         int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1338                                   IB_ACCESS_REMOTE_READ);
1339         struct rpcrdma_mr_seg *seg1 = seg;
1340         int i;
1341         int rc = 0;
1342
1343         switch (ia->ri_memreg_strategy) {
1344
1345 #if RPCRDMA_PERSISTENT_REGISTRATION
1346         case RPCRDMA_ALLPHYSICAL:
1347                 rpcrdma_map_one(ia, seg, writing);
1348                 seg->mr_rkey = ia->ri_bind_mem->rkey;
1349                 seg->mr_base = seg->mr_dma;
1350                 seg->mr_nsegs = 1;
1351                 nsegs = 1;
1352                 break;
1353 #endif
1354
1355         /* Registration using fast memory registration */
1356         case RPCRDMA_MTHCAFMR:
1357                 {
1358                 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1359                 int len, pageoff = offset_in_page(seg->mr_offset);
1360                 seg1->mr_offset -= pageoff;     /* start of page */
1361                 seg1->mr_len += pageoff;
1362                 len = -pageoff;
1363                 if (nsegs > RPCRDMA_MAX_DATA_SEGS)
1364                         nsegs = RPCRDMA_MAX_DATA_SEGS;
1365                 for (i = 0; i < nsegs;) {
1366                         rpcrdma_map_one(ia, seg, writing);
1367                         physaddrs[i] = seg->mr_dma;
1368                         len += seg->mr_len;
1369                         ++seg;
1370                         ++i;
1371                         /* Check for holes */
1372                         if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
1373                             offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1374                                 break;
1375                 }
1376                 nsegs = i;
1377                 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1378                                         physaddrs, nsegs, seg1->mr_dma);
1379                 if (rc) {
1380                         dprintk("RPC:       %s: failed ib_map_phys_fmr "
1381                                 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1382                                 len, (unsigned long long)seg1->mr_dma,
1383                                 pageoff, nsegs, rc);
1384                         while (nsegs--)
1385                                 rpcrdma_unmap_one(ia, --seg);
1386                 } else {
1387                         seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1388                         seg1->mr_base = seg1->mr_dma + pageoff;
1389                         seg1->mr_nsegs = nsegs;
1390                         seg1->mr_len = len;
1391                 }
1392                 }
1393                 break;
1394
1395         /* Registration using memory windows */
1396         case RPCRDMA_MEMWINDOWS_ASYNC:
1397         case RPCRDMA_MEMWINDOWS:
1398                 {
1399                 struct ib_mw_bind param;
1400                 rpcrdma_map_one(ia, seg, writing);
1401                 param.mr = ia->ri_bind_mem;
1402                 param.wr_id = 0ULL;     /* no send cookie */
1403                 param.addr = seg->mr_dma;
1404                 param.length = seg->mr_len;
1405                 param.send_flags = 0;
1406                 param.mw_access_flags = mem_priv;
1407
1408                 DECR_CQCOUNT(&r_xprt->rx_ep);
1409                 rc = ib_bind_mw(ia->ri_id->qp,
1410                                         seg->mr_chunk.rl_mw->r.mw, &param);
1411                 if (rc) {
1412                         dprintk("RPC:       %s: failed ib_bind_mw "
1413                                 "%u@0x%llx status %i\n",
1414                                 __func__, seg->mr_len,
1415                                 (unsigned long long)seg->mr_dma, rc);
1416                         rpcrdma_unmap_one(ia, seg);
1417                 } else {
1418                         seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1419                         seg->mr_base = param.addr;
1420                         seg->mr_nsegs = 1;
1421                         nsegs = 1;
1422                 }
1423                 }
1424                 break;
1425
1426         /* Default registration each time */
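             /*
              * Fallback: build an ib_phys_buf list for a page-contiguous
              * run of segments and register it with ib_reg_phys_mr().
              * This is a synchronous verb, so the registration cost is
              * paid on every request.
              */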
1427         default:
1428                 {
1429                 struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1430                 int len = 0;
1431                 if (nsegs > RPCRDMA_MAX_DATA_SEGS)
1432                         nsegs = RPCRDMA_MAX_DATA_SEGS;
1433                 for (i = 0; i < nsegs;) {
1434                         rpcrdma_map_one(ia, seg, writing);
1435                         ipb[i].addr = seg->mr_dma;
1436                         ipb[i].size = seg->mr_len;
1437                         len += seg->mr_len;
1438                         ++seg;
1439                         ++i;
1440                         /* Check for holes */
1441                         if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
1442                             offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1443                                 break;
1444                 }
1445                 nsegs = i;
1446                 seg1->mr_base = seg1->mr_dma;
1447                 seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1448                                         ipb, nsegs, mem_priv, &seg1->mr_base);
1449                 if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1450                         rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1451                         dprintk("RPC:       %s: failed ib_reg_phys_mr "
1452                                 "%u@0x%llx (%d)... status %i\n",
1453                                 __func__, len,
1454                                 (unsigned long long)seg1->mr_dma, nsegs, rc);
1455                         while (nsegs--)
1456                                 rpcrdma_unmap_one(ia, --seg);
1457                 } else {
1458                         seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1459                         seg1->mr_nsegs = nsegs;
1460                         seg1->mr_len = len;
1461                 }
1462                 }
1463                 break;
1464         }
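             /*
              * On success each strategy set nsegs to the number of
              * segments actually registered (possibly fewer than were
              * passed in).  A mapping failure has already been logged
              * and unwound above, so it is collapsed to a bare -1 here.
              */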
1465         if (rc)
1466                 return -1;
1467
1468         return nsegs;
1469 }
1470
1471 int
1472 rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1473                 struct rpcrdma_xprt *r_xprt, void *r)
1474 {
1475         struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1476         struct rpcrdma_mr_seg *seg1 = seg;
1477         int nsegs = seg->mr_nsegs, rc;
1478
1479         switch (ia->ri_memreg_strategy) {
1480
1481 #if RPCRDMA_PERSISTENT_REGISTRATION
1482         case RPCRDMA_ALLPHYSICAL:
1483                 BUG_ON(nsegs != 1);
1484                 rpcrdma_unmap_one(ia, seg);
1485                 rc = 0;
1486                 break;
1487 #endif
1488
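             /*
              * FMR: ib_unmap_fmr() takes a list head, so several FMRs
              * could in principle be unmapped in one call; here only
              * this request's FMR is queued before the segments are
              * dma-unmapped.
              */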
1489         case RPCRDMA_MTHCAFMR:
1490                 {
1491                 LIST_HEAD(l);
1492                 list_add(&seg->mr_chunk.rl_mw->r.fmr->list, &l);
1493                 rc = ib_unmap_fmr(&l);
1494                 while (seg1->mr_nsegs--)
1495                         rpcrdma_unmap_one(ia, seg++);
1496                 }
1497                 if (rc)
1498                         dprintk("RPC:       %s: failed ib_unmap_fmr,"
1499                                 " status %i\n", __func__, rc);
1500                 break;
1501
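             /*
              * A memory window is "unbound" by re-binding it with zero
              * length.  When a reply context (r) is supplied, the unbind
              * is posted signaled and the reply callback is deferred to
              * the completion handler; otherwise it is left unsignaled.
              */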
1502         case RPCRDMA_MEMWINDOWS_ASYNC:
1503         case RPCRDMA_MEMWINDOWS:
1504                 {
1505                 struct ib_mw_bind param;
1506                 BUG_ON(nsegs != 1);
1507                 param.mr = ia->ri_bind_mem;
1508                 param.addr = 0ULL;      /* unbind */
1509                 param.length = 0;
1510                 param.mw_access_flags = 0;
1511                 if (r) {
1512                         param.wr_id = (u64) (unsigned long) r;
1513                         param.send_flags = IB_SEND_SIGNALED;
1514                         INIT_CQCOUNT(&r_xprt->rx_ep);
1515                 } else {
1516                         param.wr_id = 0ULL;
1517                         param.send_flags = 0;
1518                         DECR_CQCOUNT(&r_xprt->rx_ep);
1519                 }
1520                 rc = ib_bind_mw(ia->ri_id->qp,
1521                                 seg->mr_chunk.rl_mw->r.mw, &param);
1522                 rpcrdma_unmap_one(ia, seg);
1523                 }
1524                 if (rc)
1525                         dprintk("RPC:       %s: failed ib_(un)bind_mw,"
1526                                 " status %i\n", __func__, rc);
1527                 else
1528                         r = NULL;       /* will upcall on completion */
1529                 break;
1530
1531         default:
1532                 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1533                 seg1->mr_chunk.rl_mr = NULL;
1534                 while (seg1->mr_nsegs--)
1535                         rpcrdma_unmap_one(ia, seg++);
1536                 if (rc)
1537                         dprintk("RPC:       %s: failed ib_dereg_mr,"
1538                                 " status %i\n", __func__, rc);
1539                 break;
1540         }
1541         if (r) {
1542                 struct rpcrdma_rep *rep = r;
1543                 void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1544                 rep->rr_func = NULL;
1545                 func(rep);      /* dereg done, callback now */
1546         }
1547         return nsegs;
1548 }
1549
1550 /*
1551  * Prepost any receive buffer, then post send.
1552  *
1553  * Receive buffer is donated to hardware, reclaimed upon recv completion.
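      *
      * The receive is posted first so that a reply arriving right after
      * the send already finds a buffer waiting on the receive queue.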
1554  */
1555 int
1556 rpcrdma_ep_post(struct rpcrdma_ia *ia,
1557                 struct rpcrdma_ep *ep,
1558                 struct rpcrdma_req *req)
1559 {
1560         struct ib_send_wr send_wr, *send_wr_fail;
1561         struct rpcrdma_rep *rep = req->rl_reply;
1562         int rc;
1563
1564         if (rep) {
1565                 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1566                 if (rc)
1567                         goto out;
1568                 req->rl_reply = NULL;
1569         }
1570
1571         send_wr.next = NULL;
1572         send_wr.wr_id = 0ULL;   /* no send cookie */
1573         send_wr.sg_list = req->rl_send_iov;
1574         send_wr.num_sge = req->rl_niovs;
1575         send_wr.opcode = IB_WR_SEND;
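             /*
              * The sges below were filled in by the CPU, so sync them for
              * device access before posting: [0], [1] and, when present,
              * [3].  sge[2] is skipped because it holds the constant pad.
              */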
1576         if (send_wr.num_sge == 4)       /* sync sge[3]; the pad at sge[2] is constant */
1577                 ib_dma_sync_single_for_device(ia->ri_id->device,
1578                         req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1579                         DMA_TO_DEVICE);
1580         ib_dma_sync_single_for_device(ia->ri_id->device,
1581                 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1582                 DMA_TO_DEVICE);
1583         ib_dma_sync_single_for_device(ia->ri_id->device,
1584                 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1585                 DMA_TO_DEVICE);
1586
1587         if (DECR_CQCOUNT(ep) > 0)
1588                 send_wr.send_flags = 0;
1589         else { /* Provider must take a send completion every now and then */
1590                 INIT_CQCOUNT(ep);
1591                 send_wr.send_flags = IB_SEND_SIGNALED;
1592         }
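             /*
              * Sends are normally posted unsignaled; when the countdown
              * expires, one signaled send is issued so the provider can
              * report a completion and retire the accumulated work
              * requests.
              */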
1593
1594         rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1595         if (rc)
1596                 dprintk("RPC:       %s: ib_post_send returned %i\n", __func__,
1597                         rc);
1598 out:
1599         return rc;
1600 }
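     /*
      * Illustrative only -- a minimal sketch, not part of this file, of
      * how a caller holding an rpcrdma_xprt might push a marshaled
      * request out using the fields referenced above (rx_ia, rx_ep);
      * "drop_connection" is a hypothetical error label:
      *
      *      if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
      *              goto drop_connection;
      */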
1601
1602 /*
1603  * (Re)post a receive buffer.
1604  */
1605 int
1606 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1607                      struct rpcrdma_ep *ep,
1608                      struct rpcrdma_rep *rep)
1609 {
1610         struct ib_recv_wr recv_wr, *recv_wr_fail;
1611         int rc;
1612
1613         recv_wr.next = NULL;
1614         recv_wr.wr_id = (u64) (unsigned long) rep;
1615         recv_wr.sg_list = &rep->rr_iov;
1616         recv_wr.num_sge = 1;
1617
1618         ib_dma_sync_single_for_cpu(ia->ri_id->device,
1619                 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1620
1621         DECR_CQCOUNT(ep);
1622         rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1623
1624         if (rc)
1625                 dprintk("RPC:       %s: ib_post_recv returned %i\n", __func__,
1626                         rc);
1627         return rc;
1628 }