IB/ipath: Fix lost UD send work request
diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c
index bcaa291..af051f7 100644
--- a/drivers/infiniband/hw/ipath/ipath_ruc.c
+++ b/drivers/infiniband/hw/ipath/ipath_ruc.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -78,6 +78,7 @@ const u32 ib_ipath_rnr_table[32] = {
  * ipath_insert_rnr_queue - put QP on the RNR timeout list for the device
  * @qp: the QP
  *
+ * Called with the QP s_lock held and interrupts disabled.
  * XXX Use a simple list for now.  We might need a priority
  * queue if we have lots of QPs waiting for RNR timeouts
  * but that should be rare.
@@ -85,9 +86,9 @@ const u32 ib_ipath_rnr_table[32] = {
 void ipath_insert_rnr_queue(struct ipath_qp *qp)
 {
        struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-       unsigned long flags;
 
-       spin_lock_irqsave(&dev->pending_lock, flags);
+       /* We already did a spin_lock_irqsave(), so just use spin_lock */
+       spin_lock(&dev->pending_lock);
        if (list_empty(&dev->rnrwait))
                list_add(&qp->timerwait, &dev->rnrwait);
        else {
@@ -109,7 +110,7 @@ void ipath_insert_rnr_queue(struct ipath_qp *qp)
                        nqp->s_rnr_timeout -= qp->s_rnr_timeout;
                list_add(&qp->timerwait, l);
        }
-       spin_unlock_irqrestore(&dev->pending_lock, flags);
+       spin_unlock(&dev->pending_lock);
 }
 
 /**
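
ipath_insert_rnr_queue() is now entered with the QP's s_lock held and interrupts disabled, so the device's pending_lock only needs a plain spin_lock() rather than its own irqsave. A minimal sketch of that nesting (the helper name below is illustrative, not part of the driver; the sorted-insert case from the function above is omitted):

	static void queue_on_rnr_list(struct ipath_qp *qp, struct ipath_ibdev *dev)
	{
		/* caller holds qp->s_lock via spin_lock_irqsave(); IRQs are off */
		spin_lock(&dev->pending_lock);
		if (list_empty(&dev->rnrwait))
			list_add(&qp->timerwait, &dev->rnrwait);
		spin_unlock(&dev->pending_lock);
	}

Taking the inner lock without re-disabling interrupts is safe only because every caller honors the locking rule documented in the comment above.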
@@ -140,20 +141,11 @@ int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
        goto bail;
 
 bad_lkey:
+       memset(&wc, 0, sizeof(wc));
        wc.wr_id = wqe->wr_id;
        wc.status = IB_WC_LOC_PROT_ERR;
        wc.opcode = IB_WC_RECV;
-       wc.vendor_err = 0;
-       wc.byte_len = 0;
-       wc.imm_data = 0;
        wc.qp = &qp->ibqp;
-       wc.src_qp = 0;
-       wc.wc_flags = 0;
-       wc.pkey_index = 0;
-       wc.slid = 0;
-       wc.sl = 0;
-       wc.dlid_path_bits = 0;
-       wc.port_num = 0;
        /* Signal solicited completion event. */
        ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
        ret = 0;
@@ -194,6 +186,11 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
        }
 
        spin_lock_irqsave(&rq->lock, flags);
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+               ret = 0;
+               goto unlock;
+       }
+
        wq = rq->wq;
        tail = wq->tail;
        /* Validate tail before using it since it is user writable. */
@@ -201,9 +198,8 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
                tail = 0;
        do {
                if (unlikely(tail == wq->head)) {
-                       spin_unlock_irqrestore(&rq->lock, flags);
                        ret = 0;
-                       goto bail;
+                       goto unlock;
                }
                /* Make sure entry is read after head index is read. */
                smp_rmb();
@@ -216,7 +212,7 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
        wq->tail = tail;
 
        ret = 1;
-       qp->r_wrid_valid = 1;
+       set_bit(IPATH_R_WRID_VALID, &qp->r_aflags);
        if (handler) {
                u32 n;
 
@@ -243,8 +239,8 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
                        goto bail;
                }
        }
+unlock:
        spin_unlock_irqrestore(&rq->lock, flags);
-
 bail:
        return ret;
 }
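
The per-QP r_wrid_valid flag is replaced by an atomic bit, IPATH_R_WRID_VALID in qp->r_aflags, so the receive side can publish a reserved WQE and the send/loopback side can consume it exactly once without holding the receive-queue lock. A sketch of the handoff as the two ends of the patch use it:

	/* receiver (ipath_get_rwqe): a receive WQE was claimed */
	set_bit(IPATH_R_WRID_VALID, &qp->r_aflags);

	/* sender (ipath_ruc_loopback): complete it at most once */
	if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
		goto send_comp;	/* no receive WQE was consumed (RDMA read/write) */
	/* otherwise build the IB_WC_RECV completion for qp->r_wr_id */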
@@ -270,38 +266,63 @@ static void ipath_ruc_loopback(struct ipath_qp *sqp)
        struct ib_wc wc;
        u64 sdata;
        atomic64_t *maddr;
+       enum ib_wc_status send_status;
 
+       /*
+        * Note that we check the responder QP state after
+        * checking the requester's state.
+        */
        qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn);
-       if (!qp) {
-               dev->n_pkt_drops++;
-               return;
-       }
 
-again:
        spin_lock_irqsave(&sqp->s_lock, flags);
 
-       if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_SEND_OK) ||
-           sqp->s_rnr_timeout) {
-               spin_unlock_irqrestore(&sqp->s_lock, flags);
-               goto done;
-       }
+       /* Return if we are already busy processing a work request. */
+       if ((sqp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
+           !(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
+               goto unlock;
 
-       /* Get the next send request. */
-       if (sqp->s_last == sqp->s_head) {
-               /* Send work queue is empty. */
-               spin_unlock_irqrestore(&sqp->s_lock, flags);
-               goto done;
+       sqp->s_flags |= IPATH_S_BUSY;
+
+again:
+       if (sqp->s_last == sqp->s_head)
+               goto clr_busy;
+       wqe = get_swqe_ptr(sqp, sqp->s_last);
+
+       /* Return if it is not OK to start a new work request. */
+       if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
+               if (!(ib_ipath_state_ops[sqp->state] & IPATH_FLUSH_SEND))
+                       goto clr_busy;
+               /* We are in the error state, flush the work request. */
+               send_status = IB_WC_WR_FLUSH_ERR;
+               goto flush_send;
        }
 
        /*
         * We can rely on the entry not changing without the s_lock
         * being held until we update s_last.
+        * We increment s_cur to indicate s_last is in progress.
         */
-       wqe = get_swqe_ptr(sqp, sqp->s_last);
+       if (sqp->s_last == sqp->s_cur) {
+               if (++sqp->s_cur >= sqp->s_size)
+                       sqp->s_cur = 0;
+       }
        spin_unlock_irqrestore(&sqp->s_lock, flags);
 
-       wc.wc_flags = 0;
-       wc.imm_data = 0;
+       if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+               dev->n_pkt_drops++;
+               /*
+                * For RC, the requester would timeout and retry so
+                * shortcut the timeouts and just signal too many retries.
+                */
+               if (sqp->ibqp.qp_type == IB_QPT_RC)
+                       send_status = IB_WC_RETRY_EXC_ERR;
+               else
+                       send_status = IB_WC_SUCCESS;
+               goto serr;
+       }
+
+       memset(&wc, 0, sizeof wc);
+       send_status = IB_WC_SUCCESS;
 
        sqp->s_sge.sge = wqe->sg_list[0];
        sqp->s_sge.sg_list = wqe->sg_list + 1;
@@ -310,78 +331,36 @@ again:
        switch (wqe->wr.opcode) {
        case IB_WR_SEND_WITH_IMM:
                wc.wc_flags = IB_WC_WITH_IMM;
-               wc.imm_data = wqe->wr.imm_data;
+               wc.ex.imm_data = wqe->wr.ex.imm_data;
                /* FALLTHROUGH */
        case IB_WR_SEND:
-               if (!ipath_get_rwqe(qp, 0)) {
-               rnr_nak:
-                       /* Handle RNR NAK */
-                       if (qp->ibqp.qp_type == IB_QPT_UC)
-                               goto send_comp;
-                       if (sqp->s_rnr_retry == 0) {
-                               wc.status = IB_WC_RNR_RETRY_EXC_ERR;
-                               goto err;
-                       }
-                       if (sqp->s_rnr_retry_cnt < 7)
-                               sqp->s_rnr_retry--;
-                       dev->n_rnr_naks++;
-                       sqp->s_rnr_timeout =
-                               ib_ipath_rnr_table[qp->r_min_rnr_timer];
-                       ipath_insert_rnr_queue(sqp);
-                       goto done;
-               }
+               if (!ipath_get_rwqe(qp, 0))
+                       goto rnr_nak;
                break;
 
        case IB_WR_RDMA_WRITE_WITH_IMM:
-               if (unlikely(!(qp->qp_access_flags &
-                              IB_ACCESS_REMOTE_WRITE))) {
-                       wc.status = IB_WC_REM_INV_REQ_ERR;
-                       goto err;
-               }
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+                       goto inv_err;
                wc.wc_flags = IB_WC_WITH_IMM;
-               wc.imm_data = wqe->wr.imm_data;
+               wc.ex.imm_data = wqe->wr.ex.imm_data;
                if (!ipath_get_rwqe(qp, 1))
                        goto rnr_nak;
                /* FALLTHROUGH */
        case IB_WR_RDMA_WRITE:
-               if (unlikely(!(qp->qp_access_flags &
-                              IB_ACCESS_REMOTE_WRITE))) {
-                       wc.status = IB_WC_REM_INV_REQ_ERR;
-                       goto err;
-               }
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+                       goto inv_err;
                if (wqe->length == 0)
                        break;
                if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, wqe->length,
                                            wqe->wr.wr.rdma.remote_addr,
                                            wqe->wr.wr.rdma.rkey,
-                                           IB_ACCESS_REMOTE_WRITE))) {
-               acc_err:
-                       wc.status = IB_WC_REM_ACCESS_ERR;
-               err:
-                       wc.wr_id = wqe->wr.wr_id;
-                       wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
-                       wc.vendor_err = 0;
-                       wc.byte_len = 0;
-                       wc.qp = &sqp->ibqp;
-                       wc.src_qp = sqp->remote_qpn;
-                       wc.pkey_index = 0;
-                       wc.slid = sqp->remote_ah_attr.dlid;
-                       wc.sl = sqp->remote_ah_attr.sl;
-                       wc.dlid_path_bits = 0;
-                       wc.port_num = 0;
-                       spin_lock_irqsave(&sqp->s_lock, flags);
-                       ipath_sqerror_qp(sqp, &wc);
-                       spin_unlock_irqrestore(&sqp->s_lock, flags);
-                       goto done;
-               }
+                                           IB_ACCESS_REMOTE_WRITE)))
+                       goto acc_err;
                break;
 
        case IB_WR_RDMA_READ:
-               if (unlikely(!(qp->qp_access_flags &
-                              IB_ACCESS_REMOTE_READ))) {
-                       wc.status = IB_WC_REM_INV_REQ_ERR;
-                       goto err;
-               }
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+                       goto inv_err;
                if (unlikely(!ipath_rkey_ok(qp, &sqp->s_sge, wqe->length,
                                            wqe->wr.wr.rdma.remote_addr,
                                            wqe->wr.wr.rdma.rkey,
@@ -394,11 +373,8 @@ again:
 
        case IB_WR_ATOMIC_CMP_AND_SWP:
        case IB_WR_ATOMIC_FETCH_AND_ADD:
-               if (unlikely(!(qp->qp_access_flags &
-                              IB_ACCESS_REMOTE_ATOMIC))) {
-                       wc.status = IB_WC_REM_INV_REQ_ERR;
-                       goto err;
-               }
+               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+                       goto inv_err;
                if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64),
                                            wqe->wr.wr.atomic.remote_addr,
                                            wqe->wr.wr.atomic.rkey,
@@ -415,7 +391,8 @@ again:
                goto send_comp;
 
        default:
-               goto done;
+               send_status = IB_WC_LOC_QP_OP_ERR;
+               goto serr;
        }
 
        sge = &sqp->s_sge.sge;
@@ -448,8 +425,7 @@ again:
                sqp->s_len -= len;
        }
 
-       if (wqe->wr.opcode == IB_WR_RDMA_WRITE ||
-           wqe->wr.opcode == IB_WR_RDMA_READ)
+       if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
                goto send_comp;
 
        if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
@@ -458,32 +434,89 @@ again:
                wc.opcode = IB_WC_RECV;
        wc.wr_id = qp->r_wr_id;
        wc.status = IB_WC_SUCCESS;
-       wc.vendor_err = 0;
        wc.byte_len = wqe->length;
        wc.qp = &qp->ibqp;
        wc.src_qp = qp->remote_qpn;
-       wc.pkey_index = 0;
        wc.slid = qp->remote_ah_attr.dlid;
        wc.sl = qp->remote_ah_attr.sl;
-       wc.dlid_path_bits = 0;
        wc.port_num = 1;
        /* Signal completion event if the solicited bit is set. */
        ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
                       wqe->wr.send_flags & IB_SEND_SOLICITED);
 
 send_comp:
+       spin_lock_irqsave(&sqp->s_lock, flags);
+flush_send:
        sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
-       ipath_send_complete(sqp, wqe, IB_WC_SUCCESS);
+       ipath_send_complete(sqp, wqe, send_status);
        goto again;
 
+rnr_nak:
+       /* Handle RNR NAK */
+       if (qp->ibqp.qp_type == IB_QPT_UC)
+               goto send_comp;
+       /*
+        * Note: we don't need the s_lock held since the BUSY flag
+        * makes this single threaded.
+        */
+       if (sqp->s_rnr_retry == 0) {
+               send_status = IB_WC_RNR_RETRY_EXC_ERR;
+               goto serr;
+       }
+       if (sqp->s_rnr_retry_cnt < 7)
+               sqp->s_rnr_retry--;
+       spin_lock_irqsave(&sqp->s_lock, flags);
+       if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_RECV_OK))
+               goto clr_busy;
+       sqp->s_flags |= IPATH_S_WAITING;
+       dev->n_rnr_naks++;
+       sqp->s_rnr_timeout = ib_ipath_rnr_table[qp->r_min_rnr_timer];
+       ipath_insert_rnr_queue(sqp);
+       goto clr_busy;
+
+inv_err:
+       send_status = IB_WC_REM_INV_REQ_ERR;
+       wc.status = IB_WC_LOC_QP_OP_ERR;
+       goto err;
+
+acc_err:
+       send_status = IB_WC_REM_ACCESS_ERR;
+       wc.status = IB_WC_LOC_PROT_ERR;
+err:
+       /* responder goes to error state */
+       ipath_rc_error(qp, wc.status);
+
+serr:
+       spin_lock_irqsave(&sqp->s_lock, flags);
+       ipath_send_complete(sqp, wqe, send_status);
+       if (sqp->ibqp.qp_type == IB_QPT_RC) {
+               int lastwqe = ipath_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
+
+               sqp->s_flags &= ~IPATH_S_BUSY;
+               spin_unlock_irqrestore(&sqp->s_lock, flags);
+               if (lastwqe) {
+                       struct ib_event ev;
+
+                       ev.device = sqp->ibqp.device;
+                       ev.element.qp = &sqp->ibqp;
+                       ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+                       sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
+               }
+               goto done;
+       }
+clr_busy:
+       sqp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+       spin_unlock_irqrestore(&sqp->s_lock, flags);
 done:
-       if (atomic_dec_and_test(&qp->refcount))
+       if (qp && atomic_dec_and_test(&qp->refcount))
                wake_up(&qp->wait);
 }
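
The loopback engine is now single threaded by the IPATH_S_BUSY flag: it is tested and set under sqp->s_lock on entry, and every exit path funnels through clr_busy/unlock to drop it again. Stripped to the guard itself, the pattern is roughly:

	spin_lock_irqsave(&sqp->s_lock, flags);
	if (sqp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) {
		spin_unlock_irqrestore(&sqp->s_lock, flags);
		return;				/* another context owns the send queue */
	}
	sqp->s_flags |= IPATH_S_BUSY;
	/* ... s_lock may be dropped and retaken while data is copied ... */
	sqp->s_flags &= ~IPATH_S_BUSY;		/* cleared on every exit path */
	spin_unlock_irqrestore(&sqp->s_lock, flags);

This is why the rnr_nak path can touch s_rnr_retry without the lock, as its comment notes, and why clearing BUSY on the error and flush paths is what keeps a work request from being silently stranded.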
 
-static void want_buffer(struct ipath_devdata *dd)
+static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp)
 {
-       if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA)) {
+       if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) ||
+           qp->ibqp.qp_type == IB_QPT_SMI) {
                unsigned long flags;
 
                spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
@@ -501,26 +534,36 @@ static void want_buffer(struct ipath_devdata *dd)
  * @dev: the device we ran out of buffers on
  *
  * Called when we run out of PIO buffers.
+ * If we are now in the error state, return zero to flush the
+ * send work request.
  */
-static void ipath_no_bufs_available(struct ipath_qp *qp,
+static int ipath_no_bufs_available(struct ipath_qp *qp,
                                    struct ipath_ibdev *dev)
 {
        unsigned long flags;
+       int ret = 1;
 
        /*
         * Note that as soon as want_buffer() is called and
         * possibly before it returns, ipath_ib_piobufavail()
-        * could be called.  If we are still in the tasklet function,
-        * tasklet_hi_schedule() will not call us until the next time
-        * tasklet_hi_schedule() is called.
-        * We leave the busy flag set so that another post send doesn't
-        * try to put the same QP on the piowait list again.
+        * could be called. Therefore, put QP on the piowait list before
+        * enabling the PIO avail interrupt.
         */
-       spin_lock_irqsave(&dev->pending_lock, flags);
-       list_add_tail(&qp->piowait, &dev->piowait);
-       spin_unlock_irqrestore(&dev->pending_lock, flags);
-       want_buffer(dev->dd);
-       dev->n_piowait++;
+       spin_lock_irqsave(&qp->s_lock, flags);
+       if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
+               dev->n_piowait++;
+               qp->s_flags |= IPATH_S_WAITING;
+               qp->s_flags &= ~IPATH_S_BUSY;
+               spin_lock(&dev->pending_lock);
+               if (list_empty(&qp->piowait))
+                       list_add_tail(&qp->piowait, &dev->piowait);
+               spin_unlock(&dev->pending_lock);
+       } else
+               ret = 0;
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       if (ret)
+               want_buffer(dev->dd, qp);
+       return ret;
 }
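
Two details carry the weight here: the QP is linked onto dev->piowait (with IPATH_S_WAITING set and IPATH_S_BUSY cleared) before want_buffer() enables the PIO-available interrupt, so a wakeup cannot race past an empty list; and the return value tells the caller whether to wait for buffers (nonzero) or fall through because the QP has left the send-OK state and the request must be flushed (zero). The caller side, roughly as the later hunk uses it:

	if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords,
			     qp->s_cur_sge, qp->s_cur_size)) {
		/* no PIO buffer was free */
		if (ipath_no_bufs_available(qp, dev))
			return;	/* queued; resumed from ipath_ib_piobufavail() */
		/* QP is in an error state: continue so the WQE gets flushed */
	}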
 
 /**
@@ -596,15 +639,13 @@ void ipath_do_send(unsigned long data)
        struct ipath_qp *qp = (struct ipath_qp *)data;
        struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
        int (*make_req)(struct ipath_qp *qp);
-
-       if (test_and_set_bit(IPATH_S_BUSY, &qp->s_busy))
-               goto bail;
+       unsigned long flags;
 
        if ((qp->ibqp.qp_type == IB_QPT_RC ||
             qp->ibqp.qp_type == IB_QPT_UC) &&
            qp->remote_ah_attr.dlid == dev->dd->ipath_lid) {
                ipath_ruc_loopback(qp);
-               goto clear;
+               goto bail;
        }
 
        if (qp->ibqp.qp_type == IB_QPT_RC)
@@ -614,6 +655,19 @@ void ipath_do_send(unsigned long data)
        else
               make_req = ipath_make_ud_req;
 
+       spin_lock_irqsave(&qp->s_lock, flags);
+
+       /* Return if we are already busy processing a work request. */
+       if ((qp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
+           !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) {
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+               goto bail;
+       }
+
+       qp->s_flags |= IPATH_S_BUSY;
+
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+
 again:
        /* Check for a constructed packet to be sent. */
        if (qp->s_hdrwords != 0) {
@@ -623,8 +677,8 @@ again:
                 */
                if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords,
                                     qp->s_cur_sge, qp->s_cur_size)) {
-                       ipath_no_bufs_available(qp, dev);
-                       goto bail;
+                       if (ipath_no_bufs_available(qp, dev))
+                               goto bail;
                }
                dev->n_unicast_xmit++;
                /* Record that we sent the packet and s_hdr is empty. */
@@ -633,16 +687,20 @@ again:
 
        if (make_req(qp))
                goto again;
-clear:
-       clear_bit(IPATH_S_BUSY, &qp->s_busy);
+
 bail:;
 }
 
+/*
+ * This should be called with s_lock held.
+ */
 void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
                         enum ib_wc_status status)
 {
-       unsigned long flags;
-       u32 last;
+       u32 old_last, last;
+
+       if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
+               return;
 
        /* See ch. 11.2.4.1 and 10.7.3.1 */
        if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
@@ -650,27 +708,25 @@ void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
            status != IB_WC_SUCCESS) {
                struct ib_wc wc;
 
+               memset(&wc, 0, sizeof wc);
                wc.wr_id = wqe->wr.wr_id;
                wc.status = status;
                wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
-               wc.vendor_err = 0;
-               wc.byte_len = wqe->length;
-               wc.imm_data = 0;
                wc.qp = &qp->ibqp;
-               wc.src_qp = 0;
-               wc.wc_flags = 0;
-               wc.pkey_index = 0;
-               wc.slid = 0;
-               wc.sl = 0;
-               wc.dlid_path_bits = 0;
-               wc.port_num = 0;
-               ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+               if (status == IB_WC_SUCCESS)
+                       wc.byte_len = wqe->length;
+               ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc,
+                              status != IB_WC_SUCCESS);
        }
 
-       spin_lock_irqsave(&qp->s_lock, flags);
-       last = qp->s_last;
+       old_last = last = qp->s_last;
        if (++last >= qp->s_size)
                last = 0;
        qp->s_last = last;
-       spin_unlock_irqrestore(&qp->s_lock, flags);
+       if (qp->s_cur == old_last)
+               qp->s_cur = last;
+       if (qp->s_tail == old_last)
+               qp->s_tail = last;
+       if (qp->state == IB_QPS_SQD && last == qp->s_cur)
+               qp->s_draining = 0;
 }