tree-wide: fix assorted typos all over the place
[safe/jmp/linux-2.6] / drivers / infiniband / hw / mlx4 / qp.c
index f5210c1..256a00c 100644 (file)
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -70,14 +71,17 @@ enum {
 };
 
 static const __be32 mlx4_ib_opcode[] = {
-       [IB_WR_SEND]                    = __constant_cpu_to_be32(MLX4_OPCODE_SEND),
-       [IB_WR_LSO]                     = __constant_cpu_to_be32(MLX4_OPCODE_LSO),
-       [IB_WR_SEND_WITH_IMM]           = __constant_cpu_to_be32(MLX4_OPCODE_SEND_IMM),
-       [IB_WR_RDMA_WRITE]              = __constant_cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
-       [IB_WR_RDMA_WRITE_WITH_IMM]     = __constant_cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
-       [IB_WR_RDMA_READ]               = __constant_cpu_to_be32(MLX4_OPCODE_RDMA_READ),
-       [IB_WR_ATOMIC_CMP_AND_SWP]      = __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
-       [IB_WR_ATOMIC_FETCH_AND_ADD]    = __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
+       [IB_WR_SEND]                    = cpu_to_be32(MLX4_OPCODE_SEND),
+       [IB_WR_LSO]                     = cpu_to_be32(MLX4_OPCODE_LSO),
+       [IB_WR_SEND_WITH_IMM]           = cpu_to_be32(MLX4_OPCODE_SEND_IMM),
+       [IB_WR_RDMA_WRITE]              = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
+       [IB_WR_RDMA_WRITE_WITH_IMM]     = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
+       [IB_WR_RDMA_READ]               = cpu_to_be32(MLX4_OPCODE_RDMA_READ),
+       [IB_WR_ATOMIC_CMP_AND_SWP]      = cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
+       [IB_WR_ATOMIC_FETCH_AND_ADD]    = cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
+       [IB_WR_SEND_WITH_INV]           = cpu_to_be32(MLX4_OPCODE_SEND_INVAL),
+       [IB_WR_LOCAL_INV]               = cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL),
+       [IB_WR_FAST_REG_MR]             = cpu_to_be32(MLX4_OPCODE_FMR),
 };
 
 static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
@@ -129,9 +133,10 @@ static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
        int ind;
        void *buf;
        __be32 stamp;
+       struct mlx4_wqe_ctrl_seg *ctrl;
 
-       s = roundup(size, 1U << qp->sq.wqe_shift);
        if (qp->sq_max_wqes_per_wr > 1) {
+               s = roundup(size, 1U << qp->sq.wqe_shift);
                for (i = 0; i < s; i += 64) {
                        ind = (i >> qp->sq.wqe_shift) + n;
                        stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
@@ -141,7 +146,8 @@ static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
                        *wqe = stamp;
                }
        } else {
-               buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+               ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+               s = (ctrl->fence_size & 0x3f) << 4;
                for (i = 64; i < s; i += 64) {
                        wqe = buf + i;
                        *wqe = cpu_to_be32(0xffffffff);
@@ -333,6 +339,9 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
                cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
                send_wqe_overhead(type, qp->flags);
 
+       if (s > dev->dev->caps.max_sq_desc_sz)
+               return -EINVAL;
+
        /*
         * Hermon supports shrinking WQEs, such that a single work
         * request can include multiple units of 1 << wqe_shift.  This
@@ -343,7 +352,7 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
         * anymore, so we do this only if selective signaling is off.
         *
         * Further, on 32-bit platforms, we can't use vmap() to make
-        * the QP buffer virtually contigious.  Thus we have to use
+        * the QP buffer virtually contiguous.  Thus we have to use
         * constant-sized WRs to make sure a WR is always fully within
         * a single page-sized chunk.
         *
@@ -372,9 +381,6 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
                qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
 
        for (;;) {
-               if (1 << qp->sq.wqe_shift > dev->dev->caps.max_sq_desc_sz)
-                       return -EINVAL;
-
                qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift);
 
                /*
@@ -395,7 +401,8 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
                ++qp->sq.wqe_shift;
        }
 
-       qp->sq.max_gs = ((qp->sq_max_wqes_per_wr << qp->sq.wqe_shift) -
+       qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz,
+                            (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) -
                         send_wqe_overhead(type, qp->flags)) /
                sizeof (struct mlx4_wqe_data_seg);
 
@@ -411,7 +418,9 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 
        cap->max_send_wr  = qp->sq.max_post =
                (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
-       cap->max_send_sge = qp->sq.max_gs;
+       cap->max_send_sge = min(qp->sq.max_gs,
+                               min(dev->dev->caps.max_sq_sg,
+                                   dev->dev->caps.max_rq_sg));
        /* We don't support inline sends for kernel QPs (yet) */
        cap->max_inline_data = 0;
 
@@ -442,6 +451,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
                            struct ib_qp_init_attr *init_attr,
                            struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp)
 {
+       int qpn;
        int err;
 
        mutex_init(&qp->mutex);
@@ -449,19 +459,8 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
        spin_lock_init(&qp->rq.lock);
 
        qp->state        = IB_QPS_RESET;
-       qp->atomic_rd_en = 0;
-       qp->resp_depth   = 0;
-
-       qp->rq.head         = 0;
-       qp->rq.tail         = 0;
-       qp->sq.head         = 0;
-       qp->sq.tail         = 0;
-       qp->sq_next_wqe     = 0;
-
        if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
                qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
-       else
-               qp->sq_signal_bits = 0;
 
        err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp);
        if (err)
@@ -482,7 +481,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
                        goto err;
 
                qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
-                                      qp->buf_size, 0);
+                                      qp->buf_size, 0, 0);
                if (IS_ERR(qp->umem)) {
                        err = PTR_ERR(qp->umem);
                        goto err;
@@ -506,6 +505,9 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
        } else {
                qp->sq_no_prefetch = 0;
 
+               if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
+                       qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
+
                if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
                        qp->flags |= MLX4_IB_QP_LSO;
 
@@ -514,7 +516,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
                        goto err;
 
                if (!init_attr->srq) {
-                       err = mlx4_ib_db_alloc(dev, &qp->db, 0);
+                       err = mlx4_db_alloc(dev->dev, &qp->db, 0);
                        if (err)
                                goto err;
 
@@ -544,9 +546,17 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
                }
        }
 
-       err = mlx4_qp_alloc(dev->dev, sqpn, &qp->mqp);
+       if (sqpn) {
+               qpn = sqpn;
+       } else {
+               err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn);
+               if (err)
+                       goto err_wrid;
+       }
+
+       err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
        if (err)
-               goto err_wrid;
+               goto err_qpn;
 
        /*
         * Hardware wants QPN written in big-endian order (after
@@ -559,6 +569,10 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 
        return 0;
 
+err_qpn:
+       if (!sqpn)
+               mlx4_qp_release_range(dev->dev, qpn, 1);
+
 err_wrid:
        if (pd->uobject) {
                if (!init_attr->srq)
@@ -580,7 +594,7 @@ err_buf:
 
 err_db:
        if (!pd->uobject && !init_attr->srq)
-               mlx4_ib_db_free(dev, &qp->db);
+               mlx4_db_free(dev->dev, &qp->db);
 
 err:
        return err;
@@ -601,10 +615,12 @@ static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state)
 }
 
 static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+       __acquires(&send_cq->lock) __acquires(&recv_cq->lock)
 {
-       if (send_cq == recv_cq)
+       if (send_cq == recv_cq) {
                spin_lock_irq(&send_cq->lock);
-       else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+               __acquire(&recv_cq->lock);
+       } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
                spin_lock_irq(&send_cq->lock);
                spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
        } else {
@@ -614,10 +630,12 @@ static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv
 }
 
 static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+       __releases(&send_cq->lock) __releases(&recv_cq->lock)
 {
-       if (send_cq == recv_cq)
+       if (send_cq == recv_cq) {
+               __release(&recv_cq->lock);
                spin_unlock_irq(&send_cq->lock);
-       else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+       } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
                spin_unlock(&recv_cq->lock);
                spin_unlock_irq(&send_cq->lock);
        } else {
@@ -654,6 +672,10 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
        mlx4_ib_unlock_cqs(send_cq, recv_cq);
 
        mlx4_qp_free(dev->dev, &qp->mqp);
+
+       if (!is_sqp(dev, qp))
+               mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
+
        mlx4_mtt_cleanup(dev->dev, &qp->mtt);
 
        if (is_user) {
@@ -666,7 +688,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
                kfree(qp->rq.wrid);
                mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
                if (!qp->ibqp.srq)
-                       mlx4_ib_db_free(dev, &qp->db);
+                       mlx4_db_free(dev->dev, &qp->db);
        }
 }
 
@@ -679,10 +701,15 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
        struct mlx4_ib_qp *qp;
        int err;
 
-       /* We only support LSO, and only for kernel UD QPs. */
-       if (init_attr->create_flags & ~IB_QP_CREATE_IPOIB_UD_LSO)
+       /*
+        * We only support LSO and multicast loopback blocking, and
+        * only for kernel UD QPs.
+        */
+       if (init_attr->create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO |
+                                       IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK))
                return ERR_PTR(-EINVAL);
-       if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO &&
+
+       if (init_attr->create_flags &&
            (pd->uobject || init_attr->qp_type != IB_QPT_UD))
                return ERR_PTR(-EINVAL);
 
@@ -691,7 +718,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
        case IB_QPT_UC:
        case IB_QPT_UD:
        {
-               qp = kmalloc(sizeof *qp, GFP_KERNEL);
+               qp = kzalloc(sizeof *qp, GFP_KERNEL);
                if (!qp)
                        return ERR_PTR(-ENOMEM);
 
@@ -712,7 +739,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
                if (pd->uobject)
                        return ERR_PTR(-EINVAL);
 
-               sqp = kmalloc(sizeof *sqp, GFP_KERNEL);
+               sqp = kzalloc(sizeof *sqp, GFP_KERNEL);
                if (!sqp)
                        return ERR_PTR(-ENOMEM);
 
@@ -896,14 +923,15 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
                        context->mtu_msgmax = (IB_MTU_4096 << 5) |
                                              ilog2(dev->dev->caps.max_gso_sz);
                else
-                       context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
+                       context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;
        } else if (attr_mask & IB_QP_PATH_MTU) {
                if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {
                        printk(KERN_ERR "path MTU (%u) is invalid\n",
                               attr->path_mtu);
                        goto out;
                }
-               context->mtu_msgmax = (attr->path_mtu << 5) | 31;
+               context->mtu_msgmax = (attr->path_mtu << 5) |
+                       ilog2(dev->dev->caps.max_msg_sz);
        }
 
        if (qp->rq.wqe_cnt)
@@ -973,6 +1001,10 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
        context->pd         = cpu_to_be32(to_mpd(ibqp->pd)->pdn);
        context->params1    = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);
 
+       /* Set "fast registration enabled" for all kernel QPs */
+       if (!qp->ibqp.uobject)
+               context->params1 |= cpu_to_be32(1 << 11);
+
        if (attr_mask & IB_QP_RNR_RETRY) {
                context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
                optpar |= MLX4_QP_OPTPAR_RNR_RETRY;
@@ -1047,6 +1079,9 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
        else
                sqd_event = 0;
 
+       if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+               context->rlkey |= (1 << 4);
+
        /*
         * Before passing a kernel QP to the HW, make sure that the
         * ownership bits of the send queue are set and the SQ
@@ -1060,6 +1095,8 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
                for (i = 0; i < qp->sq.wqe_cnt; ++i) {
                        ctrl = get_send_wqe(qp, i);
                        ctrl->owner_opcode = cpu_to_be32(1 << 31);
+                       if (qp->sq_max_wqes_per_wr == 1)
+                               ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
 
                        stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
                }
@@ -1124,23 +1161,6 @@ out:
        return err;
 }
 
-static const struct ib_qp_attr mlx4_ib_qp_attr = { .port_num = 1 };
-static const int mlx4_ib_qp_attr_mask_table[IB_QPT_UD + 1] = {
-               [IB_QPT_UD]  = (IB_QP_PKEY_INDEX                |
-                               IB_QP_PORT                      |
-                               IB_QP_QKEY),
-               [IB_QPT_UC]  = (IB_QP_PKEY_INDEX                |
-                               IB_QP_PORT                      |
-                               IB_QP_ACCESS_FLAGS),
-               [IB_QPT_RC]  = (IB_QP_PKEY_INDEX                |
-                               IB_QP_PORT                      |
-                               IB_QP_ACCESS_FLAGS),
-               [IB_QPT_SMI] = (IB_QP_PKEY_INDEX                |
-                               IB_QP_QKEY),
-               [IB_QPT_GSI] = (IB_QP_PKEY_INDEX                |
-                               IB_QP_QKEY),
-};
-
 int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
                      int attr_mask, struct ib_udata *udata)
 {
@@ -1183,15 +1203,6 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
                goto out;
        }
 
-       if (cur_state == IB_QPS_RESET && new_state == IB_QPS_ERR) {
-               err = __mlx4_ib_modify_qp(ibqp, &mlx4_ib_qp_attr,
-                                         mlx4_ib_qp_attr_mask_table[ibqp->qp_type],
-                                         IB_QPS_RESET, IB_QPS_INIT);
-               if (err)
-                       goto out;
-               cur_state = IB_QPS_INIT;
-       }
-
        err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
 
 out:
@@ -1249,7 +1260,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
        case IB_WR_SEND_WITH_IMM:
                sqp->ud_header.bth.opcode        = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
                sqp->ud_header.immediate_present = 1;
-               sqp->ud_header.immediate_data    = wr->imm_data;
+               sqp->ud_header.immediate_data    = wr->ex.imm_data;
                break;
        default:
                return -EINVAL;
@@ -1343,6 +1354,44 @@ static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq
        return cur + nreq >= wq->max_post;
 }
 
+static __be32 convert_access(int acc)
+{
+       return (acc & IB_ACCESS_REMOTE_ATOMIC ? cpu_to_be32(MLX4_WQE_FMR_PERM_ATOMIC)       : 0) |
+              (acc & IB_ACCESS_REMOTE_WRITE  ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_WRITE) : 0) |
+              (acc & IB_ACCESS_REMOTE_READ   ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_READ)  : 0) |
+              (acc & IB_ACCESS_LOCAL_WRITE   ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE)  : 0) |
+               cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ);
+}
+
+static void set_fmr_seg(struct mlx4_wqe_fmr_seg *fseg, struct ib_send_wr *wr)
+{
+       struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list);
+       int i;
+
+       for (i = 0; i < wr->wr.fast_reg.page_list_len; ++i)
+               mfrpl->mapped_page_list[i] =
+                       cpu_to_be64(wr->wr.fast_reg.page_list->page_list[i] |
+                                   MLX4_MTT_FLAG_PRESENT);
+
+       fseg->flags             = convert_access(wr->wr.fast_reg.access_flags);
+       fseg->mem_key           = cpu_to_be32(wr->wr.fast_reg.rkey);
+       fseg->buf_list          = cpu_to_be64(mfrpl->map);
+       fseg->start_addr        = cpu_to_be64(wr->wr.fast_reg.iova_start);
+       fseg->reg_len           = cpu_to_be64(wr->wr.fast_reg.length);
+       fseg->offset            = 0; /* XXX -- is this just for ZBVA? */
+       fseg->page_size         = cpu_to_be32(wr->wr.fast_reg.page_shift);
+       fseg->reserved[0]       = 0;
+       fseg->reserved[1]       = 0;
+}
+
+static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey)
+{
+       iseg->flags     = 0;
+       iseg->mem_key   = cpu_to_be32(rkey);
+       iseg->guest_id  = 0;
+       iseg->pa        = 0;
+}
+
 static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
                                          u64 remote_addr, u32 rkey)
 {
@@ -1416,8 +1465,9 @@ static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
        dseg->addr       = cpu_to_be64(sg->addr);
 }
 
-static int build_lso_seg(struct mlx4_lso_seg *wqe, struct ib_send_wr *wr,
-                        struct mlx4_ib_qp *qp, unsigned *lso_seg_len)
+static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr,
+                        struct mlx4_ib_qp *qp, unsigned *lso_seg_len,
+                        __be32 *lso_hdr_sz)
 {
        unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16);
 
@@ -1434,16 +1484,27 @@ static int build_lso_seg(struct mlx4_lso_seg *wqe, struct ib_send_wr *wr,
 
        memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen);
 
-       /* make sure LSO header is written before overwriting stamping */
-       wmb();
-
-       wqe->mss_hdr_size = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 |
-                                       wr->wr.ud.hlen);
-
+       *lso_hdr_sz  = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 |
+                                  wr->wr.ud.hlen);
        *lso_seg_len = halign;
        return 0;
 }
 
+static __be32 send_ieth(struct ib_send_wr *wr)
+{
+       switch (wr->opcode) {
+       case IB_WR_SEND_WITH_IMM:
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               return wr->ex.imm_data;
+
+       case IB_WR_SEND_WITH_INV:
+               return cpu_to_be32(wr->ex.invalidate_rkey);
+
+       default:
+               return 0;
+       }
+}
+
 int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                      struct ib_send_wr **bad_wr)
 {
@@ -1457,7 +1518,10 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
        unsigned ind;
        int uninitialized_var(stamp);
        int uninitialized_var(size);
-       unsigned seglen;
+       unsigned uninitialized_var(seglen);
+       __be32 dummy;
+       __be32 *lso_wqe;
+       __be32 uninitialized_var(lso_hdr_sz);
        int i;
 
        spin_lock_irqsave(&qp->sq.lock, flags);
@@ -1465,6 +1529,8 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
        ind = qp->sq_next_wqe;
 
        for (nreq = 0; wr; ++nreq, wr = wr->next) {
+               lso_wqe = &dummy;
+
                if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
                        err = -ENOMEM;
                        *bad_wr = wr;
@@ -1490,11 +1556,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                                     MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0) |
                        qp->sq_signal_bits;
 
-               if (wr->opcode == IB_WR_SEND_WITH_IMM ||
-                   wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
-                       ctrl->imm = wr->imm_data;
-               else
-                       ctrl->imm = 0;
+               ctrl->imm = send_ieth(wr);
 
                wqe += sizeof *ctrl;
                size = sizeof *ctrl / 16;
@@ -1526,6 +1588,22 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                                size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
                                break;
 
+                       case IB_WR_LOCAL_INV:
+                               ctrl->srcrb_flags |=
+                                       cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
+                               set_local_inv_seg(wqe, wr->ex.invalidate_rkey);
+                               wqe  += sizeof (struct mlx4_wqe_local_inval_seg);
+                               size += sizeof (struct mlx4_wqe_local_inval_seg) / 16;
+                               break;
+
+                       case IB_WR_FAST_REG_MR:
+                               ctrl->srcrb_flags |=
+                                       cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
+                               set_fmr_seg(wqe, wr);
+                               wqe  += sizeof (struct mlx4_wqe_fmr_seg);
+                               size += sizeof (struct mlx4_wqe_fmr_seg) / 16;
+                               break;
+
                        default:
                                /* No extra segments required for sends */
                                break;
@@ -1538,11 +1616,12 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                        size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
 
                        if (wr->opcode == IB_WR_LSO) {
-                               err = build_lso_seg(wqe, wr, qp, &seglen);
+                               err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz);
                                if (unlikely(err)) {
                                        *bad_wr = wr;
                                        goto out;
                                }
+                               lso_wqe = (__be32 *) wqe;
                                wqe  += seglen;
                                size += seglen / 16;
                        }
@@ -1584,6 +1663,14 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
                        set_data_seg(dseg, wr->sg_list + i);
 
+               /*
+                * Possibly overwrite stamping in cacheline with LSO
+                * segment only after making sure all data segments
+                * are written.
+                */
+               wmb();
+               *lso_wqe = lso_hdr_sz;
+
                ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
                                    MLX4_WQE_CTRL_FENCE : 0) | size;
 
@@ -1618,7 +1705,6 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                        stamp_send_wqe(qp, stamp, size * 16);
                        ind = pad_wraparound(qp, ind);
                }
-
        }
 
 out:
@@ -1785,7 +1871,9 @@ int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr
        struct mlx4_ib_qp *qp = to_mqp(ibqp);
        struct mlx4_qp_context context;
        int mlx4_state;
-       int err;
+       int err = 0;
+
+       mutex_lock(&qp->mutex);
 
        if (qp->state == IB_QPS_RESET) {
                qp_attr->qp_state = IB_QPS_RESET;
@@ -1793,12 +1881,15 @@ int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr
        }
 
        err = mlx4_qp_query(dev->dev, &qp->mqp, &context);
-       if (err)
-               return -EINVAL;
+       if (err) {
+               err = -EINVAL;
+               goto out;
+       }
 
        mlx4_state = be32_to_cpu(context.flags) >> 28;
 
-       qp_attr->qp_state            = to_ib_qp_state(mlx4_state);
+       qp->state                    = to_ib_qp_state(mlx4_state);
+       qp_attr->qp_state            = qp->state;
        qp_attr->path_mtu            = context.mtu_msgmax >> 5;
        qp_attr->path_mig_state      =
                to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3);
@@ -1857,6 +1948,15 @@ done:
 
        qp_init_attr->cap            = qp_attr->cap;
 
-       return 0;
+       qp_init_attr->create_flags = 0;
+       if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)
+               qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+       if (qp->flags & MLX4_IB_QP_LSO)
+               qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
+out:
+       mutex_unlock(&qp->mutex);
+       return err;
 }