/*
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
};
static const __be32 mlx4_ib_opcode[] = {
- [IB_WR_SEND] = __constant_cpu_to_be32(MLX4_OPCODE_SEND),
- [IB_WR_LSO] = __constant_cpu_to_be32(MLX4_OPCODE_LSO),
- [IB_WR_SEND_WITH_IMM] = __constant_cpu_to_be32(MLX4_OPCODE_SEND_IMM),
- [IB_WR_RDMA_WRITE] = __constant_cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
- [IB_WR_RDMA_WRITE_WITH_IMM] = __constant_cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
- [IB_WR_RDMA_READ] = __constant_cpu_to_be32(MLX4_OPCODE_RDMA_READ),
- [IB_WR_ATOMIC_CMP_AND_SWP] = __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
- [IB_WR_ATOMIC_FETCH_AND_ADD] = __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
+ [IB_WR_SEND] = cpu_to_be32(MLX4_OPCODE_SEND),
+ [IB_WR_LSO] = cpu_to_be32(MLX4_OPCODE_LSO),
+ [IB_WR_SEND_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_SEND_IMM),
+ [IB_WR_RDMA_WRITE] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
+ [IB_WR_RDMA_WRITE_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
+ [IB_WR_RDMA_READ] = cpu_to_be32(MLX4_OPCODE_RDMA_READ),
+ [IB_WR_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
+ [IB_WR_SEND_WITH_INV] = cpu_to_be32(MLX4_OPCODE_SEND_INVAL),
+ [IB_WR_LOCAL_INV] = cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL),
+ [IB_WR_FAST_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR),
};
static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
* anymore, so we do this only if selective signaling is off.
*
* Further, on 32-bit platforms, we can't use vmap() to make
- * the QP buffer virtually contigious. Thus we have to use
+ * the QP buffer virtually contiguous. Thus we have to use
* constant-sized WRs to make sure a WR is always fully within
* a single page-sized chunk.
*
struct ib_qp_init_attr *init_attr,
struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp)
{
+ int qpn;
int err;
mutex_init(&qp->mutex);
spin_lock_init(&qp->rq.lock);
qp->state = IB_QPS_RESET;
- qp->atomic_rd_en = 0;
- qp->resp_depth = 0;
-
- qp->rq.head = 0;
- qp->rq.tail = 0;
- qp->sq.head = 0;
- qp->sq.tail = 0;
- qp->sq_next_wqe = 0;
-
if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
- else
- qp->sq_signal_bits = 0;
err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp);
if (err)
} else {
qp->sq_no_prefetch = 0;
+ if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
+ qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
+
if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
qp->flags |= MLX4_IB_QP_LSO;
}
}
- err = mlx4_qp_alloc(dev->dev, sqpn, &qp->mqp);
+ if (sqpn) {
+ qpn = sqpn;
+ } else {
+ err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn);
+ if (err)
+ goto err_wrid;
+ }
+
+ err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
if (err)
- goto err_wrid;
+ goto err_qpn;
/*
* Hardware wants QPN written in big-endian order (after
return 0;
+err_qpn:
+ if (!sqpn)
+ mlx4_qp_release_range(dev->dev, qpn, 1);
+
err_wrid:
if (pd->uobject) {
if (!init_attr->srq)
}
static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+ __acquires(&send_cq->lock) __acquires(&recv_cq->lock)
{
- if (send_cq == recv_cq)
+ if (send_cq == recv_cq) {
spin_lock_irq(&send_cq->lock);
- else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+ __acquire(&recv_cq->lock);
+ } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
spin_lock_irq(&send_cq->lock);
spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
} else {
}
static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+ __releases(&send_cq->lock) __releases(&recv_cq->lock)
{
- if (send_cq == recv_cq)
+ if (send_cq == recv_cq) {
+ __release(&recv_cq->lock);
spin_unlock_irq(&send_cq->lock);
- else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+ } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
spin_unlock(&recv_cq->lock);
spin_unlock_irq(&send_cq->lock);
} else {
mlx4_ib_unlock_cqs(send_cq, recv_cq);
mlx4_qp_free(dev->dev, &qp->mqp);
+
+ if (!is_sqp(dev, qp))
+ mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
+
mlx4_mtt_cleanup(dev->dev, &qp->mtt);
if (is_user) {
struct mlx4_ib_qp *qp;
int err;
- /* We only support LSO, and only for kernel UD QPs. */
- if (init_attr->create_flags & ~IB_QP_CREATE_IPOIB_UD_LSO)
+ /*
+ * We only support LSO and multicast loopback blocking, and
+ * only for kernel UD QPs.
+ */
+ if (init_attr->create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO |
+ IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK))
return ERR_PTR(-EINVAL);
- if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO &&
+
+ if (init_attr->create_flags &&
(pd->uobject || init_attr->qp_type != IB_QPT_UD))
return ERR_PTR(-EINVAL);
case IB_QPT_UC:
case IB_QPT_UD:
{
- qp = kmalloc(sizeof *qp, GFP_KERNEL);
+ qp = kzalloc(sizeof *qp, GFP_KERNEL);
if (!qp)
return ERR_PTR(-ENOMEM);
if (pd->uobject)
return ERR_PTR(-EINVAL);
- sqp = kmalloc(sizeof *sqp, GFP_KERNEL);
+ sqp = kzalloc(sizeof *sqp, GFP_KERNEL);
if (!sqp)
return ERR_PTR(-ENOMEM);
context->mtu_msgmax = (IB_MTU_4096 << 5) |
ilog2(dev->dev->caps.max_gso_sz);
else
- context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
+ context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;
} else if (attr_mask & IB_QP_PATH_MTU) {
if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {
printk(KERN_ERR "path MTU (%u) is invalid\n",
context->pd = cpu_to_be32(to_mpd(ibqp->pd)->pdn);
context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);
+ /* Set "fast registration enabled" for all kernel QPs */
+ if (!qp->ibqp.uobject)
+ context->params1 |= cpu_to_be32(1 << 11);
+
if (attr_mask & IB_QP_RNR_RETRY) {
context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
optpar |= MLX4_QP_OPTPAR_RNR_RETRY;
else
sqd_event = 0;
+ if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+ context->rlkey |= (1 << 4);
+
/*
* Before passing a kernel QP to the HW, make sure that the
* ownership bits of the send queue are set and the SQ
return cur + nreq >= wq->max_post;
}
+static __be32 convert_access(int acc)
+{
+ return (acc & IB_ACCESS_REMOTE_ATOMIC ? cpu_to_be32(MLX4_WQE_FMR_PERM_ATOMIC) : 0) |
+ (acc & IB_ACCESS_REMOTE_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_WRITE) : 0) |
+ (acc & IB_ACCESS_REMOTE_READ ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_READ) : 0) |
+ (acc & IB_ACCESS_LOCAL_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE) : 0) |
+ cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ);
+}
+
+static void set_fmr_seg(struct mlx4_wqe_fmr_seg *fseg, struct ib_send_wr *wr)
+{
+ struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list);
+ int i;
+
+ for (i = 0; i < wr->wr.fast_reg.page_list_len; ++i)
+ mfrpl->mapped_page_list[i] =
+ cpu_to_be64(wr->wr.fast_reg.page_list->page_list[i] |
+ MLX4_MTT_FLAG_PRESENT);
+
+ fseg->flags = convert_access(wr->wr.fast_reg.access_flags);
+ fseg->mem_key = cpu_to_be32(wr->wr.fast_reg.rkey);
+ fseg->buf_list = cpu_to_be64(mfrpl->map);
+ fseg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start);
+ fseg->reg_len = cpu_to_be64(wr->wr.fast_reg.length);
+ fseg->offset = 0; /* XXX -- is this just for ZBVA? */
+ fseg->page_size = cpu_to_be32(wr->wr.fast_reg.page_shift);
+ fseg->reserved[0] = 0;
+ fseg->reserved[1] = 0;
+}
+
+static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey)
+{
+ iseg->flags = 0;
+ iseg->mem_key = cpu_to_be32(rkey);
+ iseg->guest_id = 0;
+ iseg->pa = 0;
+}
+
static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
u64 remote_addr, u32 rkey)
{
dseg->addr = cpu_to_be64(sg->addr);
}
-static int build_lso_seg(struct mlx4_lso_seg *wqe, struct ib_send_wr *wr,
- struct mlx4_ib_qp *qp, unsigned *lso_seg_len)
+static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr,
+ struct mlx4_ib_qp *qp, unsigned *lso_seg_len,
+ __be32 *lso_hdr_sz)
{
unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16);
memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen);
- /* make sure LSO header is written before overwriting stamping */
- wmb();
-
- wqe->mss_hdr_size = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 |
- wr->wr.ud.hlen);
-
+ *lso_hdr_sz = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 |
+ wr->wr.ud.hlen);
*lso_seg_len = halign;
return 0;
}
+static __be32 send_ieth(struct ib_send_wr *wr)
+{
+ switch (wr->opcode) {
+ case IB_WR_SEND_WITH_IMM:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ return wr->ex.imm_data;
+
+ case IB_WR_SEND_WITH_INV:
+ return cpu_to_be32(wr->ex.invalidate_rkey);
+
+ default:
+ return 0;
+ }
+}
+
int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
struct ib_send_wr **bad_wr)
{
int uninitialized_var(stamp);
int uninitialized_var(size);
unsigned uninitialized_var(seglen);
+ __be32 dummy;
+ __be32 *lso_wqe;
+ __be32 uninitialized_var(lso_hdr_sz);
int i;
spin_lock_irqsave(&qp->sq.lock, flags);
ind = qp->sq_next_wqe;
for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ lso_wqe = &dummy;
+
if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
err = -ENOMEM;
*bad_wr = wr;
MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0) |
qp->sq_signal_bits;
- if (wr->opcode == IB_WR_SEND_WITH_IMM ||
- wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
- ctrl->imm = wr->ex.imm_data;
- else
- ctrl->imm = 0;
+ ctrl->imm = send_ieth(wr);
wqe += sizeof *ctrl;
size = sizeof *ctrl / 16;
size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
break;
+ case IB_WR_LOCAL_INV:
+ ctrl->srcrb_flags |=
+ cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_local_inv_seg(wqe, wr->ex.invalidate_rkey);
+ wqe += sizeof (struct mlx4_wqe_local_inval_seg);
+ size += sizeof (struct mlx4_wqe_local_inval_seg) / 16;
+ break;
+
+ case IB_WR_FAST_REG_MR:
+ ctrl->srcrb_flags |=
+ cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_fmr_seg(wqe, wr);
+ wqe += sizeof (struct mlx4_wqe_fmr_seg);
+ size += sizeof (struct mlx4_wqe_fmr_seg) / 16;
+ break;
+
default:
/* No extra segments required for sends */
break;
size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
if (wr->opcode == IB_WR_LSO) {
- err = build_lso_seg(wqe, wr, qp, &seglen);
+ err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz);
if (unlikely(err)) {
*bad_wr = wr;
goto out;
}
+ lso_wqe = (__be32 *) wqe;
wqe += seglen;
size += seglen / 16;
}
for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
set_data_seg(dseg, wr->sg_list + i);
+ /*
+ * Possibly overwrite stamping in cacheline with LSO
+ * segment only after making sure all data segments
+ * are written.
+ */
+ wmb();
+ *lso_wqe = lso_hdr_sz;
+
ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
MLX4_WQE_CTRL_FENCE : 0) | size;
stamp_send_wqe(qp, stamp, size * 16);
ind = pad_wraparound(qp, ind);
}
-
}
out:
qp_init_attr->cap = qp_attr->cap;
+ qp_init_attr->create_flags = 0;
+ if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)
+ qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+ if (qp->flags & MLX4_IB_QP_LSO)
+ qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
out:
mutex_unlock(&qp->mutex);
return err;