From 00f7ec36c9324928e4cd23f02e6d8550f30c32ca Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Mon, 14 Jul 2008 23:48:45 -0700 Subject: [PATCH] RDMA/core: Add memory management extensions support This patch adds support for the IB "base memory management extension" (BMME) and the equivalent iWARP operations (which the iWARP verbs mandates all devices must implement). The new operations are: - Allocate an ib_mr for use in fast register work requests. - Allocate/free a physical buffer lists for use in fast register work requests. This allows device drivers to allocate this memory as needed for use in posting send requests (eg via dma_alloc_coherent). - New send queue work requests: * send with remote invalidate * fast register memory region * local invalidate memory region * RDMA read with invalidate local memory region (iWARP only) Consumer interface details: - A new device capability flag IB_DEVICE_MEM_MGT_EXTENSIONS is added to indicate device support for these features. - New send work request opcodes IB_WR_FAST_REG_MR, IB_WR_LOCAL_INV, IB_WR_RDMA_READ_WITH_INV are added. - A new consumer API function, ib_alloc_mr() is added to allocate fast register memory regions. - New consumer API functions, ib_alloc_fast_reg_page_list() and ib_free_fast_reg_page_list() are added to allocate and free device-specific memory for fast registration page lists. - A new consumer API function, ib_update_fast_reg_key(), is added to allow the key portion of the R_Key and L_Key of a fast registration MR to be updated. Consumers call this if desired before posting a IB_WR_FAST_REG_MR work request. Consumers can use this as follows: - MR is allocated with ib_alloc_mr(). - Page list memory is allocated with ib_alloc_fast_reg_page_list(). - MR R_Key/L_Key "key" field is updated with ib_update_fast_reg_key(). - MR made VALID and bound to a specific page list via ib_post_send(IB_WR_FAST_REG_MR) - MR made INVALID via ib_post_send(IB_WR_LOCAL_INV), ib_post_send(IB_WR_RDMA_READ_WITH_INV) or an incoming send with invalidate operation. - MR is deallocated with ib_dereg_mr() - page lists dealloced via ib_free_fast_reg_page_list(). Applications can allocate a fast register MR once, and then can repeatedly bind the MR to different physical block lists (PBLs) via posting work requests to a send queue (SQ). For each outstanding MR-to-PBL binding in the SQ pipe, a fast_reg_page_list needs to be allocated (the fast_reg_page_list is owned by the low-level driver from the consumer posting a work request until the request completes). Thus pipelining can be achieved while still allowing device-specific page_list processing. The 32-bit fast register memory key/STag is composed of a 24-bit index and an 8-bit key. The application can change the key each time it fast registers thus allowing more control over the peer's use of the key/STag (ie it can effectively be changed each time the rkey is rebound to a page list). Signed-off-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 2 +- drivers/infiniband/core/verbs.c | 46 ++++++++++++++++++ drivers/infiniband/hw/ehca/ehca_reqs.c | 2 +- drivers/infiniband/hw/ipath/ipath_cq.c | 2 +- drivers/infiniband/hw/ipath/ipath_rc.c | 4 +- drivers/infiniband/hw/ipath/ipath_ruc.c | 4 +- drivers/infiniband/hw/ipath/ipath_uc.c | 8 ++-- drivers/infiniband/hw/ipath/ipath_ud.c | 8 ++-- drivers/infiniband/hw/mlx4/cq.c | 12 ++--- drivers/infiniband/hw/mthca/mthca_cq.c | 4 +- include/rdma/ib_user_verbs.h | 5 +- include/rdma/ib_verbs.h | 83 ++++++++++++++++++++++++++++++++- 12 files changed, 154 insertions(+), 26 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 112b37c..56feab6 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -917,7 +917,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, resp->wc[i].opcode = wc[i].opcode; resp->wc[i].vendor_err = wc[i].vendor_err; resp->wc[i].byte_len = wc[i].byte_len; - resp->wc[i].imm_data = (__u32 __force) wc[i].imm_data; + resp->wc[i].ex.imm_data = (__u32 __force) wc[i].ex.imm_data; resp->wc[i].qp_num = wc[i].qp->qp_num; resp->wc[i].src_qp = wc[i].src_qp; resp->wc[i].wc_flags = wc[i].wc_flags; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 9f399d3..e0fbe59 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -753,6 +753,52 @@ int ib_dereg_mr(struct ib_mr *mr) } EXPORT_SYMBOL(ib_dereg_mr); +struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len) +{ + struct ib_mr *mr; + + if (!pd->device->alloc_fast_reg_mr) + return ERR_PTR(-ENOSYS); + + mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len); + + if (!IS_ERR(mr)) { + mr->device = pd->device; + mr->pd = pd; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + atomic_set(&mr->usecnt, 0); + } + + return mr; +} +EXPORT_SYMBOL(ib_alloc_fast_reg_mr); + +struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device, + int max_page_list_len) +{ + struct ib_fast_reg_page_list *page_list; + + if (!device->alloc_fast_reg_page_list) + return ERR_PTR(-ENOSYS); + + page_list = device->alloc_fast_reg_page_list(device, max_page_list_len); + + if (!IS_ERR(page_list)) { + page_list->device = device; + page_list->max_page_list_len = max_page_list_len; + } + + return page_list; +} +EXPORT_SYMBOL(ib_alloc_fast_reg_page_list); + +void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) +{ + page_list->device->free_fast_reg_page_list(page_list); +} +EXPORT_SYMBOL(ib_free_fast_reg_page_list); + /* Memory windows */ struct ib_mw *ib_alloc_mw(struct ib_pd *pd) diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c index f093b00..b799b27 100644 --- a/drivers/infiniband/hw/ehca/ehca_reqs.c +++ b/drivers/infiniband/hw/ehca/ehca_reqs.c @@ -681,7 +681,7 @@ poll_cq_one_read_cqe: wc->dlid_path_bits = cqe->dlid; wc->src_qp = cqe->remote_qp_number; wc->wc_flags = cqe->w_completion_flags; - wc->imm_data = cpu_to_be32(cqe->immediate_data); + wc->ex.imm_data = cpu_to_be32(cqe->immediate_data); wc->sl = cqe->service_level; poll_cq_one_exit0: diff --git a/drivers/infiniband/hw/ipath/ipath_cq.c b/drivers/infiniband/hw/ipath/ipath_cq.c index a03bd28..d385e41 100644 --- a/drivers/infiniband/hw/ipath/ipath_cq.c +++ b/drivers/infiniband/hw/ipath/ipath_cq.c @@ -82,7 +82,7 @@ void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited) wc->uqueue[head].opcode = entry->opcode; wc->uqueue[head].vendor_err = entry->vendor_err; wc->uqueue[head].byte_len = entry->byte_len; - wc->uqueue[head].imm_data = (__u32 __force)entry->imm_data; + wc->uqueue[head].ex.imm_data = (__u32 __force) entry->ex.imm_data; wc->uqueue[head].qp_num = entry->qp->qp_num; wc->uqueue[head].src_qp = entry->src_qp; wc->uqueue[head].wc_flags = entry->wc_flags; diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c index 108df66..9771052 100644 --- a/drivers/infiniband/hw/ipath/ipath_rc.c +++ b/drivers/infiniband/hw/ipath/ipath_rc.c @@ -1703,11 +1703,11 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, case OP(SEND_LAST_WITH_IMMEDIATE): send_last_imm: if (header_in_data) { - wc.imm_data = *(__be32 *) data; + wc.ex.imm_data = *(__be32 *) data; data += sizeof(__be32); } else { /* Immediate data comes after BTH */ - wc.imm_data = ohdr->u.imm_data; + wc.ex.imm_data = ohdr->u.imm_data; } hdrsize += 4; wc.wc_flags = IB_WC_WITH_IMM; diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c index a4b5521..af051f7 100644 --- a/drivers/infiniband/hw/ipath/ipath_ruc.c +++ b/drivers/infiniband/hw/ipath/ipath_ruc.c @@ -331,7 +331,7 @@ again: switch (wqe->wr.opcode) { case IB_WR_SEND_WITH_IMM: wc.wc_flags = IB_WC_WITH_IMM; - wc.imm_data = wqe->wr.ex.imm_data; + wc.ex.imm_data = wqe->wr.ex.imm_data; /* FALLTHROUGH */ case IB_WR_SEND: if (!ipath_get_rwqe(qp, 0)) @@ -342,7 +342,7 @@ again: if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) goto inv_err; wc.wc_flags = IB_WC_WITH_IMM; - wc.imm_data = wqe->wr.ex.imm_data; + wc.ex.imm_data = wqe->wr.ex.imm_data; if (!ipath_get_rwqe(qp, 1)) goto rnr_nak; /* FALLTHROUGH */ diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c index 0596ec1..82cc588 100644 --- a/drivers/infiniband/hw/ipath/ipath_uc.c +++ b/drivers/infiniband/hw/ipath/ipath_uc.c @@ -379,11 +379,11 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, case OP(SEND_LAST_WITH_IMMEDIATE): send_last_imm: if (header_in_data) { - wc.imm_data = *(__be32 *) data; + wc.ex.imm_data = *(__be32 *) data; data += sizeof(__be32); } else { /* Immediate data comes after BTH */ - wc.imm_data = ohdr->u.imm_data; + wc.ex.imm_data = ohdr->u.imm_data; } hdrsize += 4; wc.wc_flags = IB_WC_WITH_IMM; @@ -483,11 +483,11 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): rdma_last_imm: if (header_in_data) { - wc.imm_data = *(__be32 *) data; + wc.ex.imm_data = *(__be32 *) data; data += sizeof(__be32); } else { /* Immediate data comes after BTH */ - wc.imm_data = ohdr->u.imm_data; + wc.ex.imm_data = ohdr->u.imm_data; } hdrsize += 4; wc.wc_flags = IB_WC_WITH_IMM; diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c index 77ca8ca..36aa242 100644 --- a/drivers/infiniband/hw/ipath/ipath_ud.c +++ b/drivers/infiniband/hw/ipath/ipath_ud.c @@ -96,7 +96,7 @@ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe) if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) { wc.wc_flags = IB_WC_WITH_IMM; - wc.imm_data = swqe->wr.ex.imm_data; + wc.ex.imm_data = swqe->wr.ex.imm_data; } /* @@ -492,14 +492,14 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, if (qp->ibqp.qp_num > 1 && opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) { if (header_in_data) { - wc.imm_data = *(__be32 *) data; + wc.ex.imm_data = *(__be32 *) data; data += sizeof(__be32); } else - wc.imm_data = ohdr->u.ud.imm_data; + wc.ex.imm_data = ohdr->u.ud.imm_data; wc.wc_flags = IB_WC_WITH_IMM; hdrsize += sizeof(u32); } else if (opcode == IB_OPCODE_UD_SEND_ONLY) { - wc.imm_data = 0; + wc.ex.imm_data = 0; wc.wc_flags = 0; } else { dev->n_pkt_drops++; diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 4521319..299f208 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -663,18 +663,18 @@ repoll: switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) { case MLX4_RECV_OPCODE_RDMA_WRITE_IMM: - wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; - wc->wc_flags = IB_WC_WITH_IMM; - wc->imm_data = cqe->immed_rss_invalid; + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->immed_rss_invalid; break; case MLX4_RECV_OPCODE_SEND: wc->opcode = IB_WC_RECV; wc->wc_flags = 0; break; case MLX4_RECV_OPCODE_SEND_IMM: - wc->opcode = IB_WC_RECV; - wc->wc_flags = IB_WC_WITH_IMM; - wc->imm_data = cqe->immed_rss_invalid; + wc->opcode = IB_WC_RECV; + wc->wc_flags = IB_WC_WITH_IMM; + wc->ex.imm_data = cqe->immed_rss_invalid; break; } diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c index f788fce..d9f4735 100644 --- a/drivers/infiniband/hw/mthca/mthca_cq.c +++ b/drivers/infiniband/hw/mthca/mthca_cq.c @@ -620,13 +620,13 @@ static inline int mthca_poll_one(struct mthca_dev *dev, case IB_OPCODE_SEND_LAST_WITH_IMMEDIATE: case IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE: entry->wc_flags = IB_WC_WITH_IMM; - entry->imm_data = cqe->imm_etype_pkey_eec; + entry->ex.imm_data = cqe->imm_etype_pkey_eec; entry->opcode = IB_WC_RECV; break; case IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE: case IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE: entry->wc_flags = IB_WC_WITH_IMM; - entry->imm_data = cqe->imm_etype_pkey_eec; + entry->ex.imm_data = cqe->imm_etype_pkey_eec; entry->opcode = IB_WC_RECV_RDMA_WITH_IMM; break; default: diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h index 885254f..a17f771 100644 --- a/include/rdma/ib_user_verbs.h +++ b/include/rdma/ib_user_verbs.h @@ -289,7 +289,10 @@ struct ib_uverbs_wc { __u32 opcode; __u32 vendor_err; __u32 byte_len; - __u32 imm_data; + union { + __u32 imm_data; + __u32 invalidate_rkey; + } ex; __u32 qp_num; __u32 src_qp; __u32 wc_flags; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 5f5621b..74c24b9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -103,6 +103,7 @@ enum ib_device_cap_flags { */ IB_DEVICE_UD_IP_CSUM = (1<<18), IB_DEVICE_UD_TSO = (1<<19), + IB_DEVICE_MEM_MGT_EXTENSIONS = (1<<21), }; enum ib_atomic_cap { @@ -148,6 +149,7 @@ struct ib_device_attr { int max_srq; int max_srq_wr; int max_srq_sge; + unsigned int max_fast_reg_page_list_len; u16 max_pkeys; u8 local_ca_ack_delay; }; @@ -411,6 +413,8 @@ enum ib_wc_opcode { IB_WC_FETCH_ADD, IB_WC_BIND_MW, IB_WC_LSO, + IB_WC_LOCAL_INV, + IB_WC_FAST_REG_MR, /* * Set value of IB_WC_RECV so consumers can test if a completion is a * receive by testing (opcode & IB_WC_RECV). @@ -421,7 +425,8 @@ enum ib_wc_opcode { enum ib_wc_flags { IB_WC_GRH = 1, - IB_WC_WITH_IMM = (1<<1) + IB_WC_WITH_IMM = (1<<1), + IB_WC_WITH_INVALIDATE = (1<<2), }; struct ib_wc { @@ -431,7 +436,10 @@ struct ib_wc { u32 vendor_err; u32 byte_len; struct ib_qp *qp; - __be32 imm_data; + union { + __be32 imm_data; + u32 invalidate_rkey; + } ex; u32 src_qp; int wc_flags; u16 pkey_index; @@ -625,6 +633,9 @@ enum ib_wr_opcode { IB_WR_ATOMIC_FETCH_AND_ADD, IB_WR_LSO, IB_WR_SEND_WITH_INV, + IB_WR_RDMA_READ_WITH_INV, + IB_WR_LOCAL_INV, + IB_WR_FAST_REG_MR, }; enum ib_send_flags { @@ -641,6 +652,12 @@ struct ib_sge { u32 lkey; }; +struct ib_fast_reg_page_list { + struct ib_device *device; + u64 *page_list; + unsigned int max_page_list_len; +}; + struct ib_send_wr { struct ib_send_wr *next; u64 wr_id; @@ -673,6 +690,15 @@ struct ib_send_wr { u16 pkey_index; /* valid for GSI only */ u8 port_num; /* valid for DR SMPs on switch only */ } ud; + struct { + u64 iova_start; + struct ib_fast_reg_page_list *page_list; + unsigned int page_shift; + unsigned int page_list_len; + u32 length; + int access_flags; + u32 rkey; + } fast_reg; } wr; }; @@ -1011,6 +1037,11 @@ struct ib_device { int (*query_mr)(struct ib_mr *mr, struct ib_mr_attr *mr_attr); int (*dereg_mr)(struct ib_mr *mr); + struct ib_mr * (*alloc_fast_reg_mr)(struct ib_pd *pd, + int max_page_list_len); + struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device, + int page_list_len); + void (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list); int (*rereg_phys_mr)(struct ib_mr *mr, int mr_rereg_mask, struct ib_pd *pd, @@ -1805,6 +1836,54 @@ int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr); int ib_dereg_mr(struct ib_mr *mr); /** + * ib_alloc_fast_reg_mr - Allocates memory region usable with the + * IB_WR_FAST_REG_MR send work request. + * @pd: The protection domain associated with the region. + * @max_page_list_len: requested max physical buffer list length to be + * used with fast register work requests for this MR. + */ +struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len); + +/** + * ib_alloc_fast_reg_page_list - Allocates a page list array + * @device - ib device pointer. + * @page_list_len - size of the page list array to be allocated. + * + * This allocates and returns a struct ib_fast_reg_page_list * and a + * page_list array that is at least page_list_len in size. The actual + * size is returned in max_page_list_len. The caller is responsible + * for initializing the contents of the page_list array before posting + * a send work request with the IB_WC_FAST_REG_MR opcode. + * + * The page_list array entries must be translated using one of the + * ib_dma_*() functions just like the addresses passed to + * ib_map_phys_fmr(). Once the ib_post_send() is issued, the struct + * ib_fast_reg_page_list must not be modified by the caller until the + * IB_WC_FAST_REG_MR work request completes. + */ +struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list( + struct ib_device *device, int page_list_len); + +/** + * ib_free_fast_reg_page_list - Deallocates a previously allocated + * page list array. + * @page_list - struct ib_fast_reg_page_list pointer to be deallocated. + */ +void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); + +/** + * ib_update_fast_reg_key - updates the key portion of the fast_reg MR + * R_Key and L_Key. + * @mr - struct ib_mr pointer to be updated. + * @newkey - new key to be used. + */ +static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey) +{ + mr->lkey = (mr->lkey & 0xffffff00) | newkey; + mr->rkey = (mr->rkey & 0xffffff00) | newkey; +} + +/** * ib_alloc_mw - Allocates a memory window. * @pd: The protection domain associated with the memory window. */ -- 1.8.2.3