[dpdk-dev,v2,13/23] mlx4: use MOFED 3.0 fast verbs interface for RX operations

Message ID 1435656489-27986-14-git-send-email-adrien.mazarguil@6wind.com (mailing list archive)
State Accepted, archived
Headers

Commit Message

Adrien Mazarguil June 30, 2015, 9:27 a.m. UTC
  From: Alex Rosenbaum <Alexr@mellanox.com>

This commit replaces the CQ polling functions (in both mlx4_rx_burst()
and mlx4_rx_burst_sp()) and the QP posting function (in mlx4_rx_burst()
only) with a new low-level interface to improve performance.

Signed-off-by: Alex Rosenbaum <Alexr@mellanox.com>
Signed-off-by: Gilad Berman <giladb@mellanox.com>
Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx4/mlx4.c | 249 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 162 insertions(+), 87 deletions(-)
  

Patch

diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index cdc679a..1881f5b 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -188,6 +188,8 @@  struct rxq {
 	struct ibv_mr *mr; /* Memory Region (for mp). */
 	struct ibv_cq *cq; /* Completion Queue. */
 	struct ibv_qp *qp; /* Queue Pair. */
+	struct ibv_exp_qp_burst_family *if_qp; /* QP burst interface. */
+	struct ibv_exp_cq_family *if_cq; /* CQ interface. */
 	/*
 	 * Each VLAN ID requires a separate flow steering rule.
 	 */
@@ -2319,11 +2321,35 @@  rxq_promiscuous_disable(struct rxq *rxq)
 static void
 rxq_cleanup(struct rxq *rxq)
 {
+	struct ibv_exp_release_intf_params params;
+
 	DEBUG("cleaning up %p", (void *)rxq);
 	if (rxq->sp)
 		rxq_free_elts_sp(rxq);
 	else
 		rxq_free_elts(rxq);
+	if (rxq->if_qp != NULL) {
+		assert(rxq->priv != NULL);
+		assert(rxq->priv->ctx != NULL);
+		assert(rxq->qp != NULL);
+		params = (struct ibv_exp_release_intf_params){
+			.comp_mask = 0,
+		};
+		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
+						rxq->if_qp,
+						&params));
+	}
+	if (rxq->if_cq != NULL) {
+		assert(rxq->priv != NULL);
+		assert(rxq->priv->ctx != NULL);
+		assert(rxq->cq != NULL);
+		params = (struct ibv_exp_release_intf_params){
+			.comp_mask = 0,
+		};
+		claim_zero(ibv_exp_release_intf(rxq->priv->ctx,
+						rxq->if_cq,
+						&params));
+	}
 	if (rxq->qp != NULL) {
 		rxq_promiscuous_disable(rxq);
 		rxq_allmulticast_disable(rxq);
@@ -2360,34 +2386,23 @@  mlx4_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
 	const unsigned int elts_n = rxq->elts_n;
 	unsigned int elts_head = rxq->elts_head;
-	struct ibv_wc wcs[pkts_n];
 	struct ibv_recv_wr head;
 	struct ibv_recv_wr **next = &head.next;
 	struct ibv_recv_wr *bad_wr;
-	int ret = 0;
-	int wcs_n;
-	int i;
+	unsigned int i;
+	unsigned int pkts_ret = 0;
+	int ret;
 
 	if (unlikely(!rxq->sp))
 		return mlx4_rx_burst(dpdk_rxq, pkts, pkts_n);
 	if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */
 		return 0;
-	wcs_n = ibv_poll_cq(rxq->cq, pkts_n, wcs);
-	if (unlikely(wcs_n == 0))
-		return 0;
-	if (unlikely(wcs_n < 0)) {
-		DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
-		      (void *)rxq, wcs_n);
-		return 0;
-	}
-	assert(wcs_n <= (int)pkts_n);
-	/* For each work completion. */
-	for (i = 0; (i != wcs_n); ++i) {
-		struct ibv_wc *wc = &wcs[i];
-		uint64_t wr_id = wc->wr_id;
-		uint32_t len = wc->byte_len;
+	for (i = 0; (i != pkts_n); ++i) {
 		struct rxq_elt_sp *elt = &(*elts)[elts_head];
 		struct ibv_recv_wr *wr = &elt->wr;
+		uint64_t wr_id = wr->wr_id;
+		unsigned int len;
+		unsigned int pkt_buf_len;
 		struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */
 		struct rte_mbuf **pkt_buf_next = &pkt_buf;
 		unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM;
@@ -2398,26 +2413,51 @@  mlx4_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		(void)wr_id;
 #endif
 		assert(wr_id < rxq->elts_n);
-		assert(wr_id == wr->wr_id);
 		assert(wr->sg_list == elt->sges);
 		assert(wr->num_sge == elemof(elt->sges));
 		assert(elts_head < rxq->elts_n);
 		assert(rxq->elts_head < rxq->elts_n);
-		/* Link completed WRs together for repost. */
-		*next = wr;
-		next = &wr->next;
-		if (unlikely(wc->status != IBV_WC_SUCCESS)) {
-			/* Whatever, just repost the offending WR. */
-			DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work completion"
-			      " status (%d): %s",
-			      (void *)rxq, wc->wr_id, wc->status,
-			      ibv_wc_status_str(wc->status));
+		ret = rxq->if_cq->poll_length(rxq->cq, NULL, NULL);
+		if (unlikely(ret < 0)) {
+			struct ibv_wc wc;
+			int wcs_n;
+
+			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
+			      (void *)rxq, ret);
+			/* ibv_poll_cq() must be used in case of failure. */
+			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
+			if (unlikely(wcs_n == 0))
+				break;
+			if (unlikely(wcs_n < 0)) {
+				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
+				      (void *)rxq, wcs_n);
+				break;
+			}
+			assert(wcs_n == 1);
+			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
+				/* Whatever, just repost the offending WR. */
+				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
+				      " completion status (%d): %s",
+				      (void *)rxq, wc.wr_id, wc.status,
+				      ibv_wc_status_str(wc.status));
 #ifdef MLX4_PMD_SOFT_COUNTERS
-			/* Increase dropped packets counter. */
-			++rxq->stats.idropped;
+				/* Increment dropped packets counter. */
+				++rxq->stats.idropped;
 #endif
-			goto repost;
+				/* Link completed WRs together for repost. */
+				*next = wr;
+				next = &wr->next;
+				goto repost;
+			}
+			ret = wc.byte_len;
 		}
+		if (ret == 0)
+			break;
+		len = ret;
+		pkt_buf_len = len;
+		/* Link completed WRs together for repost. */
+		*next = wr;
+		next = &wr->next;
 		/*
 		 * Replace spent segments with new ones, concatenate and
 		 * return them as pkt_buf.
@@ -2502,42 +2542,43 @@  mlx4_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		assert(j != 0);
 		NB_SEGS(pkt_buf) = j;
 		PORT(pkt_buf) = rxq->port_id;
-		PKT_LEN(pkt_buf) = wc->byte_len;
+		PKT_LEN(pkt_buf) = pkt_buf_len;
 		pkt_buf->ol_flags = 0;
 
 		/* Return packet. */
 		*(pkts++) = pkt_buf;
-		++ret;
+		++pkts_ret;
 #ifdef MLX4_PMD_SOFT_COUNTERS
 		/* Increase bytes counter. */
-		rxq->stats.ibytes += wc->byte_len;
+		rxq->stats.ibytes += pkt_buf_len;
 #endif
 repost:
 		if (++elts_head >= elts_n)
 			elts_head = 0;
 		continue;
 	}
+	if (unlikely(i == 0))
+		return 0;
 	*next = NULL;
 	/* Repost WRs. */
 #ifdef DEBUG_RECV
-	DEBUG("%p: reposting %d WRs starting from %" PRIu64 " (%p)",
-	      (void *)rxq, wcs_n, wcs[0].wr_id, (void *)head.next);
+	DEBUG("%p: reposting %d WRs", (void *)rxq, i);
 #endif
-	i = ibv_post_recv(rxq->qp, head.next, &bad_wr);
-	if (unlikely(i)) {
+	ret = ibv_post_recv(rxq->qp, head.next, &bad_wr);
+	if (unlikely(ret)) {
 		/* Inability to repost WRs is fatal. */
 		DEBUG("%p: ibv_post_recv(): failed for WR %p: %s",
 		      (void *)rxq->priv,
 		      (void *)bad_wr,
-		      strerror(i));
+		      strerror(ret));
 		abort();
 	}
 	rxq->elts_head = elts_head;
 #ifdef MLX4_PMD_SOFT_COUNTERS
 	/* Increase packets counter. */
-	rxq->stats.ipackets += ret;
+	rxq->stats.ipackets += pkts_ret;
 #endif
-	return ret;
+	return pkts_ret;
 }
 
 /**
@@ -2564,58 +2605,64 @@  mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
 	const unsigned int elts_n = rxq->elts_n;
 	unsigned int elts_head = rxq->elts_head;
-	struct ibv_wc wcs[pkts_n];
-	struct ibv_recv_wr head;
-	struct ibv_recv_wr **next = &head.next;
-	struct ibv_recv_wr *bad_wr;
-	int ret = 0;
-	int wcs_n;
-	int i;
+	struct ibv_sge sges[pkts_n];
+	unsigned int i;
+	unsigned int pkts_ret = 0;
+	int ret;
 
 	if (unlikely(rxq->sp))
 		return mlx4_rx_burst_sp(dpdk_rxq, pkts, pkts_n);
-	wcs_n = ibv_poll_cq(rxq->cq, pkts_n, wcs);
-	if (unlikely(wcs_n == 0))
-		return 0;
-	if (unlikely(wcs_n < 0)) {
-		DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
-		      (void *)rxq, wcs_n);
-		return 0;
-	}
-	assert(wcs_n <= (int)pkts_n);
-	/* For each work completion. */
-	for (i = 0; (i != wcs_n); ++i) {
-		struct ibv_wc *wc = &wcs[i];
-		uint64_t wr_id = wc->wr_id;
-		uint32_t len = wc->byte_len;
+	for (i = 0; (i != pkts_n); ++i) {
 		struct rxq_elt *elt = &(*elts)[elts_head];
 		struct ibv_recv_wr *wr = &elt->wr;
+		uint64_t wr_id = wr->wr_id;
+		unsigned int len;
 		struct rte_mbuf *seg = (void *)((uintptr_t)elt->sge.addr -
 			WR_ID(wr_id).offset);
 		struct rte_mbuf *rep;
 
 		/* Sanity checks. */
 		assert(WR_ID(wr_id).id < rxq->elts_n);
-		assert(wr_id == wr->wr_id);
 		assert(wr->sg_list == &elt->sge);
 		assert(wr->num_sge == 1);
 		assert(elts_head < rxq->elts_n);
 		assert(rxq->elts_head < rxq->elts_n);
-		/* Link completed WRs together for repost. */
-		*next = wr;
-		next = &wr->next;
-		if (unlikely(wc->status != IBV_WC_SUCCESS)) {
-			/* Whatever, just repost the offending WR. */
-			DEBUG("rxq=%p, wr_id=%" PRIu32 ": bad work completion"
-			      " status (%d): %s",
-			      (void *)rxq, WR_ID(wr_id).id, wc->status,
-			      ibv_wc_status_str(wc->status));
+		ret = rxq->if_cq->poll_length(rxq->cq, NULL, NULL);
+		if (unlikely(ret < 0)) {
+			struct ibv_wc wc;
+			int wcs_n;
+
+			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
+			      (void *)rxq, ret);
+			/* ibv_poll_cq() must be used in case of failure. */
+			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
+			if (unlikely(wcs_n == 0))
+				break;
+			if (unlikely(wcs_n < 0)) {
+				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
+				      (void *)rxq, wcs_n);
+				break;
+			}
+			assert(wcs_n == 1);
+			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
+				/* Whatever, just repost the offending WR. */
+				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
+				      " completion status (%d): %s",
+				      (void *)rxq, wc.wr_id, wc.status,
+				      ibv_wc_status_str(wc.status));
 #ifdef MLX4_PMD_SOFT_COUNTERS
-			/* Increase dropped packets counter. */
-			++rxq->stats.idropped;
+				/* Increment dropped packets counter. */
+				++rxq->stats.idropped;
 #endif
-			goto repost;
+				/* Add SGE to array for repost. */
+				sges[i] = elt->sge;
+				goto repost;
+			}
+			ret = wc.byte_len;
 		}
+		if (ret == 0)
+			break;
+		len = ret;
 		/*
 		 * Fetch initial bytes of packet descriptor into a
 		 * cacheline while allocating rep.
@@ -2644,6 +2691,9 @@  mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			 (uintptr_t)rep);
 		assert(WR_ID(wr->wr_id).id == WR_ID(wr_id).id);
 
+		/* Add SGE to array for repost. */
+		sges[i] = elt->sge;
+
 		/* Update seg information. */
 		SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM);
 		NB_SEGS(seg) = 1;
@@ -2655,37 +2705,36 @@  mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 
 		/* Return packet. */
 		*(pkts++) = seg;
-		++ret;
+		++pkts_ret;
 #ifdef MLX4_PMD_SOFT_COUNTERS
 		/* Increase bytes counter. */
-		rxq->stats.ibytes += wc->byte_len;
+		rxq->stats.ibytes += len;
 #endif
 repost:
 		if (++elts_head >= elts_n)
 			elts_head = 0;
 		continue;
 	}
-	*next = NULL;
+	if (unlikely(i == 0))
+		return 0;
 	/* Repost WRs. */
 #ifdef DEBUG_RECV
-	DEBUG("%p: reposting %d WRs starting from %" PRIu32 " (%p)",
-	      (void *)rxq, wcs_n, WR_ID(wcs[0].wr_id).id, (void *)head.next);
+	DEBUG("%p: reposting %u WRs", (void *)rxq, i);
 #endif
-	i = ibv_post_recv(rxq->qp, head.next, &bad_wr);
-	if (unlikely(i)) {
+	ret = rxq->if_qp->recv_burst(rxq->qp, sges, i);
+	if (unlikely(ret)) {
 		/* Inability to repost WRs is fatal. */
-		DEBUG("%p: ibv_post_recv(): failed for WR %p: %s",
+		DEBUG("%p: recv_burst(): failed (ret=%d)",
 		      (void *)rxq->priv,
-		      (void *)bad_wr,
-		      strerror(i));
+		      ret);
 		abort();
 	}
 	rxq->elts_head = elts_head;
 #ifdef MLX4_PMD_SOFT_COUNTERS
 	/* Increase packets counter. */
-	rxq->stats.ipackets += ret;
+	rxq->stats.ipackets += pkts_ret;
 #endif
-	return ret;
+	return pkts_ret;
 }
 
 /**
@@ -3019,6 +3068,10 @@  rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
 		.socket = socket
 	};
 	struct ibv_exp_qp_attr mod;
+	union {
+		struct ibv_exp_query_intf_params params;
+	} attr;
+	enum ibv_exp_query_intf_status status;
 	struct ibv_recv_wr *bad_wr;
 	struct rte_mbuf *buf;
 	int ret = 0;
@@ -3160,6 +3213,28 @@  skip_alloc:
 	/* Save port ID. */
 	tmpl.port_id = dev->data->port_id;
 	DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
+	attr.params = (struct ibv_exp_query_intf_params){
+		.intf_scope = IBV_EXP_INTF_GLOBAL,
+		.intf = IBV_EXP_INTF_CQ,
+		.obj = tmpl.cq,
+	};
+	tmpl.if_cq = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
+	if (tmpl.if_cq == NULL) {
+		ERROR("%p: CQ interface family query failed with status %d",
+		      (void *)dev, status);
+		goto error;
+	}
+	attr.params = (struct ibv_exp_query_intf_params){
+		.intf_scope = IBV_EXP_INTF_GLOBAL,
+		.intf = IBV_EXP_INTF_QP_BURST,
+		.obj = tmpl.qp,
+	};
+	tmpl.if_qp = ibv_exp_query_intf(priv->ctx, &attr.params, &status);
+	if (tmpl.if_qp == NULL) {
+		ERROR("%p: QP interface family query failed with status %d",
+		      (void *)dev, status);
+		goto error;
+	}
 	/* Clean up rxq in case we're reinitializing it. */
 	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
 	rxq_cleanup(rxq);