net/mlx5: fix vectorized Rx burst termination

Message ID 1591069841-7846-1-git-send-email-akozyrev@mellanox.com (mailing list archive)
State Accepted, archived
Delegated to: Raslan Darawsheh
Headers
Series net/mlx5: fix vectorized Rx burst termination |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-nxp-Performance success Performance Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/travis-robot success Travis build: passed
ci/iol-testing warning Testing issues
ci/Intel-compilation success Compilation OK

Commit Message

Alexander Kozyrev June 2, 2020, 3:50 a.m. UTC
  Maximum burst size of Vectorized Rx burst routine is set to
MLX5_VPMD_RX_MAX_BURST(64). This limits the performance of any
application that would like to gather more than 64 packets from
the single Rx burst for batch processing (i.e. VPP).

The situation gets worse with a mix of zipped and unzipped CQEs.
They are processed separately and the Rx burst function returns
small number of packets every call.

Repeat the cycle of gathering packets from the vectorized Rx routine
until a requested number of packets are collected or there are no
more CQEs left to process.

Fixes: 6cb559d67b83 ("net/mlx5: add vectorized Rx/Tx burst for x86")
Cc: stable@dpdk.org

Signed-off-by: Alexander Kozyrev <akozyrev@mellanox.com>
Acked-by: Slava Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/net/mlx5/mlx5_rxtx_vec.c         | 19 +++++++++++++------
 drivers/net/mlx5/mlx5_rxtx_vec_altivec.h | 13 ++++++++++---
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h    | 13 ++++++++++---
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h     | 13 ++++++++++---
 4 files changed, 43 insertions(+), 15 deletions(-)
  

Comments

Matan Azrad June 2, 2020, 5:59 a.m. UTC | #1
From: Alexander Kozyrev
> Maximum burst size of Vectorized Rx burst routine is set to
> MLX5_VPMD_RX_MAX_BURST(64). This limits the performance of any
> application that would like to gather more than 64 packets from the single Rx
> burst for batch processing (i.e. VPP).
> 
> The situation gets worse with a mix of zipped and unzipped CQEs.
> They are processed separately and the Rx burst function returns small
> number of packets every call.
> 
> Repeat the cycle of gathering packets from the vectorized Rx routine until a
> requested number of packets are collected or there are no more CQEs left to
> process.
> 
> Fixes: 6cb559d67b83 ("net/mlx5: add vectorized Rx/Tx burst for x86")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Alexander Kozyrev <akozyrev@mellanox.com>
> Acked-by: Slava Ovsiienko <viacheslavo@mellanox.com>
Acked-by: Matan Azrad <matan@mellanox.com>

> ---
>  drivers/net/mlx5/mlx5_rxtx_vec.c         | 19 +++++++++++++------
>  drivers/net/mlx5/mlx5_rxtx_vec_altivec.h | 13 ++++++++++---
>  drivers/net/mlx5/mlx5_rxtx_vec_neon.h    | 13 ++++++++++---
>  drivers/net/mlx5/mlx5_rxtx_vec_sse.h     | 13 ++++++++++---
>  4 files changed, 43 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c
> b/drivers/net/mlx5/mlx5_rxtx_vec.c
> index 1518bdd..b38bd20 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec.c
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
> @@ -103,13 +103,20 @@
>  mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t
> pkts_n)  {
>  	struct mlx5_rxq_data *rxq = dpdk_rxq;
> -	uint16_t nb_rx;
> +	uint16_t nb_rx = 0;
> +	uint16_t tn = 0;
>  	uint64_t err = 0;
> -
> -	nb_rx = rxq_burst_v(rxq, pkts, pkts_n, &err);
> -	if (unlikely(err | rxq->err_state))
> -		nb_rx = rxq_handle_pending_error(rxq, pkts, nb_rx);
> -	return nb_rx;
> +	bool no_cq = false;
> +
> +	do {
> +		nb_rx = rxq_burst_v(rxq, pkts + tn, pkts_n - tn, &err,
> &no_cq);
> +		if (unlikely(err | rxq->err_state))
> +			nb_rx = rxq_handle_pending_error(rxq, pkts + tn,
> nb_rx);
> +		tn += nb_rx;
> +		if (unlikely(no_cq))
> +			break;
> +	} while (tn != pkts_n);
> +	return tn;
>  }
> 
>  /**
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
> b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
> index 26715ef..b55138a 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
> @@ -564,13 +564,15 @@
>   * @param[out] err
>   *   Pointer to a flag. Set non-zero value if pkts array has at least one error
>   *   packet to handle.
> + * @param[out] no_cq
> + *  Pointer to a boolean. Set true if no new CQE seen.
>   *
>   * @return
>   *   Number of packets received including errors (<= pkts_n).
>   */
>  static inline uint16_t
>  rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t
> pkts_n,
> -	    uint64_t *err)
> +	    uint64_t *err, bool *no_cq)
>  {
>  	const uint16_t q_n = 1 << rxq->cqe_n;
>  	const uint16_t q_mask = q_n - 1;
> @@ -663,8 +665,10 @@
>  	/* Not to cross queue end. */
>  	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
>  	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
> -	if (!pkts_n)
> +	if (!pkts_n) {
> +		*no_cq = !rcvd_pkt;
>  		return rcvd_pkt;
> +	}
>  	/* At this point, there shouldn't be any remaining packets. */
>  	MLX5_ASSERT(rxq->decompressed == 0);
> 
> @@ -1079,8 +1083,10 @@
>  			break;
>  	}
>  	/* If no new CQE seen, return without updating cq_db. */
> -	if (unlikely(!nocmp_n && comp_idx ==
> MLX5_VPMD_DESCS_PER_LOOP))
> +	if (unlikely(!nocmp_n && comp_idx ==
> MLX5_VPMD_DESCS_PER_LOOP)) {
> +		*no_cq = true;
>  		return rcvd_pkt;
> +	}
>  	/* Update the consumer indexes for non-compressed CQEs. */
>  	MLX5_ASSERT(nocmp_n <= pkts_n);
>  	rxq->cq_ci += nocmp_n;
> @@ -1108,6 +1114,7 @@
>  	}
>  	rte_compiler_barrier();
>  	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
> +	*no_cq = !rcvd_pkt;
>  	return rcvd_pkt;
>  }
> 
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> index ecafbf8..3007c03 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> @@ -378,13 +378,15 @@
>   * @param[out] err
>   *   Pointer to a flag. Set non-zero value if pkts array has at least one error
>   *   packet to handle.
> + * @param[out] no_cq
> + *   Pointer to a boolean. Set true if no new CQE seen.
>   *
>   * @return
>   *   Number of packets received including errors (<= pkts_n).
>   */
>  static inline uint16_t
>  rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t
> pkts_n,
> -	    uint64_t *err)
> +	    uint64_t *err, bool *no_cq)
>  {
>  	const uint16_t q_n = 1 << rxq->cqe_n;
>  	const uint16_t q_mask = q_n - 1;
> @@ -485,8 +487,10 @@
>  	/* Not to cross queue end. */
>  	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
>  	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
> -	if (!pkts_n)
> +	if (!pkts_n) {
> +		*no_cq = !rcvd_pkt;
>  		return rcvd_pkt;
> +	}
>  	/* At this point, there shouldn't be any remained packets. */
>  	MLX5_ASSERT(rxq->decompressed == 0);
>  	/*
> @@ -745,8 +749,10 @@
>  			break;
>  	}
>  	/* If no new CQE seen, return without updating cq_db. */
> -	if (unlikely(!nocmp_n && comp_idx ==
> MLX5_VPMD_DESCS_PER_LOOP))
> +	if (unlikely(!nocmp_n && comp_idx ==
> MLX5_VPMD_DESCS_PER_LOOP)) {
> +		*no_cq = true;
>  		return rcvd_pkt;
> +	}
>  	/* Update the consumer indexes for non-compressed CQEs. */
>  	MLX5_ASSERT(nocmp_n <= pkts_n);
>  	rxq->cq_ci += nocmp_n;
> @@ -774,6 +780,7 @@
>  	}
>  	rte_cio_wmb();
>  	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
> +	*no_cq = !rcvd_pkt;
>  	return rcvd_pkt;
>  }
> 
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> index 6847ae7..da5960a 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> @@ -385,13 +385,15 @@
>   * @param[out] err
>   *   Pointer to a flag. Set non-zero value if pkts array has at least one error
>   *   packet to handle.
> + * @param[out] no_cq
> + *   Pointer to a boolean. Set true if no new CQE seen.
>   *
>   * @return
>   *   Number of packets received including errors (<= pkts_n).
>   */
>  static inline uint16_t
>  rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t
> pkts_n,
> -	    uint64_t *err)
> +	    uint64_t *err, bool *no_cq)
>  {
>  	const uint16_t q_n = 1 << rxq->cqe_n;
>  	const uint16_t q_mask = q_n - 1;
> @@ -473,8 +475,10 @@
>  	/* Not to cross queue end. */
>  	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
>  	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
> -	if (!pkts_n)
> +	if (!pkts_n) {
> +		*no_cq = !rcvd_pkt;
>  		return rcvd_pkt;
> +	}
>  	/* At this point, there shouldn't be any remained packets. */
>  	MLX5_ASSERT(rxq->decompressed == 0);
>  	/*
> @@ -696,8 +700,10 @@
>  			break;
>  	}
>  	/* If no new CQE seen, return without updating cq_db. */
> -	if (unlikely(!nocmp_n && comp_idx ==
> MLX5_VPMD_DESCS_PER_LOOP))
> +	if (unlikely(!nocmp_n && comp_idx ==
> MLX5_VPMD_DESCS_PER_LOOP)) {
> +		*no_cq = true;
>  		return rcvd_pkt;
> +	}
>  	/* Update the consumer indexes for non-compressed CQEs. */
>  	MLX5_ASSERT(nocmp_n <= pkts_n);
>  	rxq->cq_ci += nocmp_n;
> @@ -725,6 +731,7 @@
>  	}
>  	rte_compiler_barrier();
>  	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
> +	*no_cq = !rcvd_pkt;
>  	return rcvd_pkt;
>  }
> 
> --
> 1.8.3.1
  
Raslan Darawsheh June 3, 2020, 12:33 p.m. UTC | #2
Hi,

> -----Original Message-----
> From: Alexander Kozyrev <akozyrev@mellanox.com>
> Sent: Tuesday, June 2, 2020 6:51 AM
> To: dev@dpdk.org
> Cc: stable@dpdk.org; Raslan Darawsheh <rasland@mellanox.com>; Slava
> Ovsiienko <viacheslavo@mellanox.com>
> Subject: [PATCH] net/mlx5: fix vectorized Rx burst termination
> 
> Maximum burst size of Vectorized Rx burst routine is set to
> MLX5_VPMD_RX_MAX_BURST(64). This limits the performance of any
> application that would like to gather more than 64 packets from
> the single Rx burst for batch processing (i.e. VPP).
> 
> The situation gets worse with a mix of zipped and unzipped CQEs.
> They are processed separately and the Rx burst function returns
> small number of packets every call.
> 
> Repeat the cycle of gathering packets from the vectorized Rx routine
> until a requested number of packets are collected or there are no
> more CQEs left to process.
> 
> Fixes: 6cb559d67b83 ("net/mlx5: add vectorized Rx/Tx burst for x86")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Alexander Kozyrev <akozyrev@mellanox.com>
> Acked-by: Slava Ovsiienko <viacheslavo@mellanox.com>
> ---
>  drivers/net/mlx5/mlx5_rxtx_vec.c         | 19 +++++++++++++------
>  drivers/net/mlx5/mlx5_rxtx_vec_altivec.h | 13 ++++++++++---
>  drivers/net/mlx5/mlx5_rxtx_vec_neon.h    | 13 ++++++++++---
>  drivers/net/mlx5/mlx5_rxtx_vec_sse.h     | 13 ++++++++++---
>  4 files changed, 43 insertions(+), 15 deletions(-)
> 

Patch applied to next-net-mlx,

Kindest regards
Raslan Darawsheh
  

Patch

diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c
index 1518bdd..b38bd20 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.c
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
@@ -103,13 +103,20 @@ 
 mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct mlx5_rxq_data *rxq = dpdk_rxq;
-	uint16_t nb_rx;
+	uint16_t nb_rx = 0;
+	uint16_t tn = 0;
 	uint64_t err = 0;
-
-	nb_rx = rxq_burst_v(rxq, pkts, pkts_n, &err);
-	if (unlikely(err | rxq->err_state))
-		nb_rx = rxq_handle_pending_error(rxq, pkts, nb_rx);
-	return nb_rx;
+	bool no_cq = false;
+
+	do {
+		nb_rx = rxq_burst_v(rxq, pkts + tn, pkts_n - tn, &err, &no_cq);
+		if (unlikely(err | rxq->err_state))
+			nb_rx = rxq_handle_pending_error(rxq, pkts + tn, nb_rx);
+		tn += nb_rx;
+		if (unlikely(no_cq))
+			break;
+	} while (tn != pkts_n);
+	return tn;
 }
 
 /**
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
index 26715ef..b55138a 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h
@@ -564,13 +564,15 @@ 
  * @param[out] err
  *   Pointer to a flag. Set non-zero value if pkts array has at least one error
  *   packet to handle.
+ * @param[out] no_cq
+ *  Pointer to a boolean. Set true if no new CQE seen.
  *
  * @return
  *   Number of packets received including errors (<= pkts_n).
  */
 static inline uint16_t
 rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
-	    uint64_t *err)
+	    uint64_t *err, bool *no_cq)
 {
 	const uint16_t q_n = 1 << rxq->cqe_n;
 	const uint16_t q_mask = q_n - 1;
@@ -663,8 +665,10 @@ 
 	/* Not to cross queue end. */
 	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
 	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
-	if (!pkts_n)
+	if (!pkts_n) {
+		*no_cq = !rcvd_pkt;
 		return rcvd_pkt;
+	}
 	/* At this point, there shouldn't be any remaining packets. */
 	MLX5_ASSERT(rxq->decompressed == 0);
 
@@ -1079,8 +1083,10 @@ 
 			break;
 	}
 	/* If no new CQE seen, return without updating cq_db. */
-	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP))
+	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
+		*no_cq = true;
 		return rcvd_pkt;
+	}
 	/* Update the consumer indexes for non-compressed CQEs. */
 	MLX5_ASSERT(nocmp_n <= pkts_n);
 	rxq->cq_ci += nocmp_n;
@@ -1108,6 +1114,7 @@ 
 	}
 	rte_compiler_barrier();
 	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+	*no_cq = !rcvd_pkt;
 	return rcvd_pkt;
 }
 
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index ecafbf8..3007c03 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -378,13 +378,15 @@ 
  * @param[out] err
  *   Pointer to a flag. Set non-zero value if pkts array has at least one error
  *   packet to handle.
+ * @param[out] no_cq
+ *   Pointer to a boolean. Set true if no new CQE seen.
  *
  * @return
  *   Number of packets received including errors (<= pkts_n).
  */
 static inline uint16_t
 rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
-	    uint64_t *err)
+	    uint64_t *err, bool *no_cq)
 {
 	const uint16_t q_n = 1 << rxq->cqe_n;
 	const uint16_t q_mask = q_n - 1;
@@ -485,8 +487,10 @@ 
 	/* Not to cross queue end. */
 	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
 	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
-	if (!pkts_n)
+	if (!pkts_n) {
+		*no_cq = !rcvd_pkt;
 		return rcvd_pkt;
+	}
 	/* At this point, there shouldn't be any remained packets. */
 	MLX5_ASSERT(rxq->decompressed == 0);
 	/*
@@ -745,8 +749,10 @@ 
 			break;
 	}
 	/* If no new CQE seen, return without updating cq_db. */
-	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP))
+	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
+		*no_cq = true;
 		return rcvd_pkt;
+	}
 	/* Update the consumer indexes for non-compressed CQEs. */
 	MLX5_ASSERT(nocmp_n <= pkts_n);
 	rxq->cq_ci += nocmp_n;
@@ -774,6 +780,7 @@ 
 	}
 	rte_cio_wmb();
 	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+	*no_cq = !rcvd_pkt;
 	return rcvd_pkt;
 }
 
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index 6847ae7..da5960a 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -385,13 +385,15 @@ 
  * @param[out] err
  *   Pointer to a flag. Set non-zero value if pkts array has at least one error
  *   packet to handle.
+ * @param[out] no_cq
+ *   Pointer to a boolean. Set true if no new CQE seen.
  *
  * @return
  *   Number of packets received including errors (<= pkts_n).
  */
 static inline uint16_t
 rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n,
-	    uint64_t *err)
+	    uint64_t *err, bool *no_cq)
 {
 	const uint16_t q_n = 1 << rxq->cqe_n;
 	const uint16_t q_mask = q_n - 1;
@@ -473,8 +475,10 @@ 
 	/* Not to cross queue end. */
 	pkts_n = RTE_MIN(pkts_n, q_n - elts_idx);
 	pkts_n = RTE_MIN(pkts_n, q_n - cq_idx);
-	if (!pkts_n)
+	if (!pkts_n) {
+		*no_cq = !rcvd_pkt;
 		return rcvd_pkt;
+	}
 	/* At this point, there shouldn't be any remained packets. */
 	MLX5_ASSERT(rxq->decompressed == 0);
 	/*
@@ -696,8 +700,10 @@ 
 			break;
 	}
 	/* If no new CQE seen, return without updating cq_db. */
-	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP))
+	if (unlikely(!nocmp_n && comp_idx == MLX5_VPMD_DESCS_PER_LOOP)) {
+		*no_cq = true;
 		return rcvd_pkt;
+	}
 	/* Update the consumer indexes for non-compressed CQEs. */
 	MLX5_ASSERT(nocmp_n <= pkts_n);
 	rxq->cq_ci += nocmp_n;
@@ -725,6 +731,7 @@ 
 	}
 	rte_compiler_barrier();
 	*rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+	*no_cq = !rcvd_pkt;
 	return rcvd_pkt;
 }