net/bnxt: reduce barriers in NEON vector Rx

Message ID 20220613062225.2317537-1-ruifeng.wang@arm.com (mailing list archive)
State Accepted, archived
Delegated to: Ajit Khaparde
Headers
Series net/bnxt: reduce barriers in NEON vector Rx

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/github-robot: build fail github build: failed
ci/iol-x86_64-compile-testing success Testing PASS
ci/iol-aarch64-compile-testing success Testing PASS
ci/iol-aarch64-unit-testing success Testing PASS
ci/iol-x86_64-unit-testing success Testing PASS
ci/iol-abi-testing success Testing PASS

Commit Message

Ruifeng Wang June 13, 2022, 6:22 a.m. UTC
  To read descriptors in the expected order, a barrier is inserted after
each descriptor read. This excessive use of barriers is unnecessary and
can cause a performance drop.

Remove the barriers between descriptor reads, and change the counting of
valid packets to handle discontinuous valid packets: out-of-order reads
can result in the fetched valid descriptors being discontinuous.

In a VPP L3 routing test, a 6% performance gain was observed. The test
was done on a platform with a ThunderX2 CPU and a Broadcom PS225 NIC.

Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 drivers/net/bnxt/bnxt_rxtx_vec_neon.c | 47 ++++++++++++++-------------
 1 file changed, 25 insertions(+), 22 deletions(-)
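
The change batches the four descriptor loads, replaces the per-load
rte_io_rmb() with a single acquire fence, and then reloads the lower 64
bits of each descriptor after the fence. Below is a minimal standalone
sketch of that pattern, assuming a flat ring of 16-byte descriptors;
read_descs, desc_ring and the index arithmetic are hypothetical
stand-ins for the driver's cp_desc_ring accesses, and the GCC builtin
fence stands in for rte_atomic_thread_fence() so the sketch builds
without DPDK headers (aarch64 only).

#include <arm_neon.h>
#include <stdint.h>

/*
 * Old pattern: one barrier per descriptor read, i.e.
 *	out[3] = vld1q_u32(&desc_ring[28]);
 *	rte_io_rmb();
 * repeated for each descriptor.
 */
void read_descs(const uint32_t *desc_ring, uint32x4_t out[4])
{
	/* Load the odd descriptors first; their upper halves carry
	 * the valid bits checked later. */
	out[3] = vld1q_u32(&desc_ring[28]);
	out[2] = vld1q_u32(&desc_ring[20]);
	out[1] = vld1q_u32(&desc_ring[12]);
	out[0] = vld1q_u32(&desc_ring[4]);

	/* One acquire fence orders all loads above against the
	 * reloads below (rte_atomic_thread_fence(__ATOMIC_ACQUIRE)
	 * in the driver). */
	__atomic_thread_fence(__ATOMIC_ACQUIRE);

	/* Reload the lower 64 bits of each descriptor so they are
	 * observed no earlier than the valid bits. */
	for (int i = 0; i < 4; i++)
		out[i] = vreinterpretq_u32_u64(vld1q_lane_u64(
				(const uint64_t *)&desc_ring[8 * i + 4],
				vreinterpretq_u64_u32(out[i]), 0));
}

A single fence suffices because only the ordering between the valid-bit
loads and the subsequent reloads matters; the descriptor loads need not
be ordered among themselves, which is also why the valid count must now
tolerate discontinuous valid descriptors.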
  

Comments

Ajit Khaparde June 26, 2022, 8:44 p.m. UTC | #1
On Sun, Jun 12, 2022 at 11:22 PM Ruifeng Wang <ruifeng.wang@arm.com> wrote:
>
> To read descriptors in the expected order, a barrier is inserted after
> each descriptor read. This excessive use of barriers is unnecessary and
> can cause a performance drop.
>
> Remove the barriers between descriptor reads, and change the counting of
> valid packets to handle discontinuous valid packets: out-of-order reads
> can result in the fetched valid descriptors being discontinuous.
>
> In a VPP L3 routing test, a 6% performance gain was observed. The test
> was done on a platform with a ThunderX2 CPU and a Broadcom PS225 NIC.
>
> Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>

Patch applied to dpdk-next-net-brcm. Thanks.

>
> [...]

Patch

diff --git a/drivers/net/bnxt/bnxt_rxtx_vec_neon.c b/drivers/net/bnxt/bnxt_rxtx_vec_neon.c
index 32f8e59b3a..6a4ece681b 100644
--- a/drivers/net/bnxt/bnxt_rxtx_vec_neon.c
+++ b/drivers/net/bnxt/bnxt_rxtx_vec_neon.c
@@ -235,34 +235,32 @@  recv_burst_vec_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		 * IO barriers are used to ensure consistent state.
 		 */
 		rxcmp1[3] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 7]);
-		rte_io_rmb();
+		rxcmp1[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 5]);
+		rxcmp1[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 3]);
+		rxcmp1[0] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 1]);
+
+		/* Use acquire fence to order loads of descriptor words. */
+		rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
 		/* Reload lower 64b of descriptors to make it ordered after info3_v. */
 		rxcmp1[3] = vreinterpretq_u32_u64(vld1q_lane_u64
 				((void *)&cpr->cp_desc_ring[cons + 7],
 				vreinterpretq_u64_u32(rxcmp1[3]), 0));
-		rxcmp[3] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 6]);
-
-		rxcmp1[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 5]);
-		rte_io_rmb();
 		rxcmp1[2] = vreinterpretq_u32_u64(vld1q_lane_u64
 				((void *)&cpr->cp_desc_ring[cons + 5],
 				vreinterpretq_u64_u32(rxcmp1[2]), 0));
-		rxcmp[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 4]);
-
-		t1 = vreinterpretq_u64_u32(vzip2q_u32(rxcmp1[2], rxcmp1[3]));
-
-		rxcmp1[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 3]);
-		rte_io_rmb();
 		rxcmp1[1] = vreinterpretq_u32_u64(vld1q_lane_u64
 				((void *)&cpr->cp_desc_ring[cons + 3],
 				vreinterpretq_u64_u32(rxcmp1[1]), 0));
-		rxcmp[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 2]);
-
-		rxcmp1[0] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 1]);
-		rte_io_rmb();
 		rxcmp1[0] = vreinterpretq_u32_u64(vld1q_lane_u64
 				((void *)&cpr->cp_desc_ring[cons + 1],
 				vreinterpretq_u64_u32(rxcmp1[0]), 0));
+
+		rxcmp[3] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 6]);
+		rxcmp[2] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 4]);
+
+		t1 = vreinterpretq_u64_u32(vzip2q_u32(rxcmp1[2], rxcmp1[3]));
+
+		rxcmp[1] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 2]);
 		rxcmp[0] = vld1q_u32((void *)&cpr->cp_desc_ring[cons + 0]);
 
 		t0 = vreinterpretq_u64_u32(vzip2q_u32(rxcmp1[0], rxcmp1[1]));
@@ -278,16 +276,21 @@  recv_burst_vec_neon(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
 		 * bits and count the number of set bits in order to determine
 		 * the number of valid descriptors.
 		 */
-		valid = vget_lane_u64(vreinterpret_u64_u16(vqmovn_u32(info3_v)),
-				      0);
+		valid = vget_lane_u64(vreinterpret_u64_s16(vshr_n_s16
+				(vreinterpret_s16_u16(vshl_n_u16
+				(vqmovn_u32(info3_v), 15)), 15)), 0);
+
 		/*
 		 * At this point, 'valid' is a 64-bit value containing four
-		 * 16-bit fields, each of which is either 0x0001 or 0x0000.
-		 * Compute number of valid descriptors from the index of
-		 * the highest non-zero field.
+		 * 16-bit fields, each of which is either 0xffff or 0x0000.
+		 * Count the number of consecutive 1s from LSB in order to
+		 * determine the number of valid descriptors.
 		 */
-		num_valid = (sizeof(uint64_t) / sizeof(uint16_t)) -
-				(__builtin_clzl(valid & desc_valid_mask) / 16);
+		valid = ~(valid & desc_valid_mask);
+		if (valid == 0)
+			num_valid = 4;
+		else
+			num_valid = __builtin_ctzl(valid) / 16;
 
 		if (num_valid == 0)
 			break;
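
The new count turns each 16-bit field of 'valid' into all-ones or
all-zeros (shift left by 15 then arithmetic shift right by 15
sign-extends the low bit), inverts the result, and counts trailing
zeros, so only descriptors that are valid consecutively from descriptor
0 are counted. A standalone sketch of just that computation, with a
made-up lane pattern and desc_valid_mask simplified to all-ones
(aarch64 only):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Pretend descriptors 0, 1 and 3 read as valid while 2 does
	 * not, as can happen when loads complete out of order. */
	uint32_t lanes[4] = {1, 1, 0, 1};
	uint32x4_t info3_v = vld1q_u32(lanes);
	uint64_t desc_valid_mask = ~0ULL;
	uint32_t num_valid;

	/* Narrow each lane to 16 bits, then expand the low bit to
	 * fill the field: 0x0001 -> 0xffff, 0x0000 -> 0x0000. */
	uint64_t valid = vget_lane_u64(vreinterpret_u64_s16(vshr_n_s16
			(vreinterpret_s16_u16(vshl_n_u16
			(vqmovn_u32(info3_v), 15)), 15)), 0);

	/* Invert and count trailing zeros: 16 zero bits per valid
	 * descriptor, starting from descriptor 0. */
	valid = ~(valid & desc_valid_mask);
	if (valid == 0)
		num_valid = 4;
	else
		num_valid = __builtin_ctzl(valid) / 16;

	printf("num_valid = %u\n", num_valid);	/* prints 2 */
	return 0;
}

With this lane pattern the result is 2: descriptor 3 reads as valid but
is not counted because descriptor 2 is not. The old clz-based
computation assumed the valid fields were contiguous, which the
per-descriptor barriers used to guarantee.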