[v4,3/3] net/octeon_ep: use AVX2 instructions for Rx

Message ID 20231207064941.1256-3-pbhagavatula@marvell.com (mailing list archive)
State Changes Requested
Delegated to: Jerin Jacob
Headers
Series [v4,1/3] net/octeon_ep: optimize Rx and Tx routines |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/github-robot: build success github build: passed
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS
ci/intel-Functional success Functional PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-abi-testing success Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-unit-arm64-testing success Testing PASS
ci/iol-unit-amd64-testing success Testing PASS
ci/iol-compile-amd64-testing success Testing PASS
ci/iol-compile-arm64-testing success Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-sample-apps-testing success Testing PASS

Commit Message

Pavan Nikhilesh Bhagavatula Dec. 7, 2023, 6:49 a.m. UTC
  From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Optimize Rx routine to use AVX2 instructions when underlying
architecture supports it.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 doc/guides/rel_notes/release_24_03.rst |   5 +
 drivers/net/octeon_ep/cnxk_ep_rx_avx.c | 123 +++++++++++++++++++++++++
 drivers/net/octeon_ep/meson.build      |  12 +++
 drivers/net/octeon_ep/otx_ep_ethdev.c  |  10 ++
 drivers/net/octeon_ep/otx_ep_rxtx.h    |   6 ++
 5 files changed, 156 insertions(+)
 create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx_avx.c
  

Comments

Jerin Jacob Dec. 11, 2023, 12:05 p.m. UTC | #1
On Thu, Dec 7, 2023 at 2:03 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Optimize Rx routine to use AVX2 instructions when underlying
> architecture supports it.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
>  doc/guides/rel_notes/release_24_03.rst |   5 +
>  drivers/net/octeon_ep/cnxk_ep_rx_avx.c | 123 +++++++++++++++++++++++++
>  drivers/net/octeon_ep/meson.build      |  12 +++
>  drivers/net/octeon_ep/otx_ep_ethdev.c  |  10 ++
>  drivers/net/octeon_ep/otx_ep_rxtx.h    |   6 ++
>  5 files changed, 156 insertions(+)
>  create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx_avx.c
>
> diff --git a/doc/guides/rel_notes/release_24_03.rst b/doc/guides/rel_notes/release_24_03.rst
> index 6f8ad27808..2191dd78e7 100644
> --- a/doc/guides/rel_notes/release_24_03.rst
> +++ b/doc/guides/rel_notes/release_24_03.rst
> @@ -55,6 +55,11 @@ New Features
>       Also, make sure to start the actual text at the margin.
>       =======================================================
>
> +* **Updated Marvell Octeon ep driver.**
> +
> +  * Added SSE/AVX2 Rx routines.
> +  * Updated Tx queue thresholds.
Please add little  more info in Tx queue one.

Also, Split doc changes to respective patches.
  

Patch

diff --git a/doc/guides/rel_notes/release_24_03.rst b/doc/guides/rel_notes/release_24_03.rst
index 6f8ad27808..2191dd78e7 100644
--- a/doc/guides/rel_notes/release_24_03.rst
+++ b/doc/guides/rel_notes/release_24_03.rst
@@ -55,6 +55,11 @@  New Features
      Also, make sure to start the actual text at the margin.
      =======================================================
 
+* **Updated Marvell Octeon ep driver.**
+
+  * Added SSE/AVX2 Rx routines.
+  * Updated Tx queue thresholds.
+
 
 Removed Items
 -------------
diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_avx.c b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
new file mode 100644
index 0000000000..ae4615e6da
--- /dev/null
+++ b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c
@@ -0,0 +1,123 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell.
+ */
+
+#include "cnxk_ep_rx.h"
+
+static __rte_always_inline void
+cnxk_ep_process_pkts_vec_avx(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts)
+{
+	struct rte_mbuf **recv_buf_list = droq->recv_buf_list;
+	uint32_t bytes_rsvd = 0, read_idx = droq->read_idx;
+	const uint64_t rearm_data = droq->rearm_data;
+	struct rte_mbuf *m[CNXK_EP_OQ_DESC_PER_LOOP_AVX];
+	uint32_t pidx[CNXK_EP_OQ_DESC_PER_LOOP_AVX];
+	uint32_t idx[CNXK_EP_OQ_DESC_PER_LOOP_AVX];
+	uint16_t nb_desc = droq->nb_desc;
+	uint16_t pkts = 0;
+	uint8_t i;
+
+	idx[0] = read_idx;
+	while (pkts < new_pkts) {
+		__m256i data[CNXK_EP_OQ_DESC_PER_LOOP_AVX];
+		/* mask to shuffle from desc. to mbuf (2 descriptors)*/
+		const __m256i mask =
+			_mm256_set_epi8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 20, 21, 0xFF, 0xFF, 20,
+					21, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+					0xFF, 0xFF, 0xFF, 7, 6, 5, 4, 3, 2, 1, 0);
+
+		/* Load indexes. */
+		for (i = 1; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
+			idx[i] = otx_ep_incr_index(idx[i - 1], 1, nb_desc);
+
+		/* Prefetch next indexes. */
+		if (new_pkts - pkts > 8) {
+			pidx[0] = otx_ep_incr_index(idx[i - 1], 1, nb_desc);
+			for (i = 1; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
+				pidx[i] = otx_ep_incr_index(pidx[i - 1], 1, nb_desc);
+
+			for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++) {
+				rte_prefetch0(recv_buf_list[pidx[i]]);
+				rte_prefetch0(rte_pktmbuf_mtod(recv_buf_list[pidx[i]], void *));
+			}
+		}
+
+		/* Load mbuf array. */
+		for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
+			m[i] = recv_buf_list[idx[i]];
+
+		/* Load rearm data and packet length for shuffle. */
+		for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
+			data[i] = _mm256_set_epi64x(0,
+				rte_pktmbuf_mtod(m[i], struct otx_ep_droq_info *)->length >> 16,
+				0, rearm_data);
+
+		/* Shuffle data to its place and sum the packet length. */
+		for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++) {
+			data[i] = _mm256_shuffle_epi8(data[i], mask);
+			bytes_rsvd += _mm256_extract_epi16(data[i], 10);
+		}
+
+		/* Store the 256bit data to the mbuf. */
+		for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
+			_mm256_storeu_si256((__m256i *)&m[i]->rearm_data, data[i]);
+
+		for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++)
+			rx_pkts[pkts++] = m[i];
+		idx[0] = otx_ep_incr_index(idx[i - 1], 1, nb_desc);
+	}
+	droq->read_idx = idx[0];
+
+	droq->refill_count += new_pkts;
+	droq->pkts_pending -= new_pkts;
+	/* Stats */
+	droq->stats.pkts_received += new_pkts;
+	droq->stats.bytes_received += bytes_rsvd;
+}
+
+uint16_t __rte_noinline __rte_hot
+cnxk_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts, vpkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
+	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD)
+		cnxk_ep_rx_refill(droq);
+
+	return new_pkts;
+}
+
+uint16_t __rte_noinline __rte_hot
+cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+{
+	struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue;
+	uint16_t new_pkts, vpkts;
+
+	new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts);
+	vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX);
+	cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts);
+	cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts);
+
+	/* Refill RX buffers */
+	if (droq->refill_count >= DROQ_REFILL_THRESHOLD) {
+		cnxk_ep_rx_refill(droq);
+	} else {
+		/* SDP output goes into DROP state when output doorbell count
+		 * goes below drop count. When door bell count is written with
+		 * a value greater than drop count SDP output should come out
+		 * of DROP state. Due to a race condition this is not happening.
+		 * Writing doorbell register with 0 again may make SDP output
+		 * come out of this state.
+		 */
+
+		rte_write32(0, droq->pkts_credit_reg);
+	}
+
+	return new_pkts;
+}
diff --git a/drivers/net/octeon_ep/meson.build b/drivers/net/octeon_ep/meson.build
index feba1fdf25..e8ae56018d 100644
--- a/drivers/net/octeon_ep/meson.build
+++ b/drivers/net/octeon_ep/meson.build
@@ -15,6 +15,18 @@  sources = files(
 
 if arch_subdir == 'x86'
     sources += files('cnxk_ep_rx_sse.c')
+    if cc.get_define('__AVX2__', args: machine_args) != ''
+        cflags += ['-DCC_AVX2_SUPPORT']
+        sources += files('cnxk_ep_rx_avx.c')
+    elif cc.has_argument('-mavx2')
+        cflags += ['-DCC_AVX2_SUPPORT']
+        otx_ep_avx2_lib = static_library('otx_ep_avx2_lib',
+                        'cnxk_ep_rx_avx.c',
+                        dependencies: [static_rte_ethdev, static_rte_pci, static_rte_bus_pci],
+                        include_directories: includes,
+                        c_args: [cflags, '-mavx2'])
+        objs += otx_ep_avx2_lib.extract_objects('cnxk_ep_rx_avx.c')
+    endif
 endif
 
 extra_flags = ['-Wno-strict-aliasing']
diff --git a/drivers/net/octeon_ep/otx_ep_ethdev.c b/drivers/net/octeon_ep/otx_ep_ethdev.c
index 51b34cdaa0..42a97ea110 100644
--- a/drivers/net/octeon_ep/otx_ep_ethdev.c
+++ b/drivers/net/octeon_ep/otx_ep_ethdev.c
@@ -54,6 +54,11 @@  otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
 		eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts;
 #ifdef RTE_ARCH_X86
 		eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_sse;
+#ifdef CC_AVX2_SUPPORT
+		if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
+		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
+			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_avx;
+#endif
 #endif
 		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
 			eth_dev->rx_pkt_burst = &cnxk_ep_recv_pkts_mseg;
@@ -61,6 +66,11 @@  otx_ep_set_rx_func(struct rte_eth_dev *eth_dev)
 		eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts;
 #ifdef RTE_ARCH_X86
 		eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_sse;
+#ifdef CC_AVX2_SUPPORT
+		if (rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256 &&
+		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1)
+			eth_dev->rx_pkt_burst = &cn9k_ep_recv_pkts_avx;
+#endif
 #endif
 
 		if (otx_epvf->rx_offloads & RTE_ETH_RX_OFFLOAD_SCATTER)
diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h b/drivers/net/octeon_ep/otx_ep_rxtx.h
index efc41a8275..0adcbc7814 100644
--- a/drivers/net/octeon_ep/otx_ep_rxtx.h
+++ b/drivers/net/octeon_ep/otx_ep_rxtx.h
@@ -51,6 +51,9 @@  cnxk_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 uint16_t
 cnxk_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
+uint16_t
+cnxk_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
 uint16_t
 cnxk_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
@@ -60,6 +63,9 @@  cn9k_ep_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 uint16_t
 cn9k_ep_recv_pkts_sse(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 
+uint16_t
+cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
+
 uint16_t
 cn9k_ep_recv_pkts_mseg(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t budget);
 #endif /* _OTX_EP_RXTX_H_ */