diff mbox series

[v4,3/4] net/iavf: add offload path for Rx AVX512

Message ID 1617947944-130983-4-git-send-email-wenzhuo.lu@intel.com (mailing list archive)
State Superseded
Delegated to: Qi Zhang
Headers show
Series add Rx/Tx offload paths for IAVF AVX512 | expand

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Wenzhuo Lu April 9, 2021, 5:59 a.m. UTC
Add a dedicated offload path for Rx AVX512 (traditional).
This path supports the HW offload features, such as
checksum, VLAN stripping and RSS hash.
The path is chosen automatically according to the
configuration.

The shared Rx code is marked 'inline', so the duplicated code
for each path is generated by the compiler.

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 drivers/net/iavf/iavf_rxtx.c            | 105 +++++++---
 drivers/net/iavf/iavf_rxtx.h            |  12 ++
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 353 ++++++++++++++++++++------------
 drivers/net/iavf/iavf_rxtx_vec_common.h |  17 +-
 4 files changed, 324 insertions(+), 163 deletions(-)
diff mbox series

Patch

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 099ede7..ca01ed9 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2392,22 +2392,23 @@ 
 #ifdef RTE_ARCH_X86
 	struct iavf_rx_queue *rxq;
 	int i;
+	int check_ret;
+	bool use_sse = false;
 	bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
 	bool use_avx512 = false;
-#endif
+	bool use_flex = false;
 
-	if (!iavf_rx_vec_dev_check(dev) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-		for (i = 0; i < dev->data->nb_rx_queues; i++) {
-			rxq = dev->data->rx_queues[i];
-			(void)iavf_rxq_vec_setup(rxq);
+	check_ret = iavf_rx_vec_dev_check(dev);
+	if (check_ret >= 0 &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+		if (check_ret == IAVF_VECTOR_PATH) {
+			use_sse = true;
+			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+			    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+				use_avx2 = true;
 		}
 
-		if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-			use_avx2 = true;
 #ifdef CC_AVX512_SUPPORT
 		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2415,13 +2416,38 @@ 
 			use_avx512 = true;
 #endif
 
+		if (!use_sse && !use_avx2 && !use_avx512)
+			goto normal;
+
+		if (vf->vf_res->vf_cap_flags &
+			VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			use_flex = true;
+			if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
+				use_flex = false;
+		}
+
+		for (i = 0; i < dev->data->nb_rx_queues; i++) {
+			rxq = dev->data->rx_queues[i];
+			(void)iavf_rxq_vec_setup(rxq);
+		}
+
 		if (dev->data->scattered_rx) {
-			PMD_DRV_LOG(DEBUG,
-				    "Using %sVector Scattered Rx (port %d).",
-				    use_avx2 ? "avx2 " : "",
-				    dev->data->port_id);
-			if (vf->vf_res->vf_cap_flags &
-				VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			if (!use_avx512) {
+				PMD_DRV_LOG(DEBUG,
+					    "Using %sVector Scattered Rx (port %d).",
+					    use_avx2 ? "avx2 " : "",
+					    dev->data->port_id);
+			} else {
+				if (check_ret == IAVF_VECTOR_PATH)
+					PMD_DRV_LOG(DEBUG,
+						    "Using AVX512 Vector Scattered Rx (port %d).",
+						    dev->data->port_id);
+				else
+					PMD_DRV_LOG(DEBUG,
+						    "Using AVX512 OFFLOAD Vector Scattered Rx (port %d).",
+						    dev->data->port_id);
+			}
+			if (use_flex) {
 				dev->rx_pkt_burst = use_avx2 ?
 					iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
 					iavf_recv_scattered_pkts_vec_flex_rxd;
@@ -2435,17 +2461,32 @@ 
 					iavf_recv_scattered_pkts_vec_avx2 :
 					iavf_recv_scattered_pkts_vec;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_scattered_pkts_vec_avx512;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_scattered_pkts_vec_avx512_offload;
+				}
 #endif
 			}
 		} else {
-			PMD_DRV_LOG(DEBUG, "Using %sVector Rx (port %d).",
-				    use_avx2 ? "avx2 " : "",
-				    dev->data->port_id);
-			if (vf->vf_res->vf_cap_flags &
-				VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+			if (!use_avx512) {
+				PMD_DRV_LOG(DEBUG, "Using %sVector Rx (port %d).",
+					    use_avx2 ? "avx2 " : "",
+					    dev->data->port_id);
+			} else {
+				if (check_ret == IAVF_VECTOR_PATH)
+					PMD_DRV_LOG(DEBUG,
+						    "Using AVX512 Vector Rx (port %d).",
+						    dev->data->port_id);
+				else
+					PMD_DRV_LOG(DEBUG,
+						    "Using AVX512 OFFLOAD Vector Rx (port %d).",
+						    dev->data->port_id);
+			}
+			if (use_flex) {
 				dev->rx_pkt_burst = use_avx2 ?
 					iavf_recv_pkts_vec_avx2_flex_rxd :
 					iavf_recv_pkts_vec_flex_rxd;
@@ -2459,17 +2500,23 @@ 
 					iavf_recv_pkts_vec_avx2 :
 					iavf_recv_pkts_vec;
 #ifdef CC_AVX512_SUPPORT
-				if (use_avx512)
-					dev->rx_pkt_burst =
-						iavf_recv_pkts_vec_avx512;
+				if (use_avx512) {
+					if (check_ret == IAVF_VECTOR_PATH)
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512;
+					else
+						dev->rx_pkt_burst =
+							iavf_recv_pkts_vec_avx512_offload;
+				}
 #endif
 			}
 		}
 
 		return;
 	}
-#endif
 
+normal:
+#endif
 	if (dev->data->scattered_rx) {
 		PMD_DRV_LOG(DEBUG, "Using a Scattered Rx callback (port=%d).",
 			    dev->data->port_id);
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index bead119..a8e5664 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -35,6 +35,12 @@ 
 		DEV_TX_OFFLOAD_UDP_CKSUM |		 \
 		DEV_TX_OFFLOAD_TCP_CKSUM)
 
+#define IAVF_RX_VECTOR_OFFLOAD (				 \
+		DEV_RX_OFFLOAD_CHECKSUM |		 \
+		DEV_RX_OFFLOAD_SCTP_CKSUM |		 \
+		DEV_RX_OFFLOAD_VLAN |		 \
+		DEV_RX_OFFLOAD_RSS_HASH)
+
 #define IAVF_VECTOR_PATH 0
 #define IAVF_VECTOR_OFFLOAD_PATH 1
 
@@ -484,12 +490,18 @@  uint16_t iavf_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 int iavf_txq_vec_setup(struct iavf_tx_queue *txq);
 uint16_t iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
 				   uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_offload(void *rx_queue,
+					   struct rte_mbuf **rx_pkts,
+					   uint16_t nb_pkts);
 uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
 					    struct rte_mbuf **rx_pkts,
 					    uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 					     struct rte_mbuf **rx_pkts,
 					     uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
+						     struct rte_mbuf **rx_pkts,
+						     uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
 						      struct rte_mbuf **rx_pkts,
 						      uint16_t nb_pkts);
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index fbbf4b9..9030ca5 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -13,6 +13,22 @@ 
 #define IAVF_DESCS_PER_LOOP_AVX 8
 #define PKTLEN_SHIFT 10
 
+/******************************************************************************
+ * If the user knows a specific offload is not enabled by the application,
+ * the corresponding macro can be commented out to save fast-path effort.
+ * Currently the 5 features below are supported in the RX path,
+ * 1, checksum offload
+ * 2, VLAN/QINQ stripping
+ * 3, RSS hash
+ * 4, packet type analysis
+ * 5, flow director ID report
+ ******************************************************************************/
+#define IAVF_RX_CSUM_OFFLOAD
+#define IAVF_RX_VLAN_OFFLOAD
+#define IAVF_RX_RSS_OFFLOAD
+#define IAVF_RX_PTYPE_OFFLOAD
+#define IAVF_RX_FDIR_OFFLOAD
+
 static __rte_always_inline void
 iavf_rxq_rearm(struct iavf_rx_queue *rxq)
 {
@@ -144,12 +160,15 @@ 
 }
 
 #define IAVF_RX_LEN_MASK 0x80808080
-static inline uint16_t
+static __rte_always_inline uint16_t
 _iavf_recv_raw_pkts_vec_avx512(struct iavf_rx_queue *rxq,
 			       struct rte_mbuf **rx_pkts,
-			       uint16_t nb_pkts, uint8_t *split_packet)
+			       uint16_t nb_pkts, uint8_t *split_packet,
+			       bool offload)
 {
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 	const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+#endif
 
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
 						    rxq->mbuf_initializer);
@@ -252,71 +271,6 @@ 
 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
 			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
 
-	/* Status/Error flag masks */
-	/**
-	 * mask everything except RSS, flow director and VLAN flags
-	 * bit2 is for VLAN tag, bit11 for flow director indication
-	 * bit13:12 for RSS indication. Bits 3-5 of error
-	 * field (bits 22-24) are for IP/L4 checksum errors
-	 */
-	const __m256i flags_mask =
-		_mm256_set1_epi32((1 << 2) | (1 << 11) |
-				  (3 << 12) | (7 << 22));
-	/**
-	 * data to be shuffled by result of flag mask. If VLAN bit is set,
-	 * (bit 2), then position 4 in this array will be used in the
-	 * destination
-	 */
-	const __m256i vlan_flags_shuf =
-		_mm256_set_epi32(0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
-				 0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);
-	/**
-	 * data to be shuffled by result of flag mask, shifted down 11.
-	 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
-	 * place.
-	 */
-	const __m256i rss_flags_shuf =
-		_mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
-				0, 0, 0, 0, PKT_RX_FDIR, 0,/* end up 128-bits */
-				0, 0, 0, 0, 0, 0, 0, 0,
-				PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
-				0, 0, 0, 0, PKT_RX_FDIR, 0);
-
-	/**
-	 * data to be shuffled by the result of the flags mask shifted by 22
-	 * bits.  This gives use the l3_l4 flags.
-	 */
-	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-			/* shift right 1 bit to make sure it not exceed 255 */
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
-			 PKT_RX_L4_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
-			PKT_RX_IP_CKSUM_BAD >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
-			/* second 128-bits */
-			0, 0, 0, 0, 0, 0, 0, 0,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
-			 PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
-			 PKT_RX_L4_CKSUM_BAD) >> 1,
-			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
-			PKT_RX_IP_CKSUM_BAD >> 1,
-			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
-
-	const __m256i cksum_mask =
-		_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
-				  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
-				  PKT_RX_OUTER_IP_CKSUM_BAD);
-
 	uint16_t i, received;
 
 	for (i = 0, received = 0; i < nb_pkts;
@@ -384,6 +338,7 @@ 
 		__m512i mb4_7 = _mm512_shuffle_epi8(desc4_7, shuf_msk);
 
 		mb4_7 = _mm512_add_epi32(mb4_7, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/**
 		 * to get packet types, shift 64-bit values down 30 bits
 		 * and so ptype is in lower 8-bits in each
@@ -402,6 +357,7 @@ 
 			 0, 0, 0, type_table[ptype5],
 			 0, 0, 0, type_table[ptype4]);
 		mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
+#endif
 
 		/**
 		 * convert descriptors 0-3 into mbufs, adjusting length and
@@ -415,6 +371,7 @@ 
 		__m512i mb0_3 = _mm512_shuffle_epi8(desc0_3, shuf_msk);
 
 		mb0_3 = _mm512_add_epi32(mb0_3, crc_adjust);
+#ifdef IAVF_RX_PTYPE_OFFLOAD
 		/* get the packet types */
 		const __m512i ptypes0_3 = _mm512_srli_epi64(desc0_3, 30);
 		const __m256i ptypes2_3 = _mm512_extracti64x4_epi64(ptypes0_3, 1);
@@ -430,6 +387,7 @@ 
 			 0, 0, 0, type_table[ptype1],
 			 0, 0, 0, type_table[ptype0]);
 		mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
+#endif
 
 		/**
 		 * use permute/extract to get status content
@@ -449,27 +407,122 @@ 
 
 		/* now do flag manipulation */
 
-		/* get only flag/error bits we want */
-		const __m256i flag_bits =
-			_mm256_and_si256(status0_7, flags_mask);
-		/* set vlan and rss flags */
-		const __m256i vlan_flags =
-			_mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
-		const __m256i rss_flags =
-			_mm256_shuffle_epi8(rss_flags_shuf,
-					    _mm256_srli_epi32(flag_bits, 11));
-		/**
-		 * l3_l4_error flags, shuffle, then shift to correct adjustment
-		 * of flags in flags_shuf, and finally mask out extra bits
-		 */
-		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
-						_mm256_srli_epi32(flag_bits, 22));
-		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
-		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
-
 		/* merge flags */
-		const __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
-				_mm256_or_si256(rss_flags, vlan_flags));
+		__m256i mbuf_flags = _mm256_set1_epi32(0);
+
+		if (offload) {
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/* Status/Error flag masks */
+			/**
+			 * mask everything except RSS, flow director and VLAN flags
+			 * bit2 is for VLAN tag, bit11 for flow director indication
+			 * bit13:12 for RSS indication. Bits 3-5 of error
+			 * field (bits 22-24) are for IP/L4 checksum errors
+			 */
+			const __m256i flags_mask =
+				_mm256_set1_epi32((1 << 2) | (1 << 11) |
+						  (3 << 12) | (7 << 22));
+#endif
+
+#ifdef IAVF_RX_VLAN_OFFLOAD
+			/**
+			 * data to be shuffled by result of flag mask. If VLAN bit is set,
+			 * (bit 2), then position 4 in this array will be used in the
+			 * destination
+			 */
+			const __m256i vlan_flags_shuf =
+				_mm256_set_epi32(0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
+						 0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);
+#endif
+
+#ifdef IAVF_RX_RSS_OFFLOAD
+			/**
+			 * data to be shuffled by result of flag mask, shifted down 11.
+			 * If RSS/FDIR bits are set, shuffle moves appropriate flags in
+			 * place.
+			 */
+			const __m256i rss_flags_shuf =
+				_mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+						PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
+						0, 0, 0, 0, PKT_RX_FDIR, 0,/* end up 128-bits */
+						0, 0, 0, 0, 0, 0, 0, 0,
+						PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH,
+						0, 0, 0, 0, PKT_RX_FDIR, 0);
+#endif
+
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			/**
+			 * data to be shuffled by the result of the flags mask shifted by 22
+			 * bits.  This gives us the l3_l4 flags.
+			 */
+			const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+					/* shift right 1 bit to make sure it does not exceed 255 */
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
+					 PKT_RX_L4_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+					PKT_RX_IP_CKSUM_BAD >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
+					/* second 128-bits */
+					0, 0, 0, 0, 0, 0, 0, 0,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+					 PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD |
+					 PKT_RX_L4_CKSUM_BAD) >> 1,
+					(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_OUTER_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+					PKT_RX_IP_CKSUM_BAD >> 1,
+					(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
+
+			const __m256i cksum_mask =
+				_mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
+						  PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
+						  PKT_RX_OUTER_IP_CKSUM_BAD);
+#endif
+
+#if defined(IAVF_RX_CSUM_OFFLOAD) || defined(IAVF_RX_VLAN_OFFLOAD) || defined(IAVF_RX_RSS_OFFLOAD)
+			/* get only flag/error bits we want */
+			const __m256i flag_bits =
+				_mm256_and_si256(status0_7, flags_mask);
+#endif
+			/* set vlan and rss flags */
+#ifdef IAVF_RX_VLAN_OFFLOAD
+			const __m256i vlan_flags =
+				_mm256_shuffle_epi8(vlan_flags_shuf, flag_bits);
+#endif
+#ifdef IAVF_RX_RSS_OFFLOAD
+			const __m256i rss_flags =
+				_mm256_shuffle_epi8(rss_flags_shuf,
+						    _mm256_srli_epi32(flag_bits, 11));
+#endif
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			/**
+			 * l3_l4_error flags, shuffle, then shift to correct adjustment
+			 * of flags in flags_shuf, and finally mask out extra bits
+			 */
+			__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
+							_mm256_srli_epi32(flag_bits, 22));
+			l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
+			l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
+#endif
+
+#ifdef IAVF_RX_CSUM_OFFLOAD
+			mbuf_flags = _mm256_or_si256(mbuf_flags, l3_l4_flags);
+#endif
+#ifdef IAVF_RX_RSS_OFFLOAD
+			mbuf_flags = _mm256_or_si256(mbuf_flags, rss_flags);
+#endif
+#ifdef IAVF_RX_VLAN_OFFLOAD
+			mbuf_flags = _mm256_or_si256(mbuf_flags, vlan_flags);
+#endif
+		}
+
 		/**
 		 * At this point, we have the 8 sets of flags in the low 16-bits
 		 * of each 32-bit value in vlan0.
@@ -487,7 +540,7 @@ 
 		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
 				 RTE_ALIGN(offsetof(struct rte_mbuf,
 						    rearm_data),
-					   16));
+						    16));
 		/* build up data and do writes */
 		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
 			rearm6, rearm7;
@@ -496,21 +549,28 @@ 
 		const __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
 		const __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
 
-		rearm6 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(mbuf_flags, 8),
-					    0x04);
-		rearm4 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(mbuf_flags, 4),
-					    0x04);
-		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
-		rearm0 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_srli_si256(mbuf_flags, 4),
-					    0x04);
-		/* permute to add in the rx_descriptor e.g. rss fields */
-		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
-		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
-		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
-		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		if (offload) {
+			rearm6 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_slli_si256(mbuf_flags, 8),
+						    0x04);
+			rearm4 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_slli_si256(mbuf_flags, 4),
+						    0x04);
+			rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
+			rearm0 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_srli_si256(mbuf_flags, 4),
+						    0x04);
+			/* permute to add in the rx_descriptor e.g. rss fields */
+			rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
+			rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
+			rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
+			rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
+		} else {
+			rearm6 = _mm256_permute2f128_si256(mbuf_init, mb6_7, 0x20);
+			rearm4 = _mm256_permute2f128_si256(mbuf_init, mb4_5, 0x20);
+			rearm2 = _mm256_permute2f128_si256(mbuf_init, mb2_3, 0x20);
+			rearm0 = _mm256_permute2f128_si256(mbuf_init, mb0_1, 0x20);
+		}
 		/* write to mbuf */
 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
 				    rearm6);
@@ -522,24 +582,31 @@ 
 				    rearm0);
 
 		/* repeat for the odd mbufs */
-		const __m256i odd_flags =
-			_mm256_castsi128_si256
-				(_mm256_extracti128_si256(mbuf_flags, 1));
-		rearm7 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(odd_flags, 8),
-					    0x04);
-		rearm5 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_slli_si256(odd_flags, 4),
-					    0x04);
-		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
-		rearm1 = _mm256_blend_epi32(mbuf_init,
-					    _mm256_srli_si256(odd_flags, 4),
-					    0x04);
-		/* since odd mbufs are already in hi 128-bits use blend */
-		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
-		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
-		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
-		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		if (offload) {
+			const __m256i odd_flags =
+				_mm256_castsi128_si256
+					(_mm256_extracti128_si256(mbuf_flags, 1));
+			rearm7 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_slli_si256(odd_flags, 8),
+						    0x04);
+			rearm5 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_slli_si256(odd_flags, 4),
+						    0x04);
+			rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
+			rearm1 = _mm256_blend_epi32(mbuf_init,
+						    _mm256_srli_si256(odd_flags, 4),
+						    0x04);
+			/* since odd mbufs are already in hi 128-bits use blend */
+			rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
+			rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
+			rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
+			rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
+		} else {
+			rearm7 = _mm256_blend_epi32(mbuf_init, mb6_7, 0xF0);
+			rearm5 = _mm256_blend_epi32(mbuf_init, mb4_5, 0xF0);
+			rearm3 = _mm256_blend_epi32(mbuf_init, mb2_3, 0xF0);
+			rearm1 = _mm256_blend_epi32(mbuf_init, mb0_1, 0xF0);
+		}
 		/* again write to mbufs */
 		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
 				    rearm7);
@@ -1250,7 +1317,8 @@ 
 iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
 			  uint16_t nb_pkts)
 {
-	return _iavf_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, nb_pkts, NULL);
+	return _iavf_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, nb_pkts,
+					      NULL, false);
 }
 
 /**
@@ -1270,16 +1338,16 @@ 
  * Notice:
  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
-static uint16_t
+static __rte_always_inline uint16_t
 iavf_recv_scattered_burst_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
-				     uint16_t nb_pkts)
+				     uint16_t nb_pkts, bool offload)
 {
 	struct iavf_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[IAVF_VPMD_RX_MAX_BURST] = {0};
 
 	/* get some new buffers */
 	uint16_t nb_bufs = _iavf_recv_raw_pkts_vec_avx512(rxq, rx_pkts, nb_pkts,
-							  split_flags);
+							  split_flags, offload);
 	if (nb_bufs == 0)
 		return 0;
 
@@ -1312,22 +1380,30 @@ 
  * Notice:
  * - nb_pkts < IAVF_DESCS_PER_LOOP, just return no packet
  */
-uint16_t
-iavf_recv_scattered_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
-				    uint16_t nb_pkts)
+static __rte_always_inline uint16_t
+iavf_recv_scattered_pkts_vec_avx512_cmn(void *rx_queue, struct rte_mbuf **rx_pkts,
+					uint16_t nb_pkts, bool offload)
 {
 	uint16_t retval = 0;
 
 	while (nb_pkts > IAVF_VPMD_RX_MAX_BURST) {
 		uint16_t burst = iavf_recv_scattered_burst_vec_avx512(rx_queue,
-				rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST);
+				rx_pkts + retval, IAVF_VPMD_RX_MAX_BURST, offload);
 		retval += burst;
 		nb_pkts -= burst;
 		if (burst < IAVF_VPMD_RX_MAX_BURST)
 			return retval;
 	}
 	return retval + iavf_recv_scattered_burst_vec_avx512(rx_queue,
-				rx_pkts + retval, nb_pkts);
+				rx_pkts + retval, nb_pkts, offload);
+}
+
+uint16_t
+iavf_recv_scattered_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
+				    uint16_t nb_pkts)
+{
+	return iavf_recv_scattered_pkts_vec_avx512_cmn(rx_queue, rx_pkts,
+						       nb_pkts, false);
 }
 
 /**
@@ -1400,6 +1476,23 @@ 
 				rx_pkts + retval, nb_pkts);
 }
 
+uint16_t
+iavf_recv_pkts_vec_avx512_offload(void *rx_queue, struct rte_mbuf **rx_pkts,
+				  uint16_t nb_pkts)
+{
+	return _iavf_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts,
+					      nb_pkts, NULL, true);
+}
+
+uint16_t
+iavf_recv_scattered_pkts_vec_avx512_offload(void *rx_queue,
+					    struct rte_mbuf **rx_pkts,
+					    uint16_t nb_pkts)
+{
+	return iavf_recv_scattered_pkts_vec_avx512_cmn(rx_queue, rx_pkts,
+						       nb_pkts, true);
+}
+
 static __rte_always_inline int
 iavf_tx_free_bufs_avx512(struct iavf_tx_queue *txq)
 {
diff --git a/drivers/net/iavf/iavf_rxtx_vec_common.h b/drivers/net/iavf/iavf_rxtx_vec_common.h
index 8e96cb5..d156d79 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_common.h
+++ b/drivers/net/iavf/iavf_rxtx_vec_common.h
@@ -15,7 +15,7 @@ 
 #pragma GCC diagnostic ignored "-Wcast-qual"
 #endif
 
-static inline uint16_t
+static __rte_always_inline uint16_t
 reassemble_packets(struct iavf_rx_queue *rxq, struct rte_mbuf **rx_bufs,
 		   uint16_t nb_bufs, uint8_t *split_flags)
 {
@@ -231,7 +231,10 @@ 
 	if (rxq->proto_xtr != IAVF_PROTO_XTR_NONE)
 		return -1;
 
-	return 0;
+	if (rxq->offloads & IAVF_RX_VECTOR_OFFLOAD)
+		return IAVF_VECTOR_OFFLOAD_PATH;
+
+	return IAVF_VECTOR_PATH;
 }
 
 static inline int
@@ -258,14 +261,20 @@ 
 {
 	int i;
 	struct iavf_rx_queue *rxq;
+	int ret;
+	int result = 0;
 
 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
 		rxq = dev->data->rx_queues[i];
-		if (iavf_rx_vec_queue_default(rxq))
+		ret = iavf_rx_vec_queue_default(rxq);
+
+		if (ret < 0)
 			return -1;
+		if (ret > result)
+			result = ret;
 	}
 
-	return 0;
+	return result;
 }
 
 static inline int