[v2,1/5] net/ice: support flex Rx descriptor RxDID #22
diff mbox series

Message ID 20200907091711.5980-2-junyux.jiang@intel.com
State Superseded, archived
Delegated to: Qi Zhang
Headers show
Series
  • supports RxDID #22 and FDID
Related show

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Junyu Jiang Sept. 7, 2020, 9:17 a.m. UTC
This patch supports RxDID #22 by the following changes:
-add structure and macro definition for RxDID #22,
-support RxDID #22 format in normal path,
-change RSS hash parsing from RxDID #22 in AVX/SSE data path.

Signed-off-by: Junyu Jiang <junyux.jiang@intel.com>
---
 drivers/net/ice/ice_rxtx.c          | 16 ++---
 drivers/net/ice/ice_rxtx.h          | 42 +++++++++++++
 drivers/net/ice/ice_rxtx_vec_avx2.c | 98 +++++++++++++++++++++++++++--
 drivers/net/ice/ice_rxtx_vec_sse.c  | 89 +++++++++++++++++++++-----
 4 files changed, 218 insertions(+), 27 deletions(-)

Patch
diff mbox series

diff --git a/drivers/net/ice/ice_rxtx.c b/drivers/net/ice/ice_rxtx.c
index 2e1f06d2c..a31a976a1 100644
--- a/drivers/net/ice/ice_rxtx.c
+++ b/drivers/net/ice/ice_rxtx.c
@@ -50,7 +50,7 @@  static inline uint8_t
 ice_proto_xtr_type_to_rxdid(uint8_t xtr_type)
 {
 	static uint8_t rxdid_map[] = {
-		[PROTO_XTR_NONE]      = ICE_RXDID_COMMS_GENERIC,
+		[PROTO_XTR_NONE]      = ICE_RXDID_COMMS_OVS,
 		[PROTO_XTR_VLAN]      = ICE_RXDID_COMMS_AUX_VLAN,
 		[PROTO_XTR_IPV4]      = ICE_RXDID_COMMS_AUX_IPV4,
 		[PROTO_XTR_IPV6]      = ICE_RXDID_COMMS_AUX_IPV6,
@@ -59,7 +59,7 @@  ice_proto_xtr_type_to_rxdid(uint8_t xtr_type)
 	};
 
 	return xtr_type < RTE_DIM(rxdid_map) ?
-				rxdid_map[xtr_type] : ICE_RXDID_COMMS_GENERIC;
+				rxdid_map[xtr_type] : ICE_RXDID_COMMS_OVS;
 }
 
 static enum ice_status
@@ -72,7 +72,7 @@  ice_program_hw_rx_queue(struct ice_rx_queue *rxq)
 	enum ice_status err;
 	uint16_t buf_size, len;
 	struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
-	uint32_t rxdid = ICE_RXDID_COMMS_GENERIC;
+	uint32_t rxdid = ICE_RXDID_COMMS_OVS;
 	uint32_t regval;
 
 	/* Set buffer size as the head split is disabled. */
@@ -1309,7 +1309,7 @@  ice_rxd_to_vlan_tci(struct rte_mbuf *mb, volatile union ice_rx_flex_desc *rxdp)
 
 static void
 ice_rxd_to_proto_xtr(struct rte_mbuf *mb,
-		     volatile struct ice_32b_rx_flex_desc_comms *desc)
+		     volatile struct ice_32b_rx_flex_desc_comms_ovs *desc)
 {
 	uint16_t stat_err = rte_le_to_cpu_16(desc->status_error1);
 	uint32_t metadata;
@@ -1338,8 +1338,9 @@  static inline void
 ice_rxd_to_pkt_fields(struct rte_mbuf *mb,
 		      volatile union ice_rx_flex_desc *rxdp)
 {
-	volatile struct ice_32b_rx_flex_desc_comms *desc =
-			(volatile struct ice_32b_rx_flex_desc_comms *)rxdp;
+	volatile struct ice_32b_rx_flex_desc_comms_ovs *desc =
+			(volatile struct ice_32b_rx_flex_desc_comms_ovs *)rxdp;
+#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
 	uint16_t stat_err;
 
 	stat_err = rte_le_to_cpu_16(desc->status_error0);
@@ -1347,13 +1348,14 @@  ice_rxd_to_pkt_fields(struct rte_mbuf *mb,
 		mb->ol_flags |= PKT_RX_RSS_HASH;
 		mb->hash.rss = rte_le_to_cpu_32(desc->rss_hash);
 	}
+#endif
 
-#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
 	if (desc->flow_id != 0xFFFFFFFF) {
 		mb->ol_flags |= PKT_RX_FDIR | PKT_RX_FDIR_ID;
 		mb->hash.fdir.hi = rte_le_to_cpu_32(desc->flow_id);
 	}
 
+#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
 	if (unlikely(rte_net_ice_dynf_proto_xtr_metadata_avail()))
 		ice_rxd_to_proto_xtr(mb, desc);
 #endif
diff --git a/drivers/net/ice/ice_rxtx.h b/drivers/net/ice/ice_rxtx.h
index 2fdcfb7d0..e21ba152d 100644
--- a/drivers/net/ice/ice_rxtx.h
+++ b/drivers/net/ice/ice_rxtx.h
@@ -38,6 +38,8 @@ 
 
 #define ICE_FDIR_PKT_LEN	512
 
+#define ICE_RXDID_COMMS_OVS	22
+
 typedef void (*ice_rx_release_mbufs_t)(struct ice_rx_queue *rxq);
 typedef void (*ice_tx_release_mbufs_t)(struct ice_tx_queue *txq);
 
@@ -135,6 +137,46 @@  union ice_tx_offload {
 	};
 };
 
+/* Rx Flex Descriptor for Comms Package Profile
+ * RxDID Profile ID 22 (swap Hash and FlowID)
+ * Flex-field 0: Flow ID lower 16-bits
+ * Flex-field 1: Flow ID upper 16-bits
+ * Flex-field 2: RSS hash lower 16-bits
+ * Flex-field 3: RSS hash upper 16-bits
+ * Flex-field 4: AUX0
+ * Flex-field 5: AUX1
+ */
+struct ice_32b_rx_flex_desc_comms_ovs {
+	/* Qword 0 */
+	u8 rxdid;
+	u8 mir_id_umb_cast;
+	__le16 ptype_flexi_flags0;
+	__le16 pkt_len;
+	__le16 hdr_len_sph_flex_flags1;
+
+	/* Qword 1 */
+	__le16 status_error0;
+	__le16 l2tag1;
+	__le32 flow_id;
+
+	/* Qword 2 */
+	__le16 status_error1;
+	u8 flexi_flags2;
+	u8 ts_low;
+	__le16 l2tag2_1st;
+	__le16 l2tag2_2nd;
+
+	/* Qword 3 */
+	__le32 rss_hash;
+	union {
+		struct {
+			__le16 aux0;
+			__le16 aux1;
+		} flex;
+		__le32 ts_high;
+	} flex_ts;
+};
+
 int ice_rx_queue_setup(struct rte_eth_dev *dev,
 		       uint16_t queue_idx,
 		       uint16_t nb_desc,
diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c b/drivers/net/ice/ice_rxtx_vec_avx2.c
index be50677c2..07d129e3f 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx2.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
@@ -191,8 +191,8 @@  _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	const __m256i shuf_msk =
 		_mm256_set_epi8
 			(/* first descriptor */
-			 15, 14,
-			 13, 12,	/* octet 12~15, 32 bits rss */
+			 0xFF, 0xFF,
+			 0xFF, 0xFF,	/* rss hash parsed separately */
 			 11, 10,	/* octet 10~11, 16 bits vlan_macip */
 			 5, 4,		/* octet 4~5, 16 bits data_len */
 			 0xFF, 0xFF,	/* skip hi 16 bits pkt_len, zero out */
@@ -200,8 +200,8 @@  _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 			 0xFF, 0xFF,	/* pkt_type set as unknown */
 			 0xFF, 0xFF,	/*pkt_type set as unknown */
 			 /* second descriptor */
-			 15, 14,
-			 13, 12,	/* octet 12~15, 32 bits rss */
+			 0xFF, 0xFF,
+			 0xFF, 0xFF,	/* rss hash parsed separately */
 			 11, 10,	/* octet 10~11, 16 bits vlan_macip */
 			 5, 4,		/* octet 4~5, 16 bits data_len */
 			 0xFF, 0xFF,	/* skip hi 16 bits pkt_len, zero out */
@@ -461,6 +461,96 @@  _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		/* merge flags */
 		const __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
 				rss_vlan_flags);
+
+#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
+		/**
+		 * needs to load 2nd 16B of each desc for RSS hash parsing,
+		 * will cause performance drop to get into this context.
+		 */
+		if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &
+				DEV_RX_OFFLOAD_RSS_HASH) {
+			/* load bottom half of every 32B desc */
+			const __m128i raw_desc_bh7 =
+				_mm_load_si128
+					((void *)(&rxdp[7].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh6 =
+				_mm_load_si128
+					((void *)(&rxdp[6].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh5 =
+				_mm_load_si128
+					((void *)(&rxdp[5].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh4 =
+				_mm_load_si128
+					((void *)(&rxdp[4].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh3 =
+				_mm_load_si128
+					((void *)(&rxdp[3].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh2 =
+				_mm_load_si128
+					((void *)(&rxdp[2].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh1 =
+				_mm_load_si128
+					((void *)(&rxdp[1].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh0 =
+				_mm_load_si128
+					((void *)(&rxdp[0].wb.status_error1));
+
+			__m256i raw_desc_bh6_7 =
+				_mm256_inserti128_si256
+					(_mm256_castsi128_si256(raw_desc_bh6),
+					raw_desc_bh7, 1);
+			__m256i raw_desc_bh4_5 =
+				_mm256_inserti128_si256
+					(_mm256_castsi128_si256(raw_desc_bh4),
+					raw_desc_bh5, 1);
+			__m256i raw_desc_bh2_3 =
+				_mm256_inserti128_si256
+					(_mm256_castsi128_si256(raw_desc_bh2),
+					raw_desc_bh3, 1);
+			__m256i raw_desc_bh0_1 =
+				_mm256_inserti128_si256
+					(_mm256_castsi128_si256(raw_desc_bh0),
+					raw_desc_bh1, 1);
+
+			/**
+			 * to shift the 32b RSS hash value to the
+			 * highest 32b of each 128b before mask
+			 */
+			__m256i rss_hash6_7 =
+				_mm256_slli_epi64(raw_desc_bh6_7, 32);
+			__m256i rss_hash4_5 =
+				_mm256_slli_epi64(raw_desc_bh4_5, 32);
+			__m256i rss_hash2_3 =
+				_mm256_slli_epi64(raw_desc_bh2_3, 32);
+			__m256i rss_hash0_1 =
+				_mm256_slli_epi64(raw_desc_bh0_1, 32);
+
+			__m256i rss_hash_msk =
+				_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
+						 0xFFFFFFFF, 0, 0, 0);
+
+			rss_hash6_7 = _mm256_and_si256
+					(rss_hash6_7, rss_hash_msk);
+			rss_hash4_5 = _mm256_and_si256
+					(rss_hash4_5, rss_hash_msk);
+			rss_hash2_3 = _mm256_and_si256
+					(rss_hash2_3, rss_hash_msk);
+			rss_hash0_1 = _mm256_and_si256
+					(rss_hash0_1, rss_hash_msk);
+
+			mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
+			mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
+			mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
+			mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
+		} /* if() on RSS hash parsing */
+#endif
 		/**
 		 * At this point, we have the 8 sets of flags in the low 16-bits
 		 * of each 32-bit value in vlan0.
diff --git a/drivers/net/ice/ice_rxtx_vec_sse.c b/drivers/net/ice/ice_rxtx_vec_sse.c
index 382ef31f3..fffb27138 100644
--- a/drivers/net/ice/ice_rxtx_vec_sse.c
+++ b/drivers/net/ice/ice_rxtx_vec_sse.c
@@ -230,7 +230,8 @@  _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	const __m128i zero = _mm_setzero_si128();
 	/* mask to shuffle from desc. to mbuf */
 	const __m128i shuf_msk = _mm_set_epi8
-			(15, 14, 13, 12,  /* octet 12~15, 32 bits rss */
+			(0xFF, 0xFF,
+			 0xFF, 0xFF,  /* rss hash parsed separately */
 			 11, 10,      /* octet 10~11, 16 bits vlan_macip */
 			 5, 4,        /* octet 4~5, 16 bits data_len */
 			 0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
@@ -321,7 +322,7 @@  _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	     pos += ICE_DESCS_PER_LOOP,
 	     rxdp += ICE_DESCS_PER_LOOP) {
 		__m128i descs[ICE_DESCS_PER_LOOP];
-		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
+		__m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
 		__m128i staterr, sterr_tmp1, sterr_tmp2;
 		/* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
 		__m128i mbp1;
@@ -367,8 +368,12 @@  _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		rte_compiler_barrier();
 
 		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
-		pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
-		pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);
+		pkt_mb3 = _mm_shuffle_epi8(descs[3], shuf_msk);
+		pkt_mb2 = _mm_shuffle_epi8(descs[2], shuf_msk);
+
+		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
+		pkt_mb1 = _mm_shuffle_epi8(descs[1], shuf_msk);
+		pkt_mb0 = _mm_shuffle_epi8(descs[0], shuf_msk);
 
 		/* C.1 4=>2 filter staterr info only */
 		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
@@ -378,12 +383,68 @@  _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		ice_rx_desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);
 
 		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
-		pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
 		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);
+		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
 
-		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
-		pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
-		pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);
+		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
+		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);
+		pkt_mb0 = _mm_add_epi16(pkt_mb0, crc_adjust);
+
+#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
+		/**
+		 * needs to load 2nd 16B of each desc for RSS hash parsing,
+		 * will cause performance drop to get into this context.
+		 */
+		if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &
+				DEV_RX_OFFLOAD_RSS_HASH) {
+			/* load bottom half of every 32B desc */
+			const __m128i raw_desc_bh3 =
+				_mm_load_si128
+					((void *)(&rxdp[3].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh2 =
+				_mm_load_si128
+					((void *)(&rxdp[2].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh1 =
+				_mm_load_si128
+					((void *)(&rxdp[1].wb.status_error1));
+			rte_compiler_barrier();
+			const __m128i raw_desc_bh0 =
+				_mm_load_si128
+					((void *)(&rxdp[0].wb.status_error1));
+
+			/**
+			 * to shift the 32b RSS hash value to the
+			 * highest 32b of each 128b before mask
+			 */
+			__m128i rss_hash3 =
+				_mm_slli_epi64(raw_desc_bh3, 32);
+			__m128i rss_hash2 =
+				_mm_slli_epi64(raw_desc_bh2, 32);
+			__m128i rss_hash1 =
+				_mm_slli_epi64(raw_desc_bh1, 32);
+			__m128i rss_hash0 =
+				_mm_slli_epi64(raw_desc_bh0, 32);
+
+			__m128i rss_hash_msk =
+				_mm_set_epi32(0xFFFFFFFF, 0, 0, 0);
+
+			rss_hash3 = _mm_and_si128
+					(rss_hash3, rss_hash_msk);
+			rss_hash2 = _mm_and_si128
+					(rss_hash2, rss_hash_msk);
+			rss_hash1 = _mm_and_si128
+					(rss_hash1, rss_hash_msk);
+			rss_hash0 = _mm_and_si128
+					(rss_hash0, rss_hash_msk);
+
+			pkt_mb3 = _mm_or_si128(pkt_mb3, rss_hash3);
+			pkt_mb2 = _mm_or_si128(pkt_mb2, rss_hash2);
+			pkt_mb1 = _mm_or_si128(pkt_mb1, rss_hash1);
+			pkt_mb0 = _mm_or_si128(pkt_mb0, rss_hash0);
+		} /* if() on RSS hash parsing */
+#endif
 
 		/* C.2 get 4 pkts staterr value  */
 		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);
@@ -391,14 +452,10 @@  _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		/* D.3 copy final 3,4 data to rx_pkts */
 		_mm_storeu_si128
 			((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
-			 pkt_mb4);
+			 pkt_mb3);
 		_mm_storeu_si128
 			((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
-			 pkt_mb3);
-
-		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
-		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
-		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);
+			 pkt_mb2);
 
 		/* C* extract and record EOP bit */
 		if (split_packet) {
@@ -422,9 +479,9 @@  _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 		/* D.3 copy final 1,2 data to rx_pkts */
 		_mm_storeu_si128
 			((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
-			 pkt_mb2);
+			 pkt_mb1);
 		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
-				 pkt_mb1);
+				 pkt_mb0);
 		ice_rx_desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
 		/* C.4 calc avaialbe number of desc */
 		var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));