[5/5] net/hns3: optimize SVE Rx performance

Message ID 20230711102448.11627-6-liudongdong3@huawei.com (mailing list archive)
State Accepted, archived
Delegated to: Ferruh Yigit
Headers
Series net/hns3: some performance optimizations |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS
ci/github-robot: build success github build: passed
ci/intel-Functional success Functional PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-abi-testing success Testing PASS
ci/iol-aarch-unit-testing success Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-unit-testing success Testing PASS
ci/iol-aarch64-compile-testing success Testing PASS
ci/iol-testing success Testing PASS
ci/iol-x86_64-unit-testing success Testing PASS
ci/iol-x86_64-compile-testing success Testing PASS
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-intel-Functional success Functional Testing PASS

Commit Message

Dongdong Liu July 11, 2023, 10:24 a.m. UTC
From: Huisong Li <lihuisong@huawei.com>

This patch optimizes SVE Rx performance in the following ways:
1> optimize the calculation of the number of valid BDs.
2> remove a temporary variable (key_fields).
3> parse some descriptor fields in C instead of with SVE
   instructions.
4> prefetch descriptors in small steps.

In rxonly forwarding mode, the performance of a single queue
or of 64B packets is improved by ~40%.

Signed-off-by: Huisong Li <lihuisong@huawei.com>
Signed-off-by: Dongdong Liu <liudongdong3@huawei.com>
---
 drivers/net/hns3/hns3_rxtx_vec_sve.c | 138 ++++++---------------------
 1 file changed, 28 insertions(+), 110 deletions(-)
  

Patch

diff --git a/drivers/net/hns3/hns3_rxtx_vec_sve.c b/drivers/net/hns3/hns3_rxtx_vec_sve.c
index 54aef7db8d..0e9abfebec 100644
--- a/drivers/net/hns3/hns3_rxtx_vec_sve.c
+++ b/drivers/net/hns3/hns3_rxtx_vec_sve.c
@@ -20,40 +20,36 @@ 
 
 #define BD_SIZE			32
 #define BD_FIELD_ADDR_OFFSET	0
-#define BD_FIELD_L234_OFFSET	8
-#define BD_FIELD_XLEN_OFFSET	12
-#define BD_FIELD_RSS_OFFSET	16
-#define BD_FIELD_OL_OFFSET	24
 #define BD_FIELD_VALID_OFFSET	28
 
-typedef struct {
-	uint32_t l234_info[HNS3_SVE_DEFAULT_DESCS_PER_LOOP];
-	uint32_t ol_info[HNS3_SVE_DEFAULT_DESCS_PER_LOOP];
-	uint32_t bd_base_info[HNS3_SVE_DEFAULT_DESCS_PER_LOOP];
-} HNS3_SVE_KEY_FIELD_S;
-
 static inline uint32_t
 hns3_desc_parse_field_sve(struct hns3_rx_queue *rxq,
 			  struct rte_mbuf **rx_pkts,
-			  HNS3_SVE_KEY_FIELD_S *key,
+			  struct hns3_desc *rxdp,
 			  uint32_t   bd_vld_num)
 {
+	uint32_t l234_info, ol_info, bd_base_info;
 	uint32_t retcode = 0;
 	int ret, i;
 
 	for (i = 0; i < (int)bd_vld_num; i++) {
 		/* init rte_mbuf.rearm_data last 64-bit */
 		rx_pkts[i]->ol_flags = RTE_MBUF_F_RX_RSS_HASH;
-
-		ret = hns3_handle_bdinfo(rxq, rx_pkts[i], key->bd_base_info[i],
-					 key->l234_info[i]);
+		rx_pkts[i]->hash.rss = rxdp[i].rx.rss_hash;
+		rx_pkts[i]->pkt_len = rte_le_to_cpu_16(rxdp[i].rx.pkt_len) -
+					rxq->crc_len;
+		rx_pkts[i]->data_len = rx_pkts[i]->pkt_len;
+
+		l234_info = rxdp[i].rx.l234_info;
+		ol_info = rxdp[i].rx.ol_info;
+		bd_base_info = rxdp[i].rx.bd_base_info;
+		ret = hns3_handle_bdinfo(rxq, rx_pkts[i], bd_base_info, l234_info);
 		if (unlikely(ret)) {
 			retcode |= 1u << i;
 			continue;
 		}
 
-		rx_pkts[i]->packet_type = hns3_rx_calc_ptype(rxq,
-					key->l234_info[i], key->ol_info[i]);
+		rx_pkts[i]->packet_type = hns3_rx_calc_ptype(rxq, l234_info, ol_info);
 
 		/* Increment bytes counter */
 		rxq->basic_stats.bytes += rx_pkts[i]->pkt_len;
@@ -77,46 +73,16 @@  hns3_recv_burst_vec_sve(struct hns3_rx_queue *__restrict rxq,
 			uint16_t nb_pkts,
 			uint64_t *bd_err_mask)
 {
-#define XLEN_ADJUST_LEN		32
-#define RSS_ADJUST_LEN		16
-#define GEN_VLD_U8_ZIP_INDEX	svindex_s8(28, -4)
 	uint16_t rx_id = rxq->next_to_use;
 	struct hns3_entry *sw_ring = &rxq->sw_ring[rx_id];
 	struct hns3_desc *rxdp = &rxq->rx_ring[rx_id];
-	struct hns3_desc *rxdp2;
-	HNS3_SVE_KEY_FIELD_S key_field;
+	struct hns3_desc *rxdp2, *next_rxdp;
 	uint64_t bd_valid_num;
 	uint32_t parse_retcode;
 	uint16_t nb_rx = 0;
 	int pos, offset;
 
-	uint16_t xlen_adjust[XLEN_ADJUST_LEN] = {
-		0,  0xffff, 1,  0xffff,    /* 1st mbuf: pkt_len and dat_len */
-		2,  0xffff, 3,  0xffff,    /* 2st mbuf: pkt_len and dat_len */
-		4,  0xffff, 5,  0xffff,    /* 3st mbuf: pkt_len and dat_len */
-		6,  0xffff, 7,  0xffff,    /* 4st mbuf: pkt_len and dat_len */
-		8,  0xffff, 9,  0xffff,    /* 5st mbuf: pkt_len and dat_len */
-		10, 0xffff, 11, 0xffff,    /* 6st mbuf: pkt_len and dat_len */
-		12, 0xffff, 13, 0xffff,    /* 7st mbuf: pkt_len and dat_len */
-		14, 0xffff, 15, 0xffff,    /* 8st mbuf: pkt_len and dat_len */
-	};
-
-	uint32_t rss_adjust[RSS_ADJUST_LEN] = {
-		0, 0xffff,        /* 1st mbuf: rss */
-		1, 0xffff,        /* 2st mbuf: rss */
-		2, 0xffff,        /* 3st mbuf: rss */
-		3, 0xffff,        /* 4st mbuf: rss */
-		4, 0xffff,        /* 5st mbuf: rss */
-		5, 0xffff,        /* 6st mbuf: rss */
-		6, 0xffff,        /* 7st mbuf: rss */
-		7, 0xffff,        /* 8st mbuf: rss */
-	};
-
 	svbool_t pg32 = svwhilelt_b32(0, HNS3_SVE_DEFAULT_DESCS_PER_LOOP);
-	svuint16_t xlen_tbl1 = svld1_u16(PG16_256BIT, xlen_adjust);
-	svuint16_t xlen_tbl2 = svld1_u16(PG16_256BIT, &xlen_adjust[16]);
-	svuint32_t rss_tbl1 = svld1_u32(PG32_256BIT, rss_adjust);
-	svuint32_t rss_tbl2 = svld1_u32(PG32_256BIT, &rss_adjust[8]);
 
 	/* compile-time verifies the xlen_adjust mask */
 	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
@@ -126,30 +92,21 @@  hns3_recv_burst_vec_sve(struct hns3_rx_queue *__restrict rxq,
 
 	for (pos = 0; pos < nb_pkts; pos += HNS3_SVE_DEFAULT_DESCS_PER_LOOP,
 				     rxdp += HNS3_SVE_DEFAULT_DESCS_PER_LOOP) {
-		svuint64_t vld_clz, mbp1st, mbp2st, mbuf_init;
-		svuint64_t xlen1st, xlen2st, rss1st, rss2st;
-		svuint32_t l234, ol, vld, vld2, xlen, rss;
-		svuint8_t  vld_u8;
+		svuint64_t mbp1st, mbp2st, mbuf_init;
+		svuint32_t vld;
+		svbool_t vld_op;
 
 		/* calc how many bd valid: part 1 */
 		vld = svld1_gather_u32offset_u32(pg32, (uint32_t *)rxdp,
 			svindex_u32(BD_FIELD_VALID_OFFSET, BD_SIZE));
-		vld2 = svlsl_n_u32_z(pg32, vld,
-				    HNS3_UINT32_BIT - 1 - HNS3_RXD_VLD_B);
-		vld2 = svreinterpret_u32_s32(svasr_n_s32_z(pg32,
-			svreinterpret_s32_u32(vld2), HNS3_UINT32_BIT - 1));
+		vld = svand_n_u32_z(pg32, vld, BIT(HNS3_RXD_VLD_B));
+		vld_op = svcmpne_n_u32(pg32, vld, BIT(HNS3_RXD_VLD_B));
+		bd_valid_num = svcntp_b32(pg32, svbrkb_b_z(pg32, vld_op));
+		if (bd_valid_num == 0)
+			break;
 
 		/* load 4 mbuf pointer */
 		mbp1st = svld1_u64(PG64_256BIT, (uint64_t *)&sw_ring[pos]);
-
-		/* calc how many bd valid: part 2 */
-		vld_u8 = svtbl_u8(svreinterpret_u8_u32(vld2),
-				  svreinterpret_u8_s8(GEN_VLD_U8_ZIP_INDEX));
-		vld_clz = svnot_u64_z(PG64_64BIT, svreinterpret_u64_u8(vld_u8));
-		vld_clz = svclz_u64_z(PG64_64BIT, vld_clz);
-		svst1_u64(PG64_64BIT, &bd_valid_num, vld_clz);
-		bd_valid_num /= HNS3_UINT8_BIT;
-
 		/* load 4 more mbuf pointer */
 		mbp2st = svld1_u64(PG64_256BIT, (uint64_t *)&sw_ring[pos + 4]);
 
@@ -159,65 +116,25 @@  hns3_recv_burst_vec_sve(struct hns3_rx_queue *__restrict rxq,
 
 		/* store 4 mbuf pointer into rx_pkts */
 		svst1_u64(PG64_256BIT, (uint64_t *)&rx_pkts[pos], mbp1st);
-
-		/* load key field to vector reg */
-		l234 = svld1_gather_u32offset_u32(pg32, (uint32_t *)rxdp2,
-				svindex_u32(BD_FIELD_L234_OFFSET, BD_SIZE));
-		ol = svld1_gather_u32offset_u32(pg32, (uint32_t *)rxdp2,
-				svindex_u32(BD_FIELD_OL_OFFSET, BD_SIZE));
-
 		/* store 4 mbuf pointer into rx_pkts again */
 		svst1_u64(PG64_256BIT, (uint64_t *)&rx_pkts[pos + 4], mbp2st);
 
-		/* load datalen, pktlen and rss_hash */
-		xlen = svld1_gather_u32offset_u32(pg32, (uint32_t *)rxdp2,
-				svindex_u32(BD_FIELD_XLEN_OFFSET, BD_SIZE));
-		rss = svld1_gather_u32offset_u32(pg32, (uint32_t *)rxdp2,
-				svindex_u32(BD_FIELD_RSS_OFFSET, BD_SIZE));
-
-		/* store key field to stash buffer */
-		svst1_u32(pg32, (uint32_t *)key_field.l234_info, l234);
-		svst1_u32(pg32, (uint32_t *)key_field.bd_base_info, vld);
-		svst1_u32(pg32, (uint32_t *)key_field.ol_info, ol);
-
-		/* sub crc_len for pkt_len and data_len */
-		xlen = svreinterpret_u32_u16(svsub_n_u16_z(PG16_256BIT,
-			svreinterpret_u16_u32(xlen), rxq->crc_len));
-
 		/* init mbuf_initializer */
 		mbuf_init = svdup_n_u64(rxq->mbuf_initializer);
-
-		/* extract datalen, pktlen and rss from xlen and rss */
-		xlen1st = svreinterpret_u64_u16(
-			svtbl_u16(svreinterpret_u16_u32(xlen), xlen_tbl1));
-		xlen2st = svreinterpret_u64_u16(
-			svtbl_u16(svreinterpret_u16_u32(xlen), xlen_tbl2));
-		rss1st = svreinterpret_u64_u32(
-			svtbl_u32(svreinterpret_u32_u32(rss), rss_tbl1));
-		rss2st = svreinterpret_u64_u32(
-			svtbl_u32(svreinterpret_u32_u32(rss), rss_tbl2));
-
 		/* save mbuf_initializer */
 		svst1_scatter_u64base_offset_u64(PG64_256BIT, mbp1st,
 			offsetof(struct rte_mbuf, rearm_data), mbuf_init);
 		svst1_scatter_u64base_offset_u64(PG64_256BIT, mbp2st,
 			offsetof(struct rte_mbuf, rearm_data), mbuf_init);
 
-		/* save datalen and pktlen and rss */
-		svst1_scatter_u64base_offset_u64(PG64_256BIT, mbp1st,
-			offsetof(struct rte_mbuf, pkt_len), xlen1st);
-		svst1_scatter_u64base_offset_u64(PG64_256BIT, mbp1st,
-			offsetof(struct rte_mbuf, hash.rss), rss1st);
-		svst1_scatter_u64base_offset_u64(PG64_256BIT, mbp2st,
-			offsetof(struct rte_mbuf, pkt_len), xlen2st);
-		svst1_scatter_u64base_offset_u64(PG64_256BIT, mbp2st,
-			offsetof(struct rte_mbuf, hash.rss), rss2st);
-
-		rte_prefetch_non_temporal(rxdp +
-					  HNS3_SVE_DEFAULT_DESCS_PER_LOOP);
+		next_rxdp = rxdp + HNS3_SVE_DEFAULT_DESCS_PER_LOOP;
+		rte_prefetch_non_temporal(next_rxdp);
+		rte_prefetch_non_temporal(next_rxdp + 2);
+		rte_prefetch_non_temporal(next_rxdp + 4);
+		rte_prefetch_non_temporal(next_rxdp + 6);
 
 		parse_retcode = hns3_desc_parse_field_sve(rxq, &rx_pkts[pos],
-					&key_field, bd_valid_num);
+					&rxdp2[offset], bd_valid_num);
 		if (unlikely(parse_retcode))
 			(*bd_err_mask) |= ((uint64_t)parse_retcode) << pos;
 
@@ -237,6 +154,7 @@  hns3_recv_burst_vec_sve(struct hns3_rx_queue *__restrict rxq,
 	return nb_rx;
 }
 
+
 uint16_t
 hns3_recv_pkts_vec_sve(void *__restrict rx_queue,
 		       struct rte_mbuf **__restrict rx_pkts,