[v2,1/2] net/hns3: optimize Tx performance by mbuf fast free

Message ID 20211116012212.64819-2-humin29@huawei.com (mailing list archive)
State Accepted, archived
Delegated to: Ferruh Yigit
Series: performance optimization for hns3 PMD

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

humin (Q) Nov. 16, 2021, 1:22 a.m. UTC
  From: Chengwen Feng <fengchengwen@huawei.com>

Currently the vector and simple xmit algorithms don't support multi-segment
mbufs, so if the Tx offload RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE is enabled,
the driver can invoke rte_mempool_put_bulk() to free Tx mbufs in bulk in
this situation.
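
As background, a minimal sketch of how an application requests this offload
at configure time (the port id, queue counts, and function name below are
illustrative assumptions, not part of this patch):

#include <rte_ethdev.h>

/* Enable fast mbuf free on a port, assuming every Tx mbuf comes from
 * one mempool and has a reference count of 1 -- the contract that
 * RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE requires of the application.
 */
static int
configure_with_fast_free(uint16_t port_id)
{
	struct rte_eth_conf conf = { 0 };
	struct rte_eth_dev_info info;
	int ret;

	ret = rte_eth_dev_info_get(port_id, &info);
	if (ret != 0)
		return ret;

	/* Only request the offload if the PMD reports support for it. */
	if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
		conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;

	return rte_eth_dev_configure(port_id, 1, 1, &conf);
}

The PMD then latches the flag per Tx queue at setup time, as the
hns3_tx_queue_setup() hunk below shows.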

In the testpmd single-core MAC forwarding scenario, performance improves
by 8% with 64B packets on the Kunpeng920 platform.

Cc: stable@dpdk.org

Signed-off-by: Chengwen Feng <fengchengwen@huawei.com>
---
 doc/guides/nics/features/hns3.ini |  1 +
 drivers/net/hns3/hns3_rxtx.c      | 11 +++++++++++
 drivers/net/hns3/hns3_rxtx.h      |  2 ++
 drivers/net/hns3/hns3_rxtx_vec.h  |  9 +++++++++
 4 files changed, 23 insertions(+)
  

Patch

diff --git a/doc/guides/nics/features/hns3.ini b/doc/guides/nics/features/hns3.ini
index c3464c8396..405b94f05c 100644
--- a/doc/guides/nics/features/hns3.ini
+++ b/doc/guides/nics/features/hns3.ini
@@ -12,6 +12,7 @@  Queue start/stop     = Y
 Runtime Rx queue setup = Y
 Runtime Tx queue setup = Y
 Burst mode info      = Y
+Fast mbuf free       = Y
 Free Tx mbuf on demand = Y
 MTU update           = Y
 Scattered Rx         = Y
diff --git a/drivers/net/hns3/hns3_rxtx.c b/drivers/net/hns3/hns3_rxtx.c
index d26e262335..f0a57611ec 100644
--- a/drivers/net/hns3/hns3_rxtx.c
+++ b/drivers/net/hns3/hns3_rxtx.c
@@ -3059,6 +3059,8 @@  hns3_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t nb_desc,
 	txq->min_tx_pkt_len = hw->min_tx_pkt_len;
 	txq->tso_mode = hw->tso_mode;
 	txq->udp_cksum_mode = hw->udp_cksum_mode;
+	txq->mbuf_fast_free_en = !!(dev->data->dev_conf.txmode.offloads &
+				    RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE);
 	memset(&txq->basic_stats, 0, sizeof(struct hns3_tx_basic_stats));
 	memset(&txq->dfx_stats, 0, sizeof(struct hns3_tx_dfx_stats));
 
@@ -3991,6 +3993,14 @@  hns3_tx_free_buffer_simple(struct hns3_tx_queue *txq)
 
 		tx_entry = &txq->sw_ring[txq->next_to_clean];
 
+		if (txq->mbuf_fast_free_en) {
+			rte_mempool_put_bulk(tx_entry->mbuf->pool,
+					(void **)tx_entry, txq->tx_rs_thresh);
+			for (i = 0; i < txq->tx_rs_thresh; i++)
+				tx_entry[i].mbuf = NULL;
+			goto update_field;
+		}
+
 		for (i = 0; i < txq->tx_rs_thresh; i++)
 			rte_prefetch0((tx_entry + i)->mbuf);
 		for (i = 0; i < txq->tx_rs_thresh; i++, tx_entry++) {
@@ -3998,6 +4008,7 @@  hns3_tx_free_buffer_simple(struct hns3_tx_queue *txq)
 			tx_entry->mbuf = NULL;
 		}
 
+update_field:
 		txq->next_to_clean = (tx_next_clean + 1) % txq->nb_tx_desc;
 		txq->tx_bd_ready += txq->tx_rs_thresh;
 	}
diff --git a/drivers/net/hns3/hns3_rxtx.h b/drivers/net/hns3/hns3_rxtx.h
index 63bafc68b6..df731856ef 100644
--- a/drivers/net/hns3/hns3_rxtx.h
+++ b/drivers/net/hns3/hns3_rxtx.h
@@ -495,6 +495,8 @@  struct hns3_tx_queue {
 	 * this point.
 	 */
 	uint16_t pvid_sw_shift_en:1;
+	/* check whether the mbuf fast free offload is enabled */
+	uint16_t mbuf_fast_free_en:1;
 
 	/*
 	 * For better performance in tx datapath, releasing mbuf in batches is
diff --git a/drivers/net/hns3/hns3_rxtx_vec.h b/drivers/net/hns3/hns3_rxtx_vec.h
index 67c75e44ef..4985a7cae8 100644
--- a/drivers/net/hns3/hns3_rxtx_vec.h
+++ b/drivers/net/hns3/hns3_rxtx_vec.h
@@ -18,6 +18,14 @@  hns3_tx_bulk_free_buffers(struct hns3_tx_queue *txq)
 	int i;
 
 	tx_entry = &txq->sw_ring[txq->next_to_clean];
+	if (txq->mbuf_fast_free_en) {
+		rte_mempool_put_bulk(tx_entry->mbuf->pool, (void **)tx_entry,
+				     txq->tx_rs_thresh);
+		for (i = 0; i < txq->tx_rs_thresh; i++)
+			tx_entry[i].mbuf = NULL;
+		goto update_field;
+	}
+
 	for (i = 0; i < txq->tx_rs_thresh; i++, tx_entry++) {
 		m = rte_pktmbuf_prefree_seg(tx_entry->mbuf);
 		tx_entry->mbuf = NULL;
@@ -36,6 +44,7 @@  hns3_tx_bulk_free_buffers(struct hns3_tx_queue *txq)
 	if (nb_free)
 		rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
 
+update_field:
 	/* Update numbers of available descriptor due to buffer freed */
 	txq->tx_bd_ready += txq->tx_rs_thresh;
 	txq->next_to_clean += txq->tx_rs_thresh;
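
A note on the bulk-free trick used in both hunks: passing tx_entry itself to
rte_mempool_put_bulk() as a (void **) relies on the mbuf pointer being the
first member of the software ring entry, so a slice of the sw_ring doubles as
an array of mbuf pointers. A standalone sketch of the same pattern (the
struct and function names here are illustrative, not driver code), assuming
the fast-free contract holds -- all mbufs from one mempool, reference count
of 1, non-indirect:

#include <rte_mbuf.h>
#include <rte_mempool.h>

/* Illustrative stand-in for the driver's software ring entry; the
 * (void **) cast is only valid because the mbuf pointer is the
 * entry's first member.
 */
struct sw_entry {
	struct rte_mbuf *mbuf;
};

/* Return 'n' completed Tx mbufs to their mempool in one call, then
 * clear the ring slots, mirroring the fast-free branch in the patch.
 */
static void
bulk_free_fast(struct sw_entry *entry, uint16_t n)
{
	uint16_t i;

	rte_mempool_put_bulk(entry->mbuf->pool, (void **)entry, n);
	for (i = 0; i < n; i++)
		entry[i].mbuf = NULL;
}

When the offload is not enabled, the existing rte_pktmbuf_prefree_seg()
path above remains in use, since it handles reference counts and mbufs
from mixed mempools correctly.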