[v2] net/cnxk: performance improvement for SW mbuf free

Message ID 20240301031645.1656237-1-rbhansali@marvell.com (mailing list archive)
State Accepted, archived
Delegated to: Jerin Jacob
Series [v2] net/cnxk: performance improvement for SW mbuf free

Checks

Context                    Check     Description
ci/checkpatch              success   coding style OK
ci/loongarch-compilation   warning   apply patch failure
ci/Intel-compilation       warning   apply issues
ci/iol-testing             warning   apply patch failure

Commit Message

Rahul Bhansali March 1, 2024, 3:16 a.m. UTC
  Performance improvement is done for the Tx fastpath flag MBUF_NOFF when
tx_compl_ena is false and the mbuf has an external buffer.
In that case, instead of freeing each external mbuf individually before
LMTST, a chain of external mbufs is built and freed in one pass after
LMTST.
This not only improves performance but also fixes SQ corruption.

CN10k performance improvement is ~14%.
CN9k performance improvement is ~20%.

Fixes: 51a636528515 ("net/cnxk: fix crash during Tx completion")
Cc: stable@dpdk.org

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
Changes in v2: updated release_24_03.rst for the SW mbuf free optimization

 doc/guides/rel_notes/release_24_03.rst |  1 +
 drivers/event/cnxk/cn10k_tx_worker.h   |  8 ++-
 drivers/event/cnxk/cn9k_worker.h       |  9 ++-
 drivers/net/cnxk/cn10k_tx.h            | 97 ++++++++++++++++++--------
 drivers/net/cnxk/cn9k_tx.h             | 88 +++++++++++++++--------
 5 files changed, 136 insertions(+), 67 deletions(-)

--
2.25.1
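
For context, the deferred-free scheme the patch implements can be boiled
down to a self-contained sketch. The types and helper names here are
simplified stand-ins for illustration only (struct mbuf, free_seg(),
prefree_ext_seg() and free_ext_chain() are not the DPDK definitions); the
chain walk mirrors the cn10k_nix_free_extmbuf()/cn9k_nix_free_extmbuf()
helpers added below, and the return value models the DF ("don't free")
bit set in the send header:

    #include <stddef.h>
    #include <stdio.h>

    struct mbuf {
            struct mbuf *next;
            int id;
    };

    /* Stand-in for rte_pktmbuf_free_seg(). */
    static void free_seg(struct mbuf *m)
    {
            printf("freed mbuf %d\n", m->id);
    }

    /* Before LMTST: instead of freeing the external-buffer segment inline,
     * push it onto the per-burst chain and return 1 (DF bit) so the NIX
     * hardware does not free the buffer either. */
    static unsigned int prefree_ext_seg(struct mbuf *m, struct mbuf **extm)
    {
            m->next = *extm;
            *extm = m;
            return 1;
    }

    /* After LMTST and the store barrier: release the chain in one pass. */
    static void free_ext_chain(struct mbuf *m)
    {
            while (m != NULL) {
                    struct mbuf *next = m->next;
                    free_seg(m);
                    m = next;
            }
    }

    int main(void)
    {
            struct mbuf m0 = { NULL, 0 }, m1 = { NULL, 1 };
            struct mbuf *extm = NULL;

            prefree_ext_seg(&m0, &extm); /* per packet, while building descriptors */
            prefree_ext_seg(&m1, &extm);
            /* ... LMTST and rte_io_wmb() happen here in the driver ... */
            free_ext_chain(extm);        /* single pass after the barrier */
            return 0;
    }

Deferring the frees until after the rte_io_wmb() that orders the LMTST
stores is why the same epilogue appears on every burst exit path in the
hunks below, and is consistent with the SQ corruption fix noted in the
commit message.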
  

Comments

Jerin Jacob March 3, 2024, 3:33 p.m. UTC | #1
On Fri, Mar 1, 2024 at 8:47 AM Rahul Bhansali <rbhansali@marvell.com> wrote:
>
> Performance improvement is done for the Tx fastpath flag MBUF_NOFF when
> tx_compl_ena is false and the mbuf has an external buffer.
> In that case, instead of freeing each external mbuf individually before
> LMTST, a chain of external mbufs is built and freed in one pass after
> LMTST.
> This not only improves performance but also fixes SQ corruption.
>
> CN10k performance improvement is ~14%.
> CN9k performance improvement is ~20%.
>
> Fixes: 51a636528515 ("net/cnxk: fix crash during Tx completion")
> Cc: stable@dpdk.org
>
> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>

Applied to dpdk-next-net-mrvl/for-main. Thanks
  

Patch

diff --git a/doc/guides/rel_notes/release_24_03.rst b/doc/guides/rel_notes/release_24_03.rst
index 8d440d56a5..39ffef11b0 100644
--- a/doc/guides/rel_notes/release_24_03.rst
+++ b/doc/guides/rel_notes/release_24_03.rst
@@ -111,6 +111,7 @@  New Features
   * Added support for ``RTE_FLOW_ITEM_TYPE_PPPOES`` flow item.
   * Added support for ``RTE_FLOW_ACTION_TYPE_SAMPLE`` flow item.
   * Added support for Rx inject.
+  * Optimized SW external mbuf free for better performance and avoid SQ corruption.

 * **Updated Marvell OCTEON EP driver.**

diff --git a/drivers/event/cnxk/cn10k_tx_worker.h b/drivers/event/cnxk/cn10k_tx_worker.h
index 53e0dde20c..256237b895 100644
--- a/drivers/event/cnxk/cn10k_tx_worker.h
+++ b/drivers/event/cnxk/cn10k_tx_worker.h
@@ -70,6 +70,7 @@  cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
 		 const uint64_t *txq_data, const uint32_t flags)
 {
 	uint8_t lnum = 0, loff = 0, shft = 0;
+	struct rte_mbuf *extm = NULL;
 	struct cn10k_eth_txq *txq;
 	uintptr_t laddr;
 	uint16_t segdw;
@@ -90,7 +91,7 @@  cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
 	if (flags & NIX_TX_OFFLOAD_TSO_F)
 		cn10k_nix_xmit_prepare_tso(m, flags);

-	cn10k_nix_xmit_prepare(txq, m, cmd, flags, txq->lso_tun_fmt, &sec,
+	cn10k_nix_xmit_prepare(txq, m, &extm, cmd, flags, txq->lso_tun_fmt, &sec,
 			       txq->mark_flag, txq->mark_fmt);

 	laddr = lmt_addr;
@@ -105,7 +106,7 @@  cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
 	cn10k_nix_xmit_mv_lmt_base(laddr, cmd, flags);

 	if (flags & NIX_TX_MULTI_SEG_F)
-		segdw = cn10k_nix_prepare_mseg(txq, m, (uint64_t *)laddr, flags);
+		segdw = cn10k_nix_prepare_mseg(txq, m, &extm, (uint64_t *)laddr, flags);
 	else
 		segdw = cn10k_nix_tx_ext_subs(flags) + 2;

@@ -127,6 +128,9 @@  cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
 	/* Memory barrier to make sure lmtst store completes */
 	rte_io_wmb();

+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && !txq->tx_compl.ena)
+		cn10k_nix_free_extmbuf(extm);
+
 	return 1;
 }

diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index 0451157812..107265d54b 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -746,7 +746,7 @@  static __rte_always_inline uint16_t
 cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 		      uint64_t *txq_data, const uint32_t flags)
 {
-	struct rte_mbuf *m = ev->mbuf;
+	struct rte_mbuf *m = ev->mbuf, *extm = NULL;
 	struct cn9k_eth_txq *txq;

 	/* Perform header writes before barrier for TSO */
@@ -767,7 +767,7 @@  cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 	if (cn9k_sso_sq_depth(txq) <= 0)
 		return 0;
 	cn9k_nix_tx_skeleton(txq, cmd, flags, 0);
-	cn9k_nix_xmit_prepare(txq, m, cmd, flags, txq->lso_tun_fmt, txq->mark_flag,
+	cn9k_nix_xmit_prepare(txq, m, &extm, cmd, flags, txq->lso_tun_fmt, txq->mark_flag,
 			      txq->mark_fmt);

 	if (flags & NIX_TX_OFFLOAD_SECURITY_F) {
@@ -789,7 +789,7 @@  cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 	}

 	if (flags & NIX_TX_MULTI_SEG_F) {
-		const uint16_t segdw = cn9k_nix_prepare_mseg(txq, m, cmd, flags);
+		const uint16_t segdw = cn9k_nix_prepare_mseg(txq, m, &extm, cmd, flags);
 		cn9k_nix_xmit_prepare_tstamp(txq, cmd, m->ol_flags, segdw,
 					     flags);
 		if (!CNXK_TT_FROM_EVENT(ev->event)) {
@@ -819,6 +819,9 @@  cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
 	}

 done:
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && !txq->tx_compl.ena)
+		cn9k_nix_free_extmbuf(extm);
+
 	return 1;
 }

diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 266c899a05..5c4b9e559e 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -733,8 +733,19 @@  cn10k_nix_prep_sec(struct rte_mbuf *m, uint64_t *cmd, uintptr_t *nixtx_addr,
 }
 #endif

+static inline void
+cn10k_nix_free_extmbuf(struct rte_mbuf *m)
+{
+	struct rte_mbuf *m_next;
+	while (m != NULL) {
+		m_next = m->next;
+		rte_pktmbuf_free_seg(m);
+		m = m_next;
+	}
+}
+
 static __rte_always_inline uint64_t
-cn10k_nix_prefree_seg(struct rte_mbuf *m, struct cn10k_eth_txq *txq,
+cn10k_nix_prefree_seg(struct rte_mbuf *m, struct rte_mbuf **extm, struct cn10k_eth_txq *txq,
 		      struct nix_send_hdr_s *send_hdr, uint64_t *aura)
 {
 	struct rte_mbuf *prev = NULL;
@@ -742,7 +753,8 @@  cn10k_nix_prefree_seg(struct rte_mbuf *m, struct cn10k_eth_txq *txq,

 	if (RTE_MBUF_HAS_EXTBUF(m)) {
 		if (unlikely(txq->tx_compl.ena == 0)) {
-			rte_pktmbuf_free_seg(m);
+			m->next = *extm;
+			*extm = m;
 			return 1;
 		}
 		if (send_hdr->w0.pnc) {
@@ -766,7 +778,8 @@  cn10k_nix_prefree_seg(struct rte_mbuf *m, struct cn10k_eth_txq *txq,
 #if defined(RTE_ARCH_ARM64)
 /* Only called for first segments of single segmented mbufs */
 static __rte_always_inline void
-cn10k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn10k_eth_txq *txq,
+cn10k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct rte_mbuf **extm,
+			  struct cn10k_eth_txq *txq,
 			  uint64x2_t *senddesc01_w0, uint64x2_t *senddesc23_w0,
 			  uint64x2_t *senddesc01_w1, uint64x2_t *senddesc23_w1)
 {
@@ -790,7 +803,8 @@  cn10k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn10k_eth_txq *txq,
 		w1 = vgetq_lane_u64(*senddesc01_w1, 0);
 		w1 &= ~0xFFFF000000000000UL;
 		if (unlikely(!tx_compl_ena)) {
-			rte_pktmbuf_free_seg(m0);
+			m0->next = *extm;
+			*extm = m0;
 		} else {
 			sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
 							       rte_memory_order_relaxed);
@@ -820,7 +834,8 @@  cn10k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn10k_eth_txq *txq,
 		w1 = vgetq_lane_u64(*senddesc01_w1, 1);
 		w1 &= ~0xFFFF000000000000UL;
 		if (unlikely(!tx_compl_ena)) {
-			rte_pktmbuf_free_seg(m1);
+			m1->next = *extm;
+			*extm = m1;
 		} else {
 			sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
 							       rte_memory_order_relaxed);
@@ -850,7 +865,8 @@  cn10k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn10k_eth_txq *txq,
 		w1 = vgetq_lane_u64(*senddesc23_w1, 0);
 		w1 &= ~0xFFFF000000000000UL;
 		if (unlikely(!tx_compl_ena)) {
-			rte_pktmbuf_free_seg(m2);
+			m2->next = *extm;
+			*extm = m2;
 		} else {
 			sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
 							       rte_memory_order_relaxed);
@@ -880,7 +896,8 @@  cn10k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn10k_eth_txq *txq,
 		w1 = vgetq_lane_u64(*senddesc23_w1, 1);
 		w1 &= ~0xFFFF000000000000UL;
 		if (unlikely(!tx_compl_ena)) {
-			rte_pktmbuf_free_seg(m3);
+			m3->next = *extm;
+			*extm = m3;
 		} else {
 			sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
 							       rte_memory_order_relaxed);
@@ -962,9 +979,9 @@  cn10k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)

 static __rte_always_inline void
 cn10k_nix_xmit_prepare(struct cn10k_eth_txq *txq,
-		       struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
-		       const uint64_t lso_tun_fmt, bool *sec, uint8_t mark_flag,
-		       uint64_t mark_fmt)
+		       struct rte_mbuf *m, struct rte_mbuf **extm, uint64_t *cmd,
+		       const uint16_t flags, const uint64_t lso_tun_fmt, bool *sec,
+		       uint8_t mark_flag, uint64_t mark_fmt)
 {
 	uint8_t mark_off = 0, mark_vlan = 0, markptr = 0;
 	struct nix_send_ext_s *send_hdr_ext;
@@ -1164,7 +1181,7 @@  cn10k_nix_xmit_prepare(struct cn10k_eth_txq *txq,
 			 * DF bit = 0 otherwise
 			 */
 			aura = send_hdr->w0.aura;
-			send_hdr->w0.df = cn10k_nix_prefree_seg(m, txq, send_hdr, &aura);
+			send_hdr->w0.df = cn10k_nix_prefree_seg(m, extm, txq, send_hdr, &aura);
 			send_hdr->w0.aura = aura;
 		}
 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
@@ -1240,8 +1257,8 @@  cn10k_nix_xmit_prepare_tstamp(struct cn10k_eth_txq *txq, uintptr_t lmt_addr,
 }

 static __rte_always_inline uint16_t
-cn10k_nix_prepare_mseg(struct cn10k_eth_txq *txq,
-		       struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
+cn10k_nix_prepare_mseg(struct cn10k_eth_txq *txq, struct rte_mbuf *m, struct rte_mbuf **extm,
+		       uint64_t *cmd, const uint16_t flags)
 {
 	uint64_t prefree = 0, aura0, aura, nb_segs, segdw;
 	struct nix_send_hdr_s *send_hdr;
@@ -1284,7 +1301,7 @@  cn10k_nix_prepare_mseg(struct cn10k_eth_txq *txq,
 	/* Set invert df if buffer is not to be freed by H/W */
 	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 		aura = send_hdr->w0.aura;
-		prefree = cn10k_nix_prefree_seg(m, txq, send_hdr, &aura);
+		prefree = cn10k_nix_prefree_seg(m, extm, txq, send_hdr, &aura);
 		send_hdr->w0.aura = aura;
 		l_sg.i1 = prefree;
 	}
@@ -1331,7 +1348,7 @@  cn10k_nix_prepare_mseg(struct cn10k_eth_txq *txq,
 		cookie = RTE_MBUF_DIRECT(m) ? m : rte_mbuf_from_indirect(m);
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 			aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);
-			prefree = cn10k_nix_prefree_seg(m, txq, send_hdr, &aura);
+			prefree = cn10k_nix_prefree_seg(m, extm, txq, send_hdr, &aura);
 			is_sg2 = aura != aura0 && !prefree;
 		}

@@ -1425,6 +1442,7 @@  cn10k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts,
 	uint8_t lnum, c_lnum, c_shft, c_loff;
 	uintptr_t pa, lbase = txq->lmt_base;
 	uint16_t lmt_id, burst, left, i;
+	struct rte_mbuf *extm = NULL;
 	uintptr_t c_lbase = lbase;
 	uint64_t lso_tun_fmt = 0;
 	uint64_t mark_fmt = 0;
@@ -1479,7 +1497,7 @@  cn10k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts,
 		if (flags & NIX_TX_OFFLOAD_TSO_F)
 			cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);

-		cn10k_nix_xmit_prepare(txq, tx_pkts[i], cmd, flags, lso_tun_fmt,
+		cn10k_nix_xmit_prepare(txq, tx_pkts[i], &extm, cmd, flags, lso_tun_fmt,
 				       &sec, mark_flag, mark_fmt);

 		laddr = (uintptr_t)LMT_OFF(lbase, lnum, 0);
@@ -1554,6 +1572,11 @@  cn10k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts,
 	}

 	rte_io_wmb();
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && !txq->tx_compl.ena) {
+		cn10k_nix_free_extmbuf(extm);
+		extm = NULL;
+	}
+
 	if (left)
 		goto again;

@@ -1569,6 +1592,7 @@  cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
 	uintptr_t pa0, pa1, lbase = txq->lmt_base;
 	const rte_iova_t io_addr = txq->io_addr;
 	uint16_t segdw, lmt_id, burst, left, i;
+	struct rte_mbuf *extm = NULL;
 	uint8_t lnum, c_lnum, c_loff;
 	uintptr_t c_lbase = lbase;
 	uint64_t lso_tun_fmt = 0;
@@ -1630,7 +1654,7 @@  cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
 		if (flags & NIX_TX_OFFLOAD_TSO_F)
 			cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);

-		cn10k_nix_xmit_prepare(txq, tx_pkts[i], cmd, flags, lso_tun_fmt,
+		cn10k_nix_xmit_prepare(txq, tx_pkts[i], &extm, cmd, flags, lso_tun_fmt,
 				       &sec, mark_flag, mark_fmt);

 		laddr = (uintptr_t)LMT_OFF(lbase, lnum, 0);
@@ -1644,7 +1668,7 @@  cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
 		/* Move NIX desc to LMT/NIXTX area */
 		cn10k_nix_xmit_mv_lmt_base(laddr, cmd, flags);
 		/* Store sg list directly on lmt line */
-		segdw = cn10k_nix_prepare_mseg(txq, tx_pkts[i], (uint64_t *)laddr,
+		segdw = cn10k_nix_prepare_mseg(txq, tx_pkts[i], &extm, (uint64_t *)laddr,
 					       flags);
 		cn10k_nix_xmit_prepare_tstamp(txq, laddr, tx_pkts[i]->ol_flags,
 					      segdw, flags);
@@ -1717,6 +1741,11 @@  cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
 	}

 	rte_io_wmb();
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && !txq->tx_compl.ena) {
+		cn10k_nix_free_extmbuf(extm);
+		extm = NULL;
+	}
+
 	if (left)
 		goto again;

@@ -1767,7 +1796,7 @@  cn10k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,

 static __rte_always_inline uint16_t
 cn10k_nix_prepare_mseg_vec_noff(struct cn10k_eth_txq *txq,
-				struct rte_mbuf *m, uint64_t *cmd,
+				struct rte_mbuf *m, struct rte_mbuf **extm, uint64_t *cmd,
 				uint64x2_t *cmd0, uint64x2_t *cmd1,
 				uint64x2_t *cmd2, uint64x2_t *cmd3,
 				const uint32_t flags)
@@ -1782,7 +1811,7 @@  cn10k_nix_prepare_mseg_vec_noff(struct cn10k_eth_txq *txq,
 		vst1q_u64(cmd + 2, *cmd1); /* sg */
 	}

-	segdw = cn10k_nix_prepare_mseg(txq, m, cmd, flags);
+	segdw = cn10k_nix_prepare_mseg(txq, m, extm, cmd, flags);

 	if (flags & NIX_TX_OFFLOAD_TSTAMP_F)
 		vst1q_u64(cmd + segdw * 2 - 2, *cmd3);
@@ -1892,7 +1921,7 @@  cn10k_nix_prepare_mseg_vec(struct rte_mbuf *m, uint64_t *cmd, uint64x2_t *cmd0,

 static __rte_always_inline uint8_t
 cn10k_nix_prep_lmt_mseg_vector(struct cn10k_eth_txq *txq,
-			       struct rte_mbuf **mbufs, uint64x2_t *cmd0,
+			       struct rte_mbuf **mbufs, struct rte_mbuf **extm, uint64x2_t *cmd0,
 			       uint64x2_t *cmd1, uint64x2_t *cmd2,
 			       uint64x2_t *cmd3, uint8_t *segdw,
 			       uint64_t *lmt_addr, __uint128_t *data128,
@@ -1910,7 +1939,7 @@  cn10k_nix_prep_lmt_mseg_vector(struct cn10k_eth_txq *txq,
 				lmt_addr += 16;
 				off = 0;
 			}
-			off += cn10k_nix_prepare_mseg_vec_noff(txq, mbufs[j],
+			off += cn10k_nix_prepare_mseg_vec_noff(txq, mbufs[j], extm,
 					lmt_addr + off * 2, &cmd0[j], &cmd1[j],
 					&cmd2[j], &cmd3[j], flags);
 		}
@@ -2063,14 +2092,14 @@  cn10k_nix_lmt_next(uint8_t dw, uintptr_t laddr, uint8_t *lnum, uint8_t *loff,

 static __rte_always_inline void
 cn10k_nix_xmit_store(struct cn10k_eth_txq *txq,
-		     struct rte_mbuf *mbuf, uint8_t segdw, uintptr_t laddr,
+		     struct rte_mbuf *mbuf, struct rte_mbuf **extm, uint8_t segdw, uintptr_t laddr,
 		     uint64x2_t cmd0, uint64x2_t cmd1, uint64x2_t cmd2,
 		     uint64x2_t cmd3, const uint16_t flags)
 {
 	uint8_t off;

 	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
-		cn10k_nix_prepare_mseg_vec_noff(txq, mbuf, LMT_OFF(laddr, 0, 0),
+		cn10k_nix_prepare_mseg_vec_noff(txq, mbuf, extm, LMT_OFF(laddr, 0, 0),
 						&cmd0, &cmd1, &cmd2, &cmd3,
 						flags);
 		return;
@@ -2154,6 +2183,7 @@  cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 		__uint128_t data128;
 		uint64_t data[2];
 	} wd;
+	struct rte_mbuf *extm = NULL;

 	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && txq->tx_compl.ena)
 		handle_tx_completion_pkts(txq, flags & NIX_TX_VWQE_F);
@@ -3003,8 +3033,8 @@  cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 		    !(flags & NIX_TX_MULTI_SEG_F) &&
 		    !(flags & NIX_TX_OFFLOAD_SECURITY_F)) {
 			/* Set don't free bit if reference count > 1 */
-			cn10k_nix_prefree_seg_vec(tx_pkts, txq, &senddesc01_w0, &senddesc23_w0,
-						  &senddesc01_w1, &senddesc23_w1);
+			cn10k_nix_prefree_seg_vec(tx_pkts, &extm, txq, &senddesc01_w0,
+						  &senddesc23_w0, &senddesc01_w1, &senddesc23_w1);
 		} else if (!(flags & NIX_TX_MULTI_SEG_F) &&
 			   !(flags & NIX_TX_OFFLOAD_SECURITY_F)) {
 			/* Move mbufs to iova */
@@ -3076,7 +3106,7 @@  cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 						   &shift, &wd.data128, &next);

 			/* Store mbuf0 to LMTLINE/CPT NIXTX area */
-			cn10k_nix_xmit_store(txq, tx_pkts[0], segdw[0], next,
+			cn10k_nix_xmit_store(txq, tx_pkts[0], &extm, segdw[0], next,
 					     cmd0[0], cmd1[0], cmd2[0], cmd3[0],
 					     flags);

@@ -3092,7 +3122,7 @@  cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 						   &shift, &wd.data128, &next);

 			/* Store mbuf1 to LMTLINE/CPT NIXTX area */
-			cn10k_nix_xmit_store(txq, tx_pkts[1], segdw[1], next,
+			cn10k_nix_xmit_store(txq, tx_pkts[1], &extm, segdw[1], next,
 					     cmd0[1], cmd1[1], cmd2[1], cmd3[1],
 					     flags);

@@ -3108,7 +3138,7 @@  cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 						   &shift, &wd.data128, &next);

 			/* Store mbuf2 to LMTLINE/CPT NIXTX area */
-			cn10k_nix_xmit_store(txq, tx_pkts[2], segdw[2], next,
+			cn10k_nix_xmit_store(txq, tx_pkts[2], &extm, segdw[2], next,
 					     cmd0[2], cmd1[2], cmd2[2], cmd3[2],
 					     flags);

@@ -3124,7 +3154,7 @@  cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 						   &shift, &wd.data128, &next);

 			/* Store mbuf3 to LMTLINE/CPT NIXTX area */
-			cn10k_nix_xmit_store(txq, tx_pkts[3], segdw[3], next,
+			cn10k_nix_xmit_store(txq, tx_pkts[3], &extm, segdw[3], next,
 					     cmd0[3], cmd1[3], cmd2[3], cmd3[3],
 					     flags);

@@ -3132,7 +3162,7 @@  cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 			uint8_t j;

 			segdw[4] = 8;
-			j = cn10k_nix_prep_lmt_mseg_vector(txq, tx_pkts, cmd0, cmd1,
+			j = cn10k_nix_prep_lmt_mseg_vector(txq, tx_pkts, &extm, cmd0, cmd1,
 							  cmd2, cmd3, segdw,
 							  (uint64_t *)
 							  LMT_OFF(laddr, lnum,
@@ -3282,6 +3312,11 @@  cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 	}

 	rte_io_wmb();
+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && !txq->tx_compl.ena) {
+		cn10k_nix_free_extmbuf(extm);
+		extm = NULL;
+	}
+
 	if (left)
 		goto again;

diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index 94acbe64fa..018fae2eb7 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -82,16 +82,28 @@  cn9k_nix_tx_skeleton(struct cn9k_eth_txq *txq, uint64_t *cmd,
 	}
 }

+static __rte_always_inline void
+cn9k_nix_free_extmbuf(struct rte_mbuf *m)
+{
+	struct rte_mbuf *m_next;
+	while (m != NULL) {
+		m_next = m->next;
+		rte_pktmbuf_free_seg(m);
+		m = m_next;
+	}
+}
+
 static __rte_always_inline uint64_t
-cn9k_nix_prefree_seg(struct rte_mbuf *m, struct cn9k_eth_txq *txq, struct nix_send_hdr_s *send_hdr,
-		     uint64_t *aura)
+cn9k_nix_prefree_seg(struct rte_mbuf *m, struct rte_mbuf **extm, struct cn9k_eth_txq *txq,
+		     struct nix_send_hdr_s *send_hdr, uint64_t *aura)
 {
 	struct rte_mbuf *prev;
 	uint32_t sqe_id;

 	if (RTE_MBUF_HAS_EXTBUF(m)) {
 		if (unlikely(txq->tx_compl.ena == 0)) {
-			rte_pktmbuf_free_seg(m);
+			m->next = *extm;
+			*extm = m;
 			return 1;
 		}
 		if (send_hdr->w0.pnc) {
@@ -115,7 +127,7 @@  cn9k_nix_prefree_seg(struct rte_mbuf *m, struct cn9k_eth_txq *txq, struct nix_se
 #if defined(RTE_ARCH_ARM64)
 /* Only called for first segments of single segmented mbufs */
 static __rte_always_inline void
-cn9k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn9k_eth_txq *txq,
+cn9k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct rte_mbuf **extm, struct cn9k_eth_txq *txq,
 			 uint64x2_t *senddesc01_w0, uint64x2_t *senddesc23_w0,
 			 uint64x2_t *senddesc01_w1, uint64x2_t *senddesc23_w1)
 {
@@ -139,7 +151,8 @@  cn9k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn9k_eth_txq *txq,
 		w1 = vgetq_lane_u64(*senddesc01_w1, 0);
 		w1 &= ~0xFFFF000000000000UL;
 		if (unlikely(!tx_compl_ena)) {
-			rte_pktmbuf_free_seg(m0);
+			m0->next = *extm;
+			*extm = m0;
 		} else {
 			sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
 							       rte_memory_order_relaxed);
@@ -169,7 +182,8 @@  cn9k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn9k_eth_txq *txq,
 		w1 = vgetq_lane_u64(*senddesc01_w1, 1);
 		w1 &= ~0xFFFF000000000000UL;
 		if (unlikely(!tx_compl_ena)) {
-			rte_pktmbuf_free_seg(m1);
+			m1->next = *extm;
+			*extm = m1;
 		} else {
 			sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
 							       rte_memory_order_relaxed);
@@ -199,7 +213,8 @@  cn9k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn9k_eth_txq *txq,
 		w1 = vgetq_lane_u64(*senddesc23_w1, 0);
 		w1 &= ~0xFFFF000000000000UL;
 		if (unlikely(!tx_compl_ena)) {
-			rte_pktmbuf_free_seg(m2);
+			m2->next = *extm;
+			*extm = m2;
 		} else {
 			sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
 							       rte_memory_order_relaxed);
@@ -229,7 +244,8 @@  cn9k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn9k_eth_txq *txq,
 		w1 = vgetq_lane_u64(*senddesc23_w1, 1);
 		w1 &= ~0xFFFF000000000000UL;
 		if (unlikely(!tx_compl_ena)) {
-			rte_pktmbuf_free_seg(m3);
+			m3->next = *extm;
+			*extm = m3;
 		} else {
 			sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
 							       rte_memory_order_relaxed);
@@ -310,10 +326,9 @@  cn9k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)
 }

 static __rte_always_inline void
-cn9k_nix_xmit_prepare(struct cn9k_eth_txq *txq,
-		      struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
-		      const uint64_t lso_tun_fmt, uint8_t mark_flag,
-		      uint64_t mark_fmt)
+cn9k_nix_xmit_prepare(struct cn9k_eth_txq *txq, struct rte_mbuf *m, struct rte_mbuf **extm,
+		      uint64_t *cmd, const uint16_t flags, const uint64_t lso_tun_fmt,
+		      uint8_t mark_flag, uint64_t mark_fmt)
 {
 	uint8_t mark_off = 0, mark_vlan = 0, markptr = 0;
 	struct nix_send_ext_s *send_hdr_ext;
@@ -509,7 +524,7 @@  cn9k_nix_xmit_prepare(struct cn9k_eth_txq *txq,
 			 * DF bit = 0 otherwise
 			 */
 			aura = send_hdr->w0.aura;
-			send_hdr->w0.df = cn9k_nix_prefree_seg(m, txq, send_hdr, &aura);
+			send_hdr->w0.df = cn9k_nix_prefree_seg(m, extm, txq, send_hdr, &aura);
 			send_hdr->w0.aura = aura;
 			/* Ensuring mbuf fields which got updated in
 			 * cnxk_nix_prefree_seg are written before LMTST.
@@ -600,8 +615,8 @@  cn9k_nix_xmit_submit_lmt_release(const rte_iova_t io_addr)
 }

 static __rte_always_inline uint16_t
-cn9k_nix_prepare_mseg(struct cn9k_eth_txq *txq,
-		      struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
+cn9k_nix_prepare_mseg(struct cn9k_eth_txq *txq, struct rte_mbuf *m, struct rte_mbuf **extm,
+		      uint64_t *cmd, const uint16_t flags)
 {
 	struct nix_send_hdr_s *send_hdr;
 	uint64_t prefree = 0, aura;
@@ -634,7 +649,7 @@  cn9k_nix_prepare_mseg(struct cn9k_eth_txq *txq,
 	/* Set invert df if buffer is not to be freed by H/W */
 	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 		aura = send_hdr->w0.aura;
-		prefree = (cn9k_nix_prefree_seg(m, txq, send_hdr, &aura) << 55);
+		prefree = (cn9k_nix_prefree_seg(m, extm, txq, send_hdr, &aura) << 55);
 		send_hdr->w0.aura = aura;
 		sg_u |= prefree;
 		rte_io_wmb();
@@ -664,7 +679,7 @@  cn9k_nix_prepare_mseg(struct cn9k_eth_txq *txq,
 		cookie = RTE_MBUF_DIRECT(m) ? m : rte_mbuf_from_indirect(m);
 		/* Set invert df if buffer is not to be freed by H/W */
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
-			sg_u |= (cn9k_nix_prefree_seg(m, txq, send_hdr, NULL) << (i + 55));
+			sg_u |= (cn9k_nix_prefree_seg(m, extm, txq, send_hdr, NULL) << (i + 55));
 			/* Commit changes to mbuf */
 			rte_io_wmb();
 		}
@@ -748,6 +763,7 @@  cn9k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
 	const rte_iova_t io_addr = txq->io_addr;
 	uint64_t lso_tun_fmt = 0, mark_fmt = 0;
 	void *lmt_addr = txq->lmt_addr;
+	struct rte_mbuf *extm = NULL;
 	uint8_t mark_flag = 0;
 	uint16_t i;

@@ -778,13 +794,16 @@  cn9k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
 		rte_io_wmb();

 	for (i = 0; i < pkts; i++) {
-		cn9k_nix_xmit_prepare(txq, tx_pkts[i], cmd, flags, lso_tun_fmt,
+		cn9k_nix_xmit_prepare(txq, tx_pkts[i], &extm, cmd, flags, lso_tun_fmt,
 				      mark_flag, mark_fmt);
 		cn9k_nix_xmit_prepare_tstamp(txq, cmd, tx_pkts[i]->ol_flags, 4,
 					     flags);
 		cn9k_nix_xmit_one(cmd, lmt_addr, io_addr, flags);
 	}

+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && !txq->tx_compl.ena)
+		cn9k_nix_free_extmbuf(extm);
+
 	/* Reduce the cached count */
 	txq->fc_cache_pkts -= pkts;

@@ -799,6 +818,7 @@  cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 	const rte_iova_t io_addr = txq->io_addr;
 	uint64_t lso_tun_fmt = 0, mark_fmt = 0;
 	void *lmt_addr = txq->lmt_addr;
+	struct rte_mbuf *extm = NULL;
 	uint8_t mark_flag = 0;
 	uint16_t segdw;
 	uint64_t i;
@@ -830,14 +850,17 @@  cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
 		rte_io_wmb();

 	for (i = 0; i < pkts; i++) {
-		cn9k_nix_xmit_prepare(txq, tx_pkts[i], cmd, flags, lso_tun_fmt,
+		cn9k_nix_xmit_prepare(txq, tx_pkts[i], &extm, cmd, flags, lso_tun_fmt,
 				      mark_flag, mark_fmt);
-		segdw = cn9k_nix_prepare_mseg(txq, tx_pkts[i], cmd, flags);
+		segdw = cn9k_nix_prepare_mseg(txq, tx_pkts[i], &extm, cmd, flags);
 		cn9k_nix_xmit_prepare_tstamp(txq, cmd, tx_pkts[i]->ol_flags,
 					     segdw, flags);
 		cn9k_nix_xmit_mseg_one(cmd, lmt_addr, io_addr, segdw);
 	}

+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && !txq->tx_compl.ena)
+		cn9k_nix_free_extmbuf(extm);
+
 	/* Reduce the cached count */
 	txq->fc_cache_pkts -= pkts;

@@ -885,7 +908,7 @@  cn9k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,

 static __rte_always_inline uint8_t
 cn9k_nix_prepare_mseg_vec_list(struct cn9k_eth_txq *txq,
-			       struct rte_mbuf *m, uint64_t *cmd,
+			       struct rte_mbuf *m, struct rte_mbuf **extm, uint64_t *cmd,
 			       struct nix_send_hdr_s *send_hdr,
 			       union nix_send_sg_s *sg, const uint32_t flags)
 {
@@ -910,7 +933,7 @@  cn9k_nix_prepare_mseg_vec_list(struct cn9k_eth_txq *txq,
 	cookie = RTE_MBUF_DIRECT(m) ? m : rte_mbuf_from_indirect(m);
 	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
 		aura = send_hdr->w0.aura;
-		sg_u |= (cn9k_nix_prefree_seg(m, txq, send_hdr, &aura) << 55);
+		sg_u |= (cn9k_nix_prefree_seg(m, extm, txq, send_hdr, &aura) << 55);
 		send_hdr->w0.aura = aura;
 	}
 	/* Mark mempool object as "put" since it is freed by NIX */
@@ -935,7 +958,7 @@  cn9k_nix_prepare_mseg_vec_list(struct cn9k_eth_txq *txq,
 		cookie = RTE_MBUF_DIRECT(m) ? m : rte_mbuf_from_indirect(m);
 		/* Set invert df if buffer is not to be freed by H/W */
 		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
-			sg_u |= (cn9k_nix_prefree_seg(m, txq, send_hdr, &aura) << (i + 55));
+			sg_u |= (cn9k_nix_prefree_seg(m, extm, txq, send_hdr, &aura) << (i + 55));
 			/* Mark mempool object as "put" since it is freed by NIX
 			 */
 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
@@ -981,9 +1004,8 @@  cn9k_nix_prepare_mseg_vec_list(struct cn9k_eth_txq *txq,
 }

 static __rte_always_inline uint8_t
-cn9k_nix_prepare_mseg_vec(struct cn9k_eth_txq *txq,
-			  struct rte_mbuf *m, uint64_t *cmd, uint64x2_t *cmd0,
-			  uint64x2_t *cmd1, const uint32_t flags)
+cn9k_nix_prepare_mseg_vec(struct cn9k_eth_txq *txq, struct rte_mbuf *m, struct rte_mbuf **extm,
+			  uint64_t *cmd, uint64x2_t *cmd0, uint64x2_t *cmd1, const uint32_t flags)
 {
 	struct nix_send_hdr_s send_hdr;
 	struct rte_mbuf *cookie;
@@ -998,7 +1020,7 @@  cn9k_nix_prepare_mseg_vec(struct cn9k_eth_txq *txq,
 			send_hdr.w1.u = vgetq_lane_u64(cmd0[0], 1);
 			sg.u = vgetq_lane_u64(cmd1[0], 0);
 			aura = send_hdr.w0.aura;
-			sg.u |= (cn9k_nix_prefree_seg(m, txq, &send_hdr, &aura) << 55);
+			sg.u |= (cn9k_nix_prefree_seg(m, extm, txq, &send_hdr, &aura) << 55);
 			send_hdr.w0.aura = aura;
 			cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
 			cmd0[0] = vsetq_lane_u64(send_hdr.w0.u, cmd0[0], 0);
@@ -1021,7 +1043,7 @@  cn9k_nix_prepare_mseg_vec(struct cn9k_eth_txq *txq,
 	send_hdr.w1.u = vgetq_lane_u64(cmd0[0], 1);
 	sg.u = vgetq_lane_u64(cmd1[0], 0);

-	ret = cn9k_nix_prepare_mseg_vec_list(txq, m, cmd, &send_hdr, &sg, flags);
+	ret = cn9k_nix_prepare_mseg_vec_list(txq, m, extm, cmd, &send_hdr, &sg, flags);

 	cmd0[0] = vsetq_lane_u64(send_hdr.w0.u, cmd0[0], 0);
 	cmd0[0] = vsetq_lane_u64(send_hdr.w1.u, cmd0[0], 1);
@@ -1168,6 +1190,7 @@  cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 	uint64_t *lmt_addr = txq->lmt_addr;
 	rte_iova_t io_addr = txq->io_addr;
 	uint64x2_t ltypes01, ltypes23;
+	struct rte_mbuf *extm = NULL;
 	uint64x2_t xtmp128, ytmp128;
 	uint64x2_t xmask01, xmask23;
 	uint64_t lmt_status, i;
@@ -1933,8 +1956,8 @@  cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		if ((flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) &&
 		    !(flags & NIX_TX_MULTI_SEG_F)) {
 			/* Set don't free bit if reference count > 1 */
-			cn9k_nix_prefree_seg_vec(tx_pkts, txq, &senddesc01_w0, &senddesc23_w0,
-						 &senddesc01_w1, &senddesc23_w1);
+			cn9k_nix_prefree_seg_vec(tx_pkts, &extm, txq, &senddesc01_w0,
+						 &senddesc23_w0, &senddesc01_w1, &senddesc23_w1);
 			/* Ensuring mbuf fields which got updated in
 			 * cnxk_nix_prefree_seg are written before LMTST.
 			 */
@@ -1995,7 +2018,7 @@  cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 			/* Build mseg list for each packet individually. */
 			for (j = 0; j < NIX_DESCS_PER_LOOP; j++)
 				segdw[j] = cn9k_nix_prepare_mseg_vec(txq,
-							tx_pkts[j],
+							tx_pkts[j], &extm,
 							seg_list[j], &cmd0[j],
 							&cmd1[j], flags);
 			segdw[4] = 8;
@@ -2070,6 +2093,9 @@  cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
 		tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
 	}

+	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && !txq->tx_compl.ena)
+		cn9k_nix_free_extmbuf(extm);
+
 	if (unlikely(pkts_left)) {
 		if (flags & NIX_TX_MULTI_SEG_F)
 			pkts += cn9k_nix_xmit_pkts_mseg(tx_queue, tx_pkts,