[v2] net/iavf: fix TSO offload for tunnel case

Message ID 20220926051725.261950-1-zhichaox.zeng@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Qi Zhang
Headers
Series [v2] net/iavf: fix TSO offload for tunnel case |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-aarch64-compile-testing fail Testing issues
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-aarch64-unit-testing fail Testing issues
ci/iol-x86_64-unit-testing success Testing PASS
ci/github-robot: build fail github build: failed
ci/iol-x86_64-compile-testing success Testing PASS
ci/intel-Testing success Testing PASS

Commit Message

Zhichao Zeng Sept. 26, 2022, 5:17 a.m. UTC
This patch is to fix the tunnel TSO not enabling issue, simplify
the logic of calculating 'Tx Buffer Size' of data descriptor with IPSec
and fix handling that the mbuf size exceeds the TX descriptor
hardware limit(1B-16KB) which causes malicious behavior to the NIC.

Fixes: 1e728b01120c ("net/iavf: rework Tx path")

---
v2: rework patch

Signed-off-by: Zhichao Zeng <zhichaox.zeng@intel.com>
---
 drivers/common/iavf/iavf_osdep.h |  2 +
 drivers/net/iavf/iavf_rxtx.c     | 95 +++++++++++++++++++-------------
 2 files changed, 59 insertions(+), 38 deletions(-)
  

Comments

Ke Xu Sept. 26, 2022, 9:48 a.m. UTC | #1
> -----Original Message-----
> From: Zhichao Zeng <zhichaox.zeng@intel.com>
> Sent: Monday, September 26, 2022 1:17 PM
> To: dev@dpdk.org
> Cc: Yang, Qiming <qiming.yang@intel.com>; Zhou, YidingX
> <yidingx.zhou@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Zeng,
> ZhichaoX <zhichaox.zeng@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>;
> Xing, Beilei <beilei.xing@intel.com>; Sinha, Abhijit <abhijit.sinha@intel.com>;
> Doherty, Declan <declan.doherty@intel.com>; Nicolau, Radu
> <radu.nicolau@intel.com>
> Subject: [PATCH v2] net/iavf: fix TSO offload for tunnel case
> 
> This patch is to fix the tunnel TSO not enabling issue, simplify the logic of
> calculating 'Tx Buffer Size' of data descriptor with IPSec and fix handling that
> the mbuf size exceeds the TX descriptor hardware limit(1B-16KB) which
> causes malicious behavior to the NIC.
> 
> Fixes: 1e728b01120c ("net/iavf: rework Tx path")
> 
> ---
> v2: rework patch
> 
> Signed-off-by: Zhichao Zeng <zhichaox.zeng@intel.com>

Tested and passed.
Tested-bu: Ke Xu <ke1.xu@intel.com>

> ---
>  drivers/common/iavf/iavf_osdep.h |  2 +
>  drivers/net/iavf/iavf_rxtx.c     | 95 +++++++++++++++++++-------------
>  2 files changed, 59 insertions(+), 38 deletions(-)
  
Qi Zhang Sept. 27, 2022, 2:33 a.m. UTC | #2
> -----Original Message-----
> From: Zeng, ZhichaoX <zhichaox.zeng@intel.com>
> Sent: Monday, September 26, 2022 1:17 PM
> To: dev@dpdk.org
> Cc: Yang, Qiming <qiming.yang@intel.com>; Zhou, YidingX
> <yidingx.zhou@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Zeng,
> ZhichaoX <zhichaox.zeng@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>;
> Xing, Beilei <beilei.xing@intel.com>; Sinha, Abhijit <abhijit.sinha@intel.com>;
> Doherty, Declan <declan.doherty@intel.com>; Nicolau, Radu
> <radu.nicolau@intel.com>
> Subject: [PATCH v2] net/iavf: fix TSO offload for tunnel case
> 
> This patch is to fix the tunnel TSO not enabling issue, simplify the logic of
> calculating 'Tx Buffer Size' of data descriptor with IPSec and fix handling that
> the mbuf size exceeds the TX descriptor hardware limit(1B-16KB) which
> causes malicious behavior to the NIC.
> 
> Fixes: 1e728b01120c ("net/iavf: rework Tx path")
> 
> ---
> v2: rework patch
> 
> Signed-off-by: Zhichao Zeng <zhichaox.zeng@intel.com>
> ---
>  drivers/common/iavf/iavf_osdep.h |  2 +
>  drivers/net/iavf/iavf_rxtx.c     | 95 +++++++++++++++++++-------------
>  2 files changed, 59 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/common/iavf/iavf_osdep.h
> b/drivers/common/iavf/iavf_osdep.h
> index 31d3d809f9..bf1436dfc6 100644
> --- a/drivers/common/iavf/iavf_osdep.h
> +++ b/drivers/common/iavf/iavf_osdep.h
> @@ -126,6 +126,8 @@ writeq(uint64_t value, volatile void *addr)  #define
> iavf_memset(a, b, c, d) memset((a), (b), (c))  #define iavf_memcpy(a, b, c, d)
> rte_memcpy((a), (b), (c))
> 
> +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
> +

This looks like not necessary be added in osdep.h
Can we simply  make it local or at some header file in net/iavf, so we don't need to have a patch that cross the modules.

>  #define iavf_usec_delay(x) rte_delay_us_sleep(x)  #define
> iavf_msec_delay(x) iavf_usec_delay(1000 * (x))
> 
> diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c index
> 109ba756f8..a06d9d3da6 100644
> --- a/drivers/net/iavf/iavf_rxtx.c
> +++ b/drivers/net/iavf/iavf_rxtx.c
> @@ -2417,7 +2417,7 @@ iavf_fill_ctx_desc_segmentation_field(volatile
> uint64_t *field,
>  		total_length = m->pkt_len - (m->l2_len + m->l3_len + m-
> >l4_len);
> 
>  		if (m->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK)
> -			total_length -= m->outer_l3_len;
> +			total_length -= m->outer_l3_len + m->outer_l2_len;
>  	}
> 
>  #ifdef RTE_LIBRTE_IAVF_DEBUG_TX
> @@ -2581,50 +2581,39 @@ iavf_build_data_desc_cmd_offset_fields(volatile
> uint64_t *qw1,
>  		((uint64_t)l2tag1 <<
> IAVF_TXD_DATA_QW1_L2TAG1_SHIFT));  }
> 
> +/* HW requires that TX buffer size ranges from 1B up to (16K-1)B. */
> +#define IAVF_MAX_DATA_PER_TXD \
> +	(IAVF_TXD_QW1_TX_BUF_SZ_MASK >>
> IAVF_TXD_QW1_TX_BUF_SZ_SHIFT)
> +
> +/* Calculate the number of TX descriptors needed for each pkt */ static
> +inline uint16_t iavf_calc_pkt_desc(struct rte_mbuf *tx_pkt) {
> +	struct rte_mbuf *txd = tx_pkt;
> +	uint16_t count = 0;
> +
> +	while (txd != NULL) {
> +		count += DIV_ROUND_UP(txd->data_len,
> IAVF_MAX_DATA_PER_TXD);
> +		txd = txd->next;
> +	}
> +
> +	return count;
> +}
> +
>  static inline void
>  iavf_fill_data_desc(volatile struct iavf_tx_desc *desc,
> -	struct rte_mbuf *m, uint64_t desc_template,
> -	uint16_t tlen, uint16_t ipseclen)
> +	uint64_t desc_template,	uint16_t buffsz,
> +	uint64_t buffer_addr)
>  {
> -	uint32_t hdrlen = m->l2_len;
> -	uint32_t bufsz = 0;
> -
>  	/* fill data descriptor qw1 from template */
>  	desc->cmd_type_offset_bsz = desc_template;
> 
> -	/* set data buffer address */
> -	desc->buffer_addr = rte_mbuf_data_iova(m);
> -
> -	/* calculate data buffer size less set header lengths */
> -	if ((m->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) &&
> -			(m->ol_flags & (RTE_MBUF_F_TX_TCP_SEG |
> -					RTE_MBUF_F_TX_UDP_SEG))) {
> -		hdrlen += m->outer_l3_len;
> -		if (m->ol_flags & RTE_MBUF_F_TX_L4_MASK)
> -			hdrlen += m->l3_len + m->l4_len;
> -		else
> -			hdrlen += m->l3_len;
> -		if (m->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD)
> -			hdrlen += ipseclen;
> -		bufsz = hdrlen + tlen;
> -	} else if ((m->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) &&
> -			(m->ol_flags & (RTE_MBUF_F_TX_TCP_SEG |
> -					RTE_MBUF_F_TX_UDP_SEG))) {
> -		hdrlen += m->outer_l3_len + m->l3_len + ipseclen;
> -		if (m->ol_flags & RTE_MBUF_F_TX_L4_MASK)
> -			hdrlen += m->l4_len;
> -		bufsz = hdrlen + tlen;
> -
> -	} else {
> -		bufsz = m->data_len;
> -	}
> -
>  	/* set data buffer size */
>  	desc->cmd_type_offset_bsz |=
> -		(((uint64_t)bufsz <<
> IAVF_TXD_DATA_QW1_TX_BUF_SZ_SHIFT) &
> +		(((uint64_t)buffsz <<
> IAVF_TXD_DATA_QW1_TX_BUF_SZ_SHIFT) &
>  		IAVF_TXD_DATA_QW1_TX_BUF_SZ_MASK);
> 
> -	desc->buffer_addr = rte_cpu_to_le_64(desc->buffer_addr);
> +	desc->buffer_addr = rte_cpu_to_le_64(buffer_addr);
>  	desc->cmd_type_offset_bsz = rte_cpu_to_le_64(desc-
> >cmd_type_offset_bsz);
>  }
> 
> @@ -2649,8 +2638,10 @@ iavf_xmit_pkts(void *tx_queue, struct rte_mbuf
> **tx_pkts, uint16_t nb_pkts)
>  	struct iavf_tx_entry *txe_ring = txq->sw_ring;
>  	struct iavf_tx_entry *txe, *txn;
>  	struct rte_mbuf *mb, *mb_seg;
> +	uint64_t buf_dma_addr;
>  	uint16_t desc_idx, desc_idx_last;
>  	uint16_t idx;
> +	uint16_t slen;
> 
> 
>  	/* Check if the descriptor ring needs to be cleaned. */ @@ -2689,8
> +2680,14 @@ iavf_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
> uint16_t nb_pkts)
>  		 * The number of descriptors that must be allocated for
>  		 * a packet equals to the number of the segments of that
>  		 * packet plus the context and ipsec descriptors if needed.
> +		 * Recalculate the needed tx descs when TSO enabled in case
> +		 * the mbuf data size exceeds max data size that hw allows
> +		 * per tx desc.
>  		 */
> -		nb_desc_required = nb_desc_data + nb_desc_ctx +
> nb_desc_ipsec;
> +		if (mb->ol_flags & RTE_MBUF_F_TX_TCP_SEG)
> +			nb_desc_required = iavf_calc_pkt_desc(mb) +
> nb_desc_ctx + nb_desc_ipsec;
> +		else
> +			nb_desc_required = nb_desc_data + nb_desc_ctx +
> nb_desc_ipsec;
> 
>  		desc_idx_last = (uint16_t)(desc_idx + nb_desc_required - 1);
> 
> @@ -2786,8 +2783,30 @@ iavf_xmit_pkts(void *tx_queue, struct rte_mbuf
> **tx_pkts, uint16_t nb_pkts)
>  				rte_pktmbuf_free_seg(txe->mbuf);
> 
>  			txe->mbuf = mb_seg;
> -			iavf_fill_data_desc(ddesc, mb_seg,
> -					ddesc_template, tlen, ipseclen);
> +			slen = mb_seg->data_len;
> +			if (mb_seg->ol_flags &
> RTE_MBUF_F_TX_SEC_OFFLOAD)
> +				slen += ipseclen;
> +			buf_dma_addr = rte_mbuf_data_iova(mb_seg);
> +			while ((mb_seg->ol_flags &
> (RTE_MBUF_F_TX_TCP_SEG |
> +					RTE_MBUF_F_TX_UDP_SEG)) &&
> +					unlikely(slen >
> IAVF_MAX_DATA_PER_TXD)) {
> +				iavf_fill_data_desc(ddesc, ddesc_template,
> +					IAVF_MAX_DATA_PER_TXD,
> buf_dma_addr);
> +
> +				IAVF_DUMP_TX_DESC(txq, ddesc, desc_idx);
> +
> +				buf_dma_addr +=
> IAVF_MAX_DATA_PER_TXD;
> +				slen -= IAVF_MAX_DATA_PER_TXD;
> +
> +				txe->last_id = desc_idx_last;
> +				desc_idx = txe->next_id;
> +				txe = txn;
> +				ddesc = &txr[desc_idx];
> +				txn = &txe_ring[txe->next_id];
> +			}
> +
> +			iavf_fill_data_desc(ddesc, ddesc_template,
> +					slen, buf_dma_addr);
> 
>  			IAVF_DUMP_TX_DESC(txq, ddesc, desc_idx);
> 
> --
> 2.25.1
  

Patch

diff --git a/drivers/common/iavf/iavf_osdep.h b/drivers/common/iavf/iavf_osdep.h
index 31d3d809f9..bf1436dfc6 100644
--- a/drivers/common/iavf/iavf_osdep.h
+++ b/drivers/common/iavf/iavf_osdep.h
@@ -126,6 +126,8 @@  writeq(uint64_t value, volatile void *addr)
 #define iavf_memset(a, b, c, d) memset((a), (b), (c))
 #define iavf_memcpy(a, b, c, d) rte_memcpy((a), (b), (c))
 
+#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+
 #define iavf_usec_delay(x) rte_delay_us_sleep(x)
 #define iavf_msec_delay(x) iavf_usec_delay(1000 * (x))
 
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 109ba756f8..a06d9d3da6 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2417,7 +2417,7 @@  iavf_fill_ctx_desc_segmentation_field(volatile uint64_t *field,
 		total_length = m->pkt_len - (m->l2_len + m->l3_len + m->l4_len);
 
 		if (m->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK)
-			total_length -= m->outer_l3_len;
+			total_length -= m->outer_l3_len + m->outer_l2_len;
 	}
 
 #ifdef RTE_LIBRTE_IAVF_DEBUG_TX
@@ -2581,50 +2581,39 @@  iavf_build_data_desc_cmd_offset_fields(volatile uint64_t *qw1,
 		((uint64_t)l2tag1 << IAVF_TXD_DATA_QW1_L2TAG1_SHIFT));
 }
 
+/* HW requires that TX buffer size ranges from 1B up to (16K-1)B. */
+#define IAVF_MAX_DATA_PER_TXD \
+	(IAVF_TXD_QW1_TX_BUF_SZ_MASK >> IAVF_TXD_QW1_TX_BUF_SZ_SHIFT)
+
+/* Calculate the number of TX descriptors needed for each pkt */
+static inline uint16_t
+iavf_calc_pkt_desc(struct rte_mbuf *tx_pkt)
+{
+	struct rte_mbuf *txd = tx_pkt;
+	uint16_t count = 0;
+
+	while (txd != NULL) {
+		count += DIV_ROUND_UP(txd->data_len, IAVF_MAX_DATA_PER_TXD);
+		txd = txd->next;
+	}
+
+	return count;
+}
+
 static inline void
 iavf_fill_data_desc(volatile struct iavf_tx_desc *desc,
-	struct rte_mbuf *m, uint64_t desc_template,
-	uint16_t tlen, uint16_t ipseclen)
+	uint64_t desc_template,	uint16_t buffsz,
+	uint64_t buffer_addr)
 {
-	uint32_t hdrlen = m->l2_len;
-	uint32_t bufsz = 0;
-
 	/* fill data descriptor qw1 from template */
 	desc->cmd_type_offset_bsz = desc_template;
 
-	/* set data buffer address */
-	desc->buffer_addr = rte_mbuf_data_iova(m);
-
-	/* calculate data buffer size less set header lengths */
-	if ((m->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) &&
-			(m->ol_flags & (RTE_MBUF_F_TX_TCP_SEG |
-					RTE_MBUF_F_TX_UDP_SEG))) {
-		hdrlen += m->outer_l3_len;
-		if (m->ol_flags & RTE_MBUF_F_TX_L4_MASK)
-			hdrlen += m->l3_len + m->l4_len;
-		else
-			hdrlen += m->l3_len;
-		if (m->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD)
-			hdrlen += ipseclen;
-		bufsz = hdrlen + tlen;
-	} else if ((m->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) &&
-			(m->ol_flags & (RTE_MBUF_F_TX_TCP_SEG |
-					RTE_MBUF_F_TX_UDP_SEG))) {
-		hdrlen += m->outer_l3_len + m->l3_len + ipseclen;
-		if (m->ol_flags & RTE_MBUF_F_TX_L4_MASK)
-			hdrlen += m->l4_len;
-		bufsz = hdrlen + tlen;
-
-	} else {
-		bufsz = m->data_len;
-	}
-
 	/* set data buffer size */
 	desc->cmd_type_offset_bsz |=
-		(((uint64_t)bufsz << IAVF_TXD_DATA_QW1_TX_BUF_SZ_SHIFT) &
+		(((uint64_t)buffsz << IAVF_TXD_DATA_QW1_TX_BUF_SZ_SHIFT) &
 		IAVF_TXD_DATA_QW1_TX_BUF_SZ_MASK);
 
-	desc->buffer_addr = rte_cpu_to_le_64(desc->buffer_addr);
+	desc->buffer_addr = rte_cpu_to_le_64(buffer_addr);
 	desc->cmd_type_offset_bsz = rte_cpu_to_le_64(desc->cmd_type_offset_bsz);
 }
 
@@ -2649,8 +2638,10 @@  iavf_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 	struct iavf_tx_entry *txe_ring = txq->sw_ring;
 	struct iavf_tx_entry *txe, *txn;
 	struct rte_mbuf *mb, *mb_seg;
+	uint64_t buf_dma_addr;
 	uint16_t desc_idx, desc_idx_last;
 	uint16_t idx;
+	uint16_t slen;
 
 
 	/* Check if the descriptor ring needs to be cleaned. */
@@ -2689,8 +2680,14 @@  iavf_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 		 * The number of descriptors that must be allocated for
 		 * a packet equals to the number of the segments of that
 		 * packet plus the context and ipsec descriptors if needed.
+		 * Recalculate the needed tx descs when TSO enabled in case
+		 * the mbuf data size exceeds max data size that hw allows
+		 * per tx desc.
 		 */
-		nb_desc_required = nb_desc_data + nb_desc_ctx + nb_desc_ipsec;
+		if (mb->ol_flags & RTE_MBUF_F_TX_TCP_SEG)
+			nb_desc_required = iavf_calc_pkt_desc(mb) + nb_desc_ctx + nb_desc_ipsec;
+		else
+			nb_desc_required = nb_desc_data + nb_desc_ctx + nb_desc_ipsec;
 
 		desc_idx_last = (uint16_t)(desc_idx + nb_desc_required - 1);
 
@@ -2786,8 +2783,30 @@  iavf_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 				rte_pktmbuf_free_seg(txe->mbuf);
 
 			txe->mbuf = mb_seg;
-			iavf_fill_data_desc(ddesc, mb_seg,
-					ddesc_template, tlen, ipseclen);
+			slen = mb_seg->data_len;
+			if (mb_seg->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD)
+				slen += ipseclen;
+			buf_dma_addr = rte_mbuf_data_iova(mb_seg);
+			while ((mb_seg->ol_flags & (RTE_MBUF_F_TX_TCP_SEG |
+					RTE_MBUF_F_TX_UDP_SEG)) &&
+					unlikely(slen > IAVF_MAX_DATA_PER_TXD)) {
+				iavf_fill_data_desc(ddesc, ddesc_template,
+					IAVF_MAX_DATA_PER_TXD, buf_dma_addr);
+
+				IAVF_DUMP_TX_DESC(txq, ddesc, desc_idx);
+
+				buf_dma_addr += IAVF_MAX_DATA_PER_TXD;
+				slen -= IAVF_MAX_DATA_PER_TXD;
+
+				txe->last_id = desc_idx_last;
+				desc_idx = txe->next_id;
+				txe = txn;
+				ddesc = &txr[desc_idx];
+				txn = &txe_ring[txe->next_id];
+			}
+
+			iavf_fill_data_desc(ddesc, ddesc_template,
+					slen, buf_dma_addr);
 
 			IAVF_DUMP_TX_DESC(txq, ddesc, desc_idx);