[3/3] net/i40e: fix segment fault in AVX512

Message ID 1615512441-17495-1-git-send-email-wenzhuo.lu@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Qi Zhang
Headers
Series [1/3] net/iavf: fix segment fault in AVX512 |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS
ci/travis-robot success travis build: passed
ci/github-robot success github build: passed
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-mellanox-Functional success Functional Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-abi-testing success Testing PASS
ci/iol-testing success Testing PASS

Commit Message

Wenzhuo Lu March 12, 2021, 1:27 a.m. UTC
  Fix segment fault when failing to get the memory from the pool.

Fixes: e6a6a138919f ("net/i40e: add AVX512 vector path")
Cc: stable@dpdk.org

Reported-by: David Coyle <David.Coyle@intel.com>
Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 drivers/net/i40e/i40e_rxtx_vec_avx512.c | 128 ++++++++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)
  

Comments

Coyle, David March 15, 2021, 5:38 p.m. UTC | #1
Hi Wenzhuo

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Wenzhuo Lu
> Sent: Friday, March 12, 2021 1:27 AM
> To: dev@dpdk.org
> Cc: Lu, Wenzhuo <wenzhuo.lu@intel.com>; stable@dpdk.org
> Subject: [dpdk-dev] [PATCH 3/3] net/i40e: fix segment fault in AVX512
> 
> Fix segment fault when failing to get the memory from the pool.
> 
> Fixes: e6a6a138919f ("net/i40e: add AVX512 vector path")
> Cc: stable@dpdk.org
> 
> Reported-by: David Coyle <David.Coyle@intel.com>
> Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
> ---
>  drivers/net/i40e/i40e_rxtx_vec_avx512.c | 128
> ++++++++++++++++++++++++++++++++
>  1 file changed, 128 insertions(+)
> 
> diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> index 862c916..36521da 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> +++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> @@ -32,6 +32,9 @@
> 
>  	rxdp = rxq->rx_ring + rxq->rxrearm_start;
> 
> +	if (!cache)
> +		goto normal;

[DC] Like in IAVF and ICE, should we also check for cache->len == 0, like is done in Tx path?

> +
>  	/* We need to pull 'n' more MBUFs into the software ring from
> mempool
>  	 * We inline the mempool function here, so we can vectorize the
> copy
>  	 * from the cache into the shadow ring.
> @@ -132,7 +135,132 @@
>  #endif
>  		rxep += 8, rxdp += 8, cache->len -= 8;
>  	}
> +	goto done;
> +
> +normal:
> +	/* Pull 'n' more MBUFs into the software ring */
> +	if (rte_mempool_get_bulk(rxq->mp,
> +				 (void *)rxep,
> +				 RTE_I40E_RXQ_REARM_THRESH) < 0) {
> +		if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
> +		    rxq->nb_rx_desc) {
> +			__m128i dma_addr0;
> +
> +			dma_addr0 = _mm_setzero_si128();
> +			for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
> +				rxep[i].mbuf = &rxq->fake_mbuf;
> +				_mm_store_si128((__m128i *)&rxdp[i].read,
> +						dma_addr0);
> +			}
> +		}
> +		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed
> +=
> +			RTE_I40E_RXQ_REARM_THRESH;
> +		return;
> +	}
> +
> +#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
> +	struct rte_mbuf *mb0, *mb1;
> +	__m128i dma_addr0, dma_addr1;
> +	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
> +			RTE_PKTMBUF_HEADROOM);
> +	/* Initialize the mbufs in vector, process 4 mbufs in one loop */

[DC] Comment should say 2 mbufs

> +	for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
> +		__m128i vaddr0, vaddr1;
> +
> +		mb0 = rxep[0].mbuf;
> +		mb1 = rxep[1].mbuf;
> +
> +		/* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */
> +		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
> +				offsetof(struct rte_mbuf, buf_addr) + 8);
> +		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
> +		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
> +
> +		/* convert pa to dma_addr hdr/data */
> +		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
> +		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
> +
> +		/* add headroom to pa values */
> +		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
> +		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
> +
> +		/* flush desc with pa dma_addr */
> +		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
> +		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
> +	}
> +#else
> +	struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
> +	struct rte_mbuf *mb4, *mb5, *mb6, *mb7;
> +	__m512i dma_addr0_3, dma_addr4_7;
> +	__m512i hdr_room =
> _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
> +	/* Initialize the mbufs in vector, process 4 mbufs in one loop */

[DC] Comment should say 8 mbufs

> +	for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH;
> +			i += 8, rxep += 8, rxdp += 8) {
> +		__m128i vaddr0, vaddr1, vaddr2, vaddr3;
> +		__m128i vaddr4, vaddr5, vaddr6, vaddr7;

<snip>

> +		vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr);
> +		vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr);
> +
> +		/**
> +		 * merge 0 & 1, by casting 0 to 256-bit and inserting 1
> +		 * into the high lanes. Similarly for 2 & 3
> +		 */

[DC] Comment should say "Similarly for 2 & 3, 4 & 5, 6 & 7"

> +		vaddr0_1 =
> +
> 	_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
> +						vaddr1, 1);

<snip>

> +		/* flush desc with pa dma_addr */
> +		_mm512_store_si512((__m512i *)&rxdp->read,
> dma_addr0_3);
> +		_mm512_store_si512((__m512i *)&(rxdp + 4)->read,
> dma_addr4_7);
> +	}
> +#endif

[DC] Again, there's common code here with the avx2 file and also with the IAVF and ICE PMDs.

As I said in other reviews, maybe it's not practical to share code across PMDs.
But might be good to have some common functions within each PMD for avx2 and avx512 paths

> 
> +done:
>  	rxq->rxrearm_start += RTE_I40E_RXQ_REARM_THRESH;
>  	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
>  		rxq->rxrearm_start = 0;

The patch fixes the seg fault, but note I have only tested the default '#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC ' path

Tested-by: David Coyle <david.coyle@intel.com>
  
Wenzhuo Lu March 26, 2021, 1:50 a.m. UTC | #2
Hi David,
Sorry for the late response.

> > +	if (!cache)
> > +		goto normal;
> 
> [DC] Like in IAVF and ICE, should we also check for cache->len == 0, like is
> done in Tx path?
Not necessary because below code tries to get more buffer is length is not as long as needed, it covers the case even if the length is 0.

> > +	/* Initialize the mbufs in vector, process 4 mbufs in one loop */
> 
> [DC] Comment should say 2 mbufs
Thanks. Will correct it.

> > _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
> > +	/* Initialize the mbufs in vector, process 4 mbufs in one loop */
> 
> [DC] Comment should say 8 mbufs
Thanks. Will correct it.

> > +		/**
> > +		 * merge 0 & 1, by casting 0 to 256-bit and inserting 1
> > +		 * into the high lanes. Similarly for 2 & 3
> > +		 */
> 
> [DC] Comment should say "Similarly for 2 & 3, 4 & 5, 6 & 7"
Thanks. Will correct it.

> 
> > +		vaddr0_1 =
> > +
> > 	_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
> > +						vaddr1, 1);
> 
> <snip>
> 
> > +		/* flush desc with pa dma_addr */
> > +		_mm512_store_si512((__m512i *)&rxdp->read,
> > dma_addr0_3);
> > +		_mm512_store_si512((__m512i *)&(rxdp + 4)->read,
> > dma_addr4_7);
> > +	}
> > +#endif
> 
> [DC] Again, there's common code here with the avx2 file and also with the
> IAVF and ICE PMDs.
> 
> As I said in other reviews, maybe it's not practical to share code across PMDs.
> But might be good to have some common functions within each PMD for
> avx2 and avx512 paths
Most of the code looks same but not all. In this avx512 file, some avx512 instructions are used to replace the avx2 ones.
Let me check if some common code can be saved.
  

Patch

diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 862c916..36521da 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -32,6 +32,9 @@ 
 
 	rxdp = rxq->rx_ring + rxq->rxrearm_start;
 
+	if (!cache)
+		goto normal;
+
 	/* We need to pull 'n' more MBUFs into the software ring from mempool
 	 * We inline the mempool function here, so we can vectorize the copy
 	 * from the cache into the shadow ring.
@@ -132,7 +135,132 @@ 
 #endif
 		rxep += 8, rxdp += 8, cache->len -= 8;
 	}
+	goto done;
+
+normal:
+	/* Pull 'n' more MBUFs into the software ring */
+	if (rte_mempool_get_bulk(rxq->mp,
+				 (void *)rxep,
+				 RTE_I40E_RXQ_REARM_THRESH) < 0) {
+		if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
+		    rxq->nb_rx_desc) {
+			__m128i dma_addr0;
+
+			dma_addr0 = _mm_setzero_si128();
+			for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+				rxep[i].mbuf = &rxq->fake_mbuf;
+				_mm_store_si128((__m128i *)&rxdp[i].read,
+						dma_addr0);
+			}
+		}
+		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+			RTE_I40E_RXQ_REARM_THRESH;
+		return;
+	}
+
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+	struct rte_mbuf *mb0, *mb1;
+	__m128i dma_addr0, dma_addr1;
+	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
+			RTE_PKTMBUF_HEADROOM);
+	/* Initialize the mbufs in vector, process 4 mbufs in one loop */
+	for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
+		__m128i vaddr0, vaddr1;
+
+		mb0 = rxep[0].mbuf;
+		mb1 = rxep[1].mbuf;
+
+		/* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+				offsetof(struct rte_mbuf, buf_addr) + 8);
+		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+		/* convert pa to dma_addr hdr/data */
+		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
+
+		/* add headroom to pa values */
+		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
+
+		/* flush desc with pa dma_addr */
+		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
+	}
+#else
+	struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
+	struct rte_mbuf *mb4, *mb5, *mb6, *mb7;
+	__m512i dma_addr0_3, dma_addr4_7;
+	__m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
+	/* Initialize the mbufs in vector, process 4 mbufs in one loop */
+	for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH;
+			i += 8, rxep += 8, rxdp += 8) {
+		__m128i vaddr0, vaddr1, vaddr2, vaddr3;
+		__m128i vaddr4, vaddr5, vaddr6, vaddr7;
+		__m256i vaddr0_1, vaddr2_3;
+		__m256i vaddr4_5, vaddr6_7;
+		__m512i vaddr0_3, vaddr4_7;
+
+		mb0 = rxep[0].mbuf;
+		mb1 = rxep[1].mbuf;
+		mb2 = rxep[2].mbuf;
+		mb3 = rxep[3].mbuf;
+		mb4 = rxep[4].mbuf;
+		mb5 = rxep[5].mbuf;
+		mb6 = rxep[6].mbuf;
+		mb7 = rxep[7].mbuf;
+
+		/* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+				offsetof(struct rte_mbuf, buf_addr) + 8);
+		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+		vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
+		vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
+		vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr);
+		vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr);
+		vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr);
+		vaddr7 = _mm_loadu_si128((__m128i *)&mb7->buf_addr);
+
+		/**
+		 * merge 0 & 1, by casting 0 to 256-bit and inserting 1
+		 * into the high lanes. Similarly for 2 & 3
+		 */
+		vaddr0_1 =
+			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
+						vaddr1, 1);
+		vaddr2_3 =
+			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
+						vaddr3, 1);
+		vaddr4_5 =
+			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr4),
+						vaddr5, 1);
+		vaddr6_7 =
+			_mm256_inserti128_si256(_mm256_castsi128_si256(vaddr6),
+						vaddr7, 1);
+		vaddr0_3 =
+			_mm512_inserti64x4(_mm512_castsi256_si512(vaddr0_1),
+						vaddr2_3, 1);
+		vaddr4_7 =
+			_mm512_inserti64x4(_mm512_castsi256_si512(vaddr4_5),
+						vaddr6_7, 1);
+
+		/* convert pa to dma_addr hdr/data */
+		dma_addr0_3 = _mm512_unpackhi_epi64(vaddr0_3, vaddr0_3);
+		dma_addr4_7 = _mm512_unpackhi_epi64(vaddr4_7, vaddr4_7);
+
+		/* add headroom to pa values */
+		dma_addr0_3 = _mm512_add_epi64(dma_addr0_3, hdr_room);
+		dma_addr4_7 = _mm512_add_epi64(dma_addr4_7, hdr_room);
+
+		/* flush desc with pa dma_addr */
+		_mm512_store_si512((__m512i *)&rxdp->read, dma_addr0_3);
+		_mm512_store_si512((__m512i *)&(rxdp + 4)->read, dma_addr4_7);
+	}
+#endif
 
+done:
 	rxq->rxrearm_start += RTE_I40E_RXQ_REARM_THRESH;
 	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
 		rxq->rxrearm_start = 0;