[v4,4/5] net/ice: fix vector rx burst for ice

Message ID 20200917075834.60034-5-jia.guo@intel.com (mailing list archive)
State Changes Requested, archived
Delegated to: Qi Zhang
Headers
Series fix vector rx burst for PMDs |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Guo, Jia Sept. 17, 2020, 7:58 a.m. UTC
  The limitation of burst size in vector rx was removed, since it should
retrieve as much received packets as possible. And also the scattered
receive path should use a wrapper function to achieve the goal of
burst maximizing. And do some code cleaning for vector rx path.

Bugzilla ID: 516
Fixes: c68a52b8b38c ("net/ice: support vector SSE in Rx")
Fixes: ae60d3c9b227 ("net/ice: support Rx AVX2 vector")

Signed-off-by: Jeff Guo <jia.guo@intel.com>
Tested-by: Yingya Han <yingyax.han@intel.com>
---
 drivers/net/ice/ice_rxtx.h          |  1 +
 drivers/net/ice/ice_rxtx_vec_avx2.c | 23 ++++++------
 drivers/net/ice/ice_rxtx_vec_sse.c  | 56 +++++++++++++++++++----------
 3 files changed, 49 insertions(+), 31 deletions(-)
  

Comments

Qi Zhang Sept. 17, 2020, 11:03 a.m. UTC | #1
> -----Original Message-----
> From: Guo, Jia <jia.guo@intel.com>
> Sent: Thursday, September 17, 2020 3:59 PM
> To: Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei
> <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Wu, Jingjing
> <jingjing.wu@intel.com>; Wang, Haiyue <haiyue.wang@intel.com>
> Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; dev@dpdk.org; Guo, Jia <jia.guo@intel.com>;
> Zhang, Helin <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit,
> Ferruh <ferruh.yigit@intel.com>; stephen@networkplumber.org;
> barbette@kth.se; Han, YingyaX <yingyax.han@intel.com>
> Subject: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> 
> The limitation of burst size in vector rx was removed, since it should retrieve as
> much received packets as possible. And also the scattered receive path should
> use a wrapper function to achieve the goal of burst maximizing. And do some
> code cleaning for vector rx path.
> 
> Bugzilla ID: 516
> Fixes: c68a52b8b38c ("net/ice: support vector SSE in Rx")
> Fixes: ae60d3c9b227 ("net/ice: support Rx AVX2 vector")
> 
> Signed-off-by: Jeff Guo <jia.guo@intel.com>
> Tested-by: Yingya Han <yingyax.han@intel.com>
> ---
>  drivers/net/ice/ice_rxtx.h          |  1 +
>  drivers/net/ice/ice_rxtx_vec_avx2.c | 23 ++++++------
> drivers/net/ice/ice_rxtx_vec_sse.c  | 56 +++++++++++++++++++----------
>  3 files changed, 49 insertions(+), 31 deletions(-)
> 
> diff --git a/drivers/net/ice/ice_rxtx.h b/drivers/net/ice/ice_rxtx.h index
> 2fdcfb7d0..3ef5f300d 100644
> --- a/drivers/net/ice/ice_rxtx.h
> +++ b/drivers/net/ice/ice_rxtx.h
> @@ -35,6 +35,7 @@
>  #define ICE_MAX_RX_BURST            ICE_RXQ_REARM_THRESH
>  #define ICE_TX_MAX_FREE_BUF_SZ      64
>  #define ICE_DESCS_PER_LOOP          4
> +#define ICE_DESCS_PER_LOOP_AVX	    8

No need to expose this if no external link, better to keep all avx stuff inside avx.c

> 
>  #define ICE_FDIR_PKT_LEN	512
> 
> diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c
> b/drivers/net/ice/ice_rxtx_vec_avx2.c
> index be50677c2..843e4f32a 100644
> --- a/drivers/net/ice/ice_rxtx_vec_avx2.c
> +++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
> @@ -29,7 +29,7 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
>  			__m128i dma_addr0;
> 
>  			dma_addr0 = _mm_setzero_si128();
> -			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
> +			for (i = 0; i < ICE_DESCS_PER_LOOP_AVX; i++) {
>  				rxep[i].mbuf = &rxq->fake_mbuf;
>  				_mm_store_si128((__m128i *)&rxdp[i].read,
>  						dma_addr0);
> @@ -132,12 +132,17 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
>  	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);  }
> 
> +/**
> + * vPMD raw receive routine, only accept(nb_pkts >=
> +ICE_DESCS_PER_LOOP_AVX)
> + *
> + * Notice:
> + * - nb_pkts < ICE_DESCS_PER_LOOP_AVX, just return no packet
> + * - floor align nb_pkts to a ICE_DESCS_PER_LOOP_AVX power-of-two  */

The comment is misleading, it looks like we are going to floor align nb_pkts to 2^8, better to reword .

>  static inline uint16_t
>  _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf
> **rx_pkts,
>  			    uint16_t nb_pkts, uint8_t *split_packet)  { -#define
> ICE_DESCS_PER_LOOP_AVX 8
> -
>  	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
>  	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
>  			0, rxq->mbuf_initializer);
> @@ -603,10 +608,6 @@ _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue
> *rxq, struct rte_mbuf **rx_pkts,
>  	return received;
>  }
> 
> -/*
> - * Notice:
> - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> - */
>  uint16_t
>  ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
>  		       uint16_t nb_pkts)
> @@ -616,8 +617,6 @@ ice_recv_pkts_vec_avx2(void *rx_queue, struct
> rte_mbuf **rx_pkts,
> 
>  /**
>   * vPMD receive routine that reassembles single burst of 32 scattered
> packets
> - * Notice:
> - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
>   */

Why we need to remove this? is it still true for this function?

>  static uint16_t
>  ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf
> **rx_pkts, @@ -626,6 +625,9 @@ ice_recv_scattered_burst_vec_avx2(void
> *rx_queue, struct rte_mbuf **rx_pkts,
>  	struct ice_rx_queue *rxq = rx_queue;
>  	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
> 
> +	/* split_flags only can support max of ICE_VPMD_RX_BURST */
> +	nb_pkts = RTE_MIN(nb_pkts, ICE_VPMD_RX_BURST);

Is this necessary?  the only consumer of this function is ice_recv_scattered_pkts_vec_avx2, 
I think nb_pkts <= ICE_VPMD_RX_BURST it already be guaranteed.
> +
>  	/* get some new buffers */
>  	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx2(rxq, rx_pkts, nb_pkts,
>  						       split_flags);
> @@ -657,9 +659,6 @@ ice_recv_scattered_burst_vec_avx2(void *rx_queue,
> struct rte_mbuf **rx_pkts,
> 
>  /**
>   * vPMD receive routine that reassembles scattered packets.
> - * Main receive routine that can handle arbitrary burst sizes
> - * Notice:
> - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
>   */

Why we need to remove this? isn't it the main routine that be able to handle arbitrary burst size?

Btw, I will suggest all AVX2 changes can be in a separate patch, because this looks like some code clean and fix.
its not related with the main purpose of the patch set.
  
Guo, Jia Sept. 18, 2020, 3:20 a.m. UTC | #2
Hi, qi

> -----Original Message-----
> From: Zhang, Qi Z <qi.z.zhang@intel.com>
> Sent: Thursday, September 17, 2020 7:03 PM
> To: Guo, Jia <jia.guo@intel.com>; Yang, Qiming <qiming.yang@intel.com>;
> Xing, Beilei <beilei.xing@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>;
> Wang, Haiyue <haiyue.wang@intel.com>
> Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> <ferruh.yigit@intel.com>; stephen@networkplumber.org; barbette@kth.se;
> Han, YingyaX <yingyax.han@intel.com>
> Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> 
> 
> 
> > -----Original Message-----
> > From: Guo, Jia <jia.guo@intel.com>
> > Sent: Thursday, September 17, 2020 3:59 PM
> > To: Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei
> > <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Wu,
> > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue <haiyue.wang@intel.com>
> > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>; dev@dpdk.org; Guo, Jia
> > <jia.guo@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> > mb@smartsharesystems.com; Yigit, Ferruh <ferruh.yigit@intel.com>;
> > stephen@networkplumber.org; barbette@kth.se; Han, YingyaX
> > <yingyax.han@intel.com>
> > Subject: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> >
> > The limitation of burst size in vector rx was removed, since it should
> > retrieve as much received packets as possible. And also the scattered
> > receive path should use a wrapper function to achieve the goal of
> > burst maximizing. And do some code cleaning for vector rx path.
> >
> > Bugzilla ID: 516
> > Fixes: c68a52b8b38c ("net/ice: support vector SSE in Rx")
> > Fixes: ae60d3c9b227 ("net/ice: support Rx AVX2 vector")
> >
> > Signed-off-by: Jeff Guo <jia.guo@intel.com>
> > Tested-by: Yingya Han <yingyax.han@intel.com>
> > ---
> >  drivers/net/ice/ice_rxtx.h          |  1 +
> >  drivers/net/ice/ice_rxtx_vec_avx2.c | 23 ++++++------
> > drivers/net/ice/ice_rxtx_vec_sse.c  | 56 +++++++++++++++++++----------
> >  3 files changed, 49 insertions(+), 31 deletions(-)
> >
> > diff --git a/drivers/net/ice/ice_rxtx.h b/drivers/net/ice/ice_rxtx.h
> > index 2fdcfb7d0..3ef5f300d 100644
> > --- a/drivers/net/ice/ice_rxtx.h
> > +++ b/drivers/net/ice/ice_rxtx.h
> > @@ -35,6 +35,7 @@
> >  #define ICE_MAX_RX_BURST            ICE_RXQ_REARM_THRESH
> >  #define ICE_TX_MAX_FREE_BUF_SZ      64
> >  #define ICE_DESCS_PER_LOOP          4
> > +#define ICE_DESCS_PER_LOOP_AVX	    8
> 
> No need to expose this if no external link, better to keep all avx stuff inside
> avx.c
> 

Ok, so define it in avx.c is the best choice if avx should not in rxtx.h.

> >
> >  #define ICE_FDIR_PKT_LEN	512
> >
> > diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c
> > b/drivers/net/ice/ice_rxtx_vec_avx2.c
> > index be50677c2..843e4f32a 100644
> > --- a/drivers/net/ice/ice_rxtx_vec_avx2.c
> > +++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
> > @@ -29,7 +29,7 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
> >  			__m128i dma_addr0;
> >
> >  			dma_addr0 = _mm_setzero_si128();
> > -			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
> > +			for (i = 0; i < ICE_DESCS_PER_LOOP_AVX; i++) {
> >  				rxep[i].mbuf = &rxq->fake_mbuf;
> >  				_mm_store_si128((__m128i *)&rxdp[i].read,
> >  						dma_addr0);
> > @@ -132,12 +132,17 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
> >  	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);  }
> >
> > +/**
> > + * vPMD raw receive routine, only accept(nb_pkts >=
> > +ICE_DESCS_PER_LOOP_AVX)
> > + *
> > + * Notice:
> > + * - nb_pkts < ICE_DESCS_PER_LOOP_AVX, just return no packet
> > + * - floor align nb_pkts to a ICE_DESCS_PER_LOOP_AVX power-of-two  */
> 
> The comment is misleading, it looks like we are going to floor align nb_pkts to
> 2^8, better to reword .
> 

It should be, agree.

> >  static inline uint16_t
> >  _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf
> > **rx_pkts,
> >  			    uint16_t nb_pkts, uint8_t *split_packet)  { -#define
> > ICE_DESCS_PER_LOOP_AVX 8
> > -
> >  	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
> >  	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
> >  			0, rxq->mbuf_initializer);
> > @@ -603,10 +608,6 @@ _ice_recv_raw_pkts_vec_avx2(struct
> ice_rx_queue
> > *rxq, struct rte_mbuf **rx_pkts,
> >  	return received;
> >  }
> >
> > -/*
> > - * Notice:
> > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > - */
> >  uint16_t
> >  ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
> >  		       uint16_t nb_pkts)
> > @@ -616,8 +617,6 @@ ice_recv_pkts_vec_avx2(void *rx_queue, struct
> > rte_mbuf **rx_pkts,
> >
> >  /**
> >   * vPMD receive routine that reassembles single burst of 32 scattered
> > packets
> > - * Notice:
> > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> >   */
> 
> Why we need to remove this? is it still true for this function?
> 

The reason is that this comment is in the calling function " _ice_recv_raw_pkts_vec_avx2" which process the related thing, no need to add it more and more in the caller function. 

> >  static uint16_t
> >  ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf
> > **rx_pkts, @@ -626,6 +625,9 @@
> ice_recv_scattered_burst_vec_avx2(void
> > *rx_queue, struct rte_mbuf **rx_pkts,
> >  	struct ice_rx_queue *rxq = rx_queue;
> >  	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
> >
> > +	/* split_flags only can support max of ICE_VPMD_RX_BURST */
> > +	nb_pkts = RTE_MIN(nb_pkts, ICE_VPMD_RX_BURST);
> 
> Is this necessary?  the only consumer of this function is
> ice_recv_scattered_pkts_vec_avx2, I think nb_pkts <=
> ICE_VPMD_RX_BURST it already be guaranteed.

The reason is that we remove "nb_pkts <= ICE_VPMD_RX_BURST" and in this function split_flags have a limit for ICE_VPMD_RX_BURST, so a checking is need in the function.

> > +
> >  	/* get some new buffers */
> >  	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx2(rxq, rx_pkts,
> nb_pkts,
> >  						       split_flags);
> > @@ -657,9 +659,6 @@ ice_recv_scattered_burst_vec_avx2(void
> *rx_queue,
> > struct rte_mbuf **rx_pkts,
> >
> >  /**
> >   * vPMD receive routine that reassembles scattered packets.
> > - * Main receive routine that can handle arbitrary burst sizes
> > - * Notice:
> > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> >   */
> 
> Why we need to remove this? isn't it the main routine that be able to handle
> arbitrary burst size?
> 

The question is why we need to said the arbitrary sizes if we process and return what we could receive packet for maximum? It is not only useless comment but also maybe bring some confuse I think. 

> Btw, I will suggest all AVX2 changes can be in a separate patch, because this
> looks like some code clean and fix.
> its not related with the main purpose of the patch set.

I consider it and ask any objection before, so totally I am not disagree on separate it, but I think if  the purpose of the patch set is to clean some misleading for vec(sse/avx) burst, it could still be on a set even separate it to patch.
  
Qi Zhang Sept. 18, 2020, 3:41 a.m. UTC | #3
> -----Original Message-----
> From: Guo, Jia <jia.guo@intel.com>
> Sent: Friday, September 18, 2020 11:20 AM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>; Yang, Qiming
> <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Wu, Jingjing
> <jingjing.wu@intel.com>; Wang, Haiyue <haiyue.wang@intel.com>
> Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> <ferruh.yigit@intel.com>; stephen@networkplumber.org; barbette@kth.se;
> Han, YingyaX <yingyax.han@intel.com>
> Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> 
> Hi, qi
> 
> > -----Original Message-----
> > From: Zhang, Qi Z <qi.z.zhang@intel.com>
> > Sent: Thursday, September 17, 2020 7:03 PM
> > To: Guo, Jia <jia.guo@intel.com>; Yang, Qiming
> > <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Wu,
> > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue <haiyue.wang@intel.com>
> > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> > <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> > <ferruh.yigit@intel.com>; stephen@networkplumber.org; barbette@kth.se;
> > Han, YingyaX <yingyax.han@intel.com>
> > Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> >
> >
> >
> > > -----Original Message-----
> > > From: Guo, Jia <jia.guo@intel.com>
> > > Sent: Thursday, September 17, 2020 3:59 PM
> > > To: Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei
> > > <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Wu,
> > > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue
> > > <haiyue.wang@intel.com>
> > > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > > <bruce.richardson@intel.com>; dev@dpdk.org; Guo, Jia
> > > <jia.guo@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> > > mb@smartsharesystems.com; Yigit, Ferruh <ferruh.yigit@intel.com>;
> > > stephen@networkplumber.org; barbette@kth.se; Han, YingyaX
> > > <yingyax.han@intel.com>
> > > Subject: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> > >
> > > The limitation of burst size in vector rx was removed, since it
> > > should retrieve as much received packets as possible. And also the
> > > scattered receive path should use a wrapper function to achieve the
> > > goal of burst maximizing. And do some code cleaning for vector rx path.
> > >
> > > Bugzilla ID: 516
> > > Fixes: c68a52b8b38c ("net/ice: support vector SSE in Rx")
> > > Fixes: ae60d3c9b227 ("net/ice: support Rx AVX2 vector")
> > >
> > > Signed-off-by: Jeff Guo <jia.guo@intel.com>
> > > Tested-by: Yingya Han <yingyax.han@intel.com>
> > > ---
> > >  drivers/net/ice/ice_rxtx.h          |  1 +
> > >  drivers/net/ice/ice_rxtx_vec_avx2.c | 23 ++++++------
> > > drivers/net/ice/ice_rxtx_vec_sse.c  | 56
> > > +++++++++++++++++++----------
> > >  3 files changed, 49 insertions(+), 31 deletions(-)
> > >
> > > diff --git a/drivers/net/ice/ice_rxtx.h b/drivers/net/ice/ice_rxtx.h
> > > index 2fdcfb7d0..3ef5f300d 100644
> > > --- a/drivers/net/ice/ice_rxtx.h
> > > +++ b/drivers/net/ice/ice_rxtx.h
> > > @@ -35,6 +35,7 @@
> > >  #define ICE_MAX_RX_BURST            ICE_RXQ_REARM_THRESH
> > >  #define ICE_TX_MAX_FREE_BUF_SZ      64
> > >  #define ICE_DESCS_PER_LOOP          4
> > > +#define ICE_DESCS_PER_LOOP_AVX	    8
> >
> > No need to expose this if no external link, better to keep all avx
> > stuff inside avx.c
> >
> 
> Ok, so define it in avx.c is the best choice if avx should not in rxtx.h.
> 
> > >
> > >  #define ICE_FDIR_PKT_LEN	512
> > >
> > > diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > b/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > index be50677c2..843e4f32a 100644
> > > --- a/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > +++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > @@ -29,7 +29,7 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
> > >  			__m128i dma_addr0;
> > >
> > >  			dma_addr0 = _mm_setzero_si128();
> > > -			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
> > > +			for (i = 0; i < ICE_DESCS_PER_LOOP_AVX; i++) {
> > >  				rxep[i].mbuf = &rxq->fake_mbuf;
> > >  				_mm_store_si128((__m128i *)&rxdp[i].read,
> > >  						dma_addr0);
> > > @@ -132,12 +132,17 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
> > >  	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);  }
> > >
> > > +/**
> > > + * vPMD raw receive routine, only accept(nb_pkts >=
> > > +ICE_DESCS_PER_LOOP_AVX)
> > > + *
> > > + * Notice:
> > > + * - nb_pkts < ICE_DESCS_PER_LOOP_AVX, just return no packet
> > > + * - floor align nb_pkts to a ICE_DESCS_PER_LOOP_AVX power-of-two
> > > +*/
> >
> > The comment is misleading, it looks like we are going to floor align
> > nb_pkts to 2^8, better to reword .
> >
> 
> It should be, agree.
> 
> > >  static inline uint16_t
> > >  _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct
> > > rte_mbuf **rx_pkts,
> > >  			    uint16_t nb_pkts, uint8_t *split_packet)  { -#define
> > > ICE_DESCS_PER_LOOP_AVX 8
> > > -
> > >  	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
> > >  	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
> > >  			0, rxq->mbuf_initializer);
> > > @@ -603,10 +608,6 @@ _ice_recv_raw_pkts_vec_avx2(struct
> > ice_rx_queue
> > > *rxq, struct rte_mbuf **rx_pkts,
> > >  	return received;
> > >  }
> > >
> > > -/*
> > > - * Notice:
> > > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > > - */
> > >  uint16_t
> > >  ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
> > >  		       uint16_t nb_pkts)
> > > @@ -616,8 +617,6 @@ ice_recv_pkts_vec_avx2(void *rx_queue, struct
> > > rte_mbuf **rx_pkts,
> > >
> > >  /**
> > >   * vPMD receive routine that reassembles single burst of 32
> > > scattered packets
> > > - * Notice:
> > > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > >   */
> >
> > Why we need to remove this? is it still true for this function?
> >
> 
> The reason is that this comment is in the calling function "
> _ice_recv_raw_pkts_vec_avx2" which process the related thing, no need to
> add it more and more in the caller function.

I think you remove related comment from the calling function also :)

Also I think better to keep this even it's a little bit duplicate, that help people to understand the internal logic

> 
> > >  static uint16_t
> > >  ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf
> > > **rx_pkts, @@ -626,6 +625,9 @@
> > ice_recv_scattered_burst_vec_avx2(void
> > > *rx_queue, struct rte_mbuf **rx_pkts,
> > >  	struct ice_rx_queue *rxq = rx_queue;
> > >  	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
> > >
> > > +	/* split_flags only can support max of ICE_VPMD_RX_BURST */
> > > +	nb_pkts = RTE_MIN(nb_pkts, ICE_VPMD_RX_BURST);
> >
> > Is this necessary?  the only consumer of this function is
> > ice_recv_scattered_pkts_vec_avx2, I think nb_pkts <= ICE_VPMD_RX_BURST
> > it already be guaranteed.
> 
> The reason is that we remove "nb_pkts <= ICE_VPMD_RX_BURST" and in this
> function split_flags have a limit for ICE_VPMD_RX_BURST, so a checking is
> need in the function.

Can't get this, could tell me is there any case that nb_pkts > ICE_VPMD_RX_BURST?


> 
> > > +
> > >  	/* get some new buffers */
> > >  	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx2(rxq, rx_pkts,
> > nb_pkts,
> > >  						       split_flags);
> > > @@ -657,9 +659,6 @@ ice_recv_scattered_burst_vec_avx2(void
> > *rx_queue,
> > > struct rte_mbuf **rx_pkts,
> > >
> > >  /**
> > >   * vPMD receive routine that reassembles scattered packets.
> > > - * Main receive routine that can handle arbitrary burst sizes
> > > - * Notice:
> > > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > >   */
> >
> > Why we need to remove this? isn't it the main routine that be able to
> > handle arbitrary burst size?
> >
> 
> The question is why we need to said the arbitrary sizes if we process and return
> what we could receive packet for maximum? It is not only useless comment but
> also maybe bring some confuse I think.

Yes arbitrary size description can be removed, as this is assumed to be the default behavior.  
But the description for nb_pkts should still be kept.

> 
> > Btw, I will suggest all AVX2 changes can be in a separate patch,
> > because this looks like some code clean and fix.
> > its not related with the main purpose of the patch set.
> 
> I consider it and ask any objection before, so totally I am not disagree on
> separate it, but I think if  the purpose of the patch set is to clean some
> misleading for vec(sse/avx) burst, it could still be on a set even separate it to
> patch.

I will not be insist on patch separate, but if you separate them, some of fixes can be merged early and no need to wait for those part need more review.
  
Guo, Jia Sept. 18, 2020, 4:41 a.m. UTC | #4
> -----Original Message-----
> From: Zhang, Qi Z <qi.z.zhang@intel.com>
> Sent: Friday, September 18, 2020 11:41 AM
> To: Guo, Jia <jia.guo@intel.com>; Yang, Qiming <qiming.yang@intel.com>;
> Xing, Beilei <beilei.xing@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>;
> Wang, Haiyue <haiyue.wang@intel.com>
> Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> <ferruh.yigit@intel.com>; stephen@networkplumber.org; barbette@kth.se;
> Han, YingyaX <yingyax.han@intel.com>
> Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> 
> 
> 
> > -----Original Message-----
> > From: Guo, Jia <jia.guo@intel.com>
> > Sent: Friday, September 18, 2020 11:20 AM
> > To: Zhang, Qi Z <qi.z.zhang@intel.com>; Yang, Qiming
> > <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Wu,
> > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue <haiyue.wang@intel.com>
> > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> > <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> > <ferruh.yigit@intel.com>; stephen@networkplumber.org;
> barbette@kth.se;
> > Han, YingyaX <yingyax.han@intel.com>
> > Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> >
> > Hi, qi
> >
> > > -----Original Message-----
> > > From: Zhang, Qi Z <qi.z.zhang@intel.com>
> > > Sent: Thursday, September 17, 2020 7:03 PM
> > > To: Guo, Jia <jia.guo@intel.com>; Yang, Qiming
> > > <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Wu,
> > > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue
> > > <haiyue.wang@intel.com>
> > > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > > <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> > > <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> > > <ferruh.yigit@intel.com>; stephen@networkplumber.org;
> > > barbette@kth.se; Han, YingyaX <yingyax.han@intel.com>
> > > Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: Guo, Jia <jia.guo@intel.com>
> > > > Sent: Thursday, September 17, 2020 3:59 PM
> > > > To: Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei
> > > > <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Wu,
> > > > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue
> > > > <haiyue.wang@intel.com>
> > > > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > > > <bruce.richardson@intel.com>; dev@dpdk.org; Guo, Jia
> > > > <jia.guo@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> > > > mb@smartsharesystems.com; Yigit, Ferruh <ferruh.yigit@intel.com>;
> > > > stephen@networkplumber.org; barbette@kth.se; Han, YingyaX
> > > > <yingyax.han@intel.com>
> > > > Subject: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> > > >
> > > > The limitation of burst size in vector rx was removed, since it
> > > > should retrieve as much received packets as possible. And also the
> > > > scattered receive path should use a wrapper function to achieve
> > > > the goal of burst maximizing. And do some code cleaning for vector rx
> path.
> > > >
> > > > Bugzilla ID: 516
> > > > Fixes: c68a52b8b38c ("net/ice: support vector SSE in Rx")
> > > > Fixes: ae60d3c9b227 ("net/ice: support Rx AVX2 vector")
> > > >
> > > > Signed-off-by: Jeff Guo <jia.guo@intel.com>
> > > > Tested-by: Yingya Han <yingyax.han@intel.com>
> > > > ---
> > > >  drivers/net/ice/ice_rxtx.h          |  1 +
> > > >  drivers/net/ice/ice_rxtx_vec_avx2.c | 23 ++++++------
> > > > drivers/net/ice/ice_rxtx_vec_sse.c  | 56
> > > > +++++++++++++++++++----------
> > > >  3 files changed, 49 insertions(+), 31 deletions(-)
> > > >
> > > > diff --git a/drivers/net/ice/ice_rxtx.h
> > > > b/drivers/net/ice/ice_rxtx.h index 2fdcfb7d0..3ef5f300d 100644
> > > > --- a/drivers/net/ice/ice_rxtx.h
> > > > +++ b/drivers/net/ice/ice_rxtx.h
> > > > @@ -35,6 +35,7 @@
> > > >  #define ICE_MAX_RX_BURST            ICE_RXQ_REARM_THRESH
> > > >  #define ICE_TX_MAX_FREE_BUF_SZ      64
> > > >  #define ICE_DESCS_PER_LOOP          4
> > > > +#define ICE_DESCS_PER_LOOP_AVX	    8
> > >
> > > No need to expose this if no external link, better to keep all avx
> > > stuff inside avx.c
> > >
> >
> > Ok, so define it in avx.c is the best choice if avx should not in rxtx.h.
> >
> > > >
> > > >  #define ICE_FDIR_PKT_LEN	512
> > > >
> > > > diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > > b/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > > index be50677c2..843e4f32a 100644
> > > > --- a/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > > +++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > > @@ -29,7 +29,7 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
> > > >  			__m128i dma_addr0;
> > > >
> > > >  			dma_addr0 = _mm_setzero_si128();
> > > > -			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
> > > > +			for (i = 0; i < ICE_DESCS_PER_LOOP_AVX; i++) {
> > > >  				rxep[i].mbuf = &rxq->fake_mbuf;
> > > >  				_mm_store_si128((__m128i *)&rxdp[i].read,
> > > >  						dma_addr0);
> > > > @@ -132,12 +132,17 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
> > > >  	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);  }
> > > >
> > > > +/**
> > > > + * vPMD raw receive routine, only accept(nb_pkts >=
> > > > +ICE_DESCS_PER_LOOP_AVX)
> > > > + *
> > > > + * Notice:
> > > > + * - nb_pkts < ICE_DESCS_PER_LOOP_AVX, just return no packet
> > > > + * - floor align nb_pkts to a ICE_DESCS_PER_LOOP_AVX power-of-two
> > > > +*/
> > >
> > > The comment is misleading, it looks like we are going to floor align
> > > nb_pkts to 2^8, better to reword .
> > >
> >
> > It should be, agree.
> >
> > > >  static inline uint16_t
> > > >  _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct
> > > > rte_mbuf **rx_pkts,
> > > >  			    uint16_t nb_pkts, uint8_t *split_packet)  { -#define
> > > > ICE_DESCS_PER_LOOP_AVX 8
> > > > -
> > > >  	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
> > > >  	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
> > > >  			0, rxq->mbuf_initializer);
> > > > @@ -603,10 +608,6 @@ _ice_recv_raw_pkts_vec_avx2(struct
> > > ice_rx_queue
> > > > *rxq, struct rte_mbuf **rx_pkts,
> > > >  	return received;
> > > >  }
> > > >
> > > > -/*
> > > > - * Notice:
> > > > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > > > - */
> > > >  uint16_t
> > > >  ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
> > > >  		       uint16_t nb_pkts)
> > > > @@ -616,8 +617,6 @@ ice_recv_pkts_vec_avx2(void *rx_queue, struct
> > > > rte_mbuf **rx_pkts,
> > > >
> > > >  /**
> > > >   * vPMD receive routine that reassembles single burst of 32
> > > > scattered packets
> > > > - * Notice:
> > > > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > > >   */
> > >
> > > Why we need to remove this? is it still true for this function?
> > >
> >
> > The reason is that this comment is in the calling function "
> > _ice_recv_raw_pkts_vec_avx2" which process the related thing, no need
> > to add it more and more in the caller function.
> 
> I think you remove related comment from the calling function also :)
> 
> Also I think better to keep this even it's a little bit duplicate, that help people
> to understand the internal logic
> 
> >
> > > >  static uint16_t
> > > >  ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf
> > > > **rx_pkts, @@ -626,6 +625,9 @@
> > > ice_recv_scattered_burst_vec_avx2(void
> > > > *rx_queue, struct rte_mbuf **rx_pkts,
> > > >  	struct ice_rx_queue *rxq = rx_queue;
> > > >  	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
> > > >
> > > > +	/* split_flags only can support max of ICE_VPMD_RX_BURST */
> > > > +	nb_pkts = RTE_MIN(nb_pkts, ICE_VPMD_RX_BURST);
> > >
> > > Is this necessary?  the only consumer of this function is
> > > ice_recv_scattered_pkts_vec_avx2, I think nb_pkts <=
> > > ICE_VPMD_RX_BURST it already be guaranteed.
> >
> > The reason is that we remove "nb_pkts <= ICE_VPMD_RX_BURST" and in
> > this function split_flags have a limit for ICE_VPMD_RX_BURST, so a
> > checking is need in the function.
> 
> Can't get this, could tell me is there any case that nb_pkts >
> ICE_VPMD_RX_BURST?
> 

I know we just set the hard value here and only one case usage, but I think only the caller know what would be the input param, but the calling should not know the input param will be, even there is no any caller but the calling still need to be complete.  

> 
> >
> > > > +
> > > >  	/* get some new buffers */
> > > >  	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx2(rxq, rx_pkts,
> > > nb_pkts,
> > > >  						       split_flags);
> > > > @@ -657,9 +659,6 @@ ice_recv_scattered_burst_vec_avx2(void
> > > *rx_queue,
> > > > struct rte_mbuf **rx_pkts,
> > > >
> > > >  /**
> > > >   * vPMD receive routine that reassembles scattered packets.
> > > > - * Main receive routine that can handle arbitrary burst sizes
> > > > - * Notice:
> > > > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > > >   */
> > >
> > > Why we need to remove this? isn't it the main routine that be able
> > > to handle arbitrary burst size?
> > >
> >
> > The question is why we need to said the arbitrary sizes if we process
> > and return what we could receive packet for maximum? It is not only
> > useless comment but also maybe bring some confuse I think.
> 
> Yes arbitrary size description can be removed, as this is assumed to be the
> default behavior.
> But the description for nb_pkts should still be kept.
> 
> >
> > > Btw, I will suggest all AVX2 changes can be in a separate patch,
> > > because this looks like some code clean and fix.
> > > its not related with the main purpose of the patch set.
> >
> > I consider it and ask any objection before, so totally I am not
> > disagree on separate it, but I think if  the purpose of the patch set
> > is to clean some misleading for vec(sse/avx) burst, it could still be
> > on a set even separate it to patch.
> 
> I will not be insist on patch separate, but if you separate them, some of fixes
> can be merged early and no need to wait for those part need more review.

Ok, seems that there still something discuss on the code cleaning patch, let me separate it for better review.
  
Qi Zhang Sept. 18, 2020, 5:39 a.m. UTC | #5
> -----Original Message-----
> From: Guo, Jia <jia.guo@intel.com>
> Sent: Friday, September 18, 2020 12:41 PM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>; Yang, Qiming
> <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Wu, Jingjing
> <jingjing.wu@intel.com>; Wang, Haiyue <haiyue.wang@intel.com>
> Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> <ferruh.yigit@intel.com>; stephen@networkplumber.org; barbette@kth.se;
> Han, YingyaX <yingyax.han@intel.com>
> Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> 
> 
> > -----Original Message-----
> > From: Zhang, Qi Z <qi.z.zhang@intel.com>
> > Sent: Friday, September 18, 2020 11:41 AM
> > To: Guo, Jia <jia.guo@intel.com>; Yang, Qiming
> > <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Wu,
> > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue <haiyue.wang@intel.com>
> > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> > <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> > <ferruh.yigit@intel.com>; stephen@networkplumber.org; barbette@kth.se;
> > Han, YingyaX <yingyax.han@intel.com>
> > Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> >
> >
> >
> > > -----Original Message-----
> > > From: Guo, Jia <jia.guo@intel.com>
> > > Sent: Friday, September 18, 2020 11:20 AM
> > > To: Zhang, Qi Z <qi.z.zhang@intel.com>; Yang, Qiming
> > > <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Wu,
> > > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue
> > > <haiyue.wang@intel.com>
> > > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > > <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> > > <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> > > <ferruh.yigit@intel.com>; stephen@networkplumber.org;
> > barbette@kth.se;
> > > Han, YingyaX <yingyax.han@intel.com>
> > > Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> > >
> > > Hi, qi
> > >
> > > > -----Original Message-----
> > > > From: Zhang, Qi Z <qi.z.zhang@intel.com>
> > > > Sent: Thursday, September 17, 2020 7:03 PM
> > > > To: Guo, Jia <jia.guo@intel.com>; Yang, Qiming
> > > > <qiming.yang@intel.com>; Xing, Beilei <beilei.xing@intel.com>; Wu,
> > > > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue
> > > > <haiyue.wang@intel.com>
> > > > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > > > <bruce.richardson@intel.com>; dev@dpdk.org; Zhang, Helin
> > > > <helin.zhang@intel.com>; mb@smartsharesystems.com; Yigit, Ferruh
> > > > <ferruh.yigit@intel.com>; stephen@networkplumber.org;
> > > > barbette@kth.se; Han, YingyaX <yingyax.han@intel.com>
> > > > Subject: RE: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> > > >
> > > >
> > > >
> > > > > -----Original Message-----
> > > > > From: Guo, Jia <jia.guo@intel.com>
> > > > > Sent: Thursday, September 17, 2020 3:59 PM
> > > > > To: Yang, Qiming <qiming.yang@intel.com>; Xing, Beilei
> > > > > <beilei.xing@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>; Wu,
> > > > > Jingjing <jingjing.wu@intel.com>; Wang, Haiyue
> > > > > <haiyue.wang@intel.com>
> > > > > Cc: Zhao1, Wei <wei.zhao1@intel.com>; Richardson, Bruce
> > > > > <bruce.richardson@intel.com>; dev@dpdk.org; Guo, Jia
> > > > > <jia.guo@intel.com>; Zhang, Helin <helin.zhang@intel.com>;
> > > > > mb@smartsharesystems.com; Yigit, Ferruh
> > > > > <ferruh.yigit@intel.com>; stephen@networkplumber.org;
> > > > > barbette@kth.se; Han, YingyaX <yingyax.han@intel.com>
> > > > > Subject: [PATCH v4 4/5] net/ice: fix vector rx burst for ice
> > > > >
> > > > > The limitation of burst size in vector rx was removed, since it
> > > > > should retrieve as much received packets as possible. And also
> > > > > the scattered receive path should use a wrapper function to
> > > > > achieve the goal of burst maximizing. And do some code cleaning
> > > > > for vector rx
> > path.
> > > > >
> > > > > Bugzilla ID: 516
> > > > > Fixes: c68a52b8b38c ("net/ice: support vector SSE in Rx")
> > > > > Fixes: ae60d3c9b227 ("net/ice: support Rx AVX2 vector")
> > > > >
> > > > > Signed-off-by: Jeff Guo <jia.guo@intel.com>
> > > > > Tested-by: Yingya Han <yingyax.han@intel.com>
> > > > > ---
> > > > >  drivers/net/ice/ice_rxtx.h          |  1 +
> > > > >  drivers/net/ice/ice_rxtx_vec_avx2.c | 23 ++++++------
> > > > > drivers/net/ice/ice_rxtx_vec_sse.c  | 56
> > > > > +++++++++++++++++++----------
> > > > >  3 files changed, 49 insertions(+), 31 deletions(-)
> > > > >
> > > > > diff --git a/drivers/net/ice/ice_rxtx.h
> > > > > b/drivers/net/ice/ice_rxtx.h index 2fdcfb7d0..3ef5f300d 100644
> > > > > --- a/drivers/net/ice/ice_rxtx.h
> > > > > +++ b/drivers/net/ice/ice_rxtx.h
> > > > > @@ -35,6 +35,7 @@
> > > > >  #define ICE_MAX_RX_BURST
> ICE_RXQ_REARM_THRESH
> > > > >  #define ICE_TX_MAX_FREE_BUF_SZ      64
> > > > >  #define ICE_DESCS_PER_LOOP          4
> > > > > +#define ICE_DESCS_PER_LOOP_AVX	    8
> > > >
> > > > No need to expose this if no external link, better to keep all avx
> > > > stuff inside avx.c
> > > >
> > >
> > > Ok, so define it in avx.c is the best choice if avx should not in rxtx.h.
> > >
> > > > >
> > > > >  #define ICE_FDIR_PKT_LEN	512
> > > > >
> > > > > diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > > > b/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > > > index be50677c2..843e4f32a 100644
> > > > > --- a/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > > > +++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
> > > > > @@ -29,7 +29,7 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
> > > > >  			__m128i dma_addr0;
> > > > >
> > > > >  			dma_addr0 = _mm_setzero_si128();
> > > > > -			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
> > > > > +			for (i = 0; i < ICE_DESCS_PER_LOOP_AVX; i++) {
> > > > >  				rxep[i].mbuf = &rxq->fake_mbuf;
> > > > >  				_mm_store_si128((__m128i *)&rxdp[i].read,
> > > > >  						dma_addr0);
> > > > > @@ -132,12 +132,17 @@ ice_rxq_rearm(struct ice_rx_queue *rxq)
> > > > >  	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);  }
> > > > >
> > > > > +/**
> > > > > + * vPMD raw receive routine, only accept(nb_pkts >=
> > > > > +ICE_DESCS_PER_LOOP_AVX)
> > > > > + *
> > > > > + * Notice:
> > > > > + * - nb_pkts < ICE_DESCS_PER_LOOP_AVX, just return no packet
> > > > > + * - floor align nb_pkts to a ICE_DESCS_PER_LOOP_AVX
> > > > > +power-of-two */
> > > >
> > > > The comment is misleading, it looks like we are going to floor
> > > > align nb_pkts to 2^8, better to reword .
> > > >
> > >
> > > It should be, agree.
> > >
> > > > >  static inline uint16_t
> > > > >  _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct
> > > > > rte_mbuf **rx_pkts,
> > > > >  			    uint16_t nb_pkts, uint8_t *split_packet)  { -#define
> > > > > ICE_DESCS_PER_LOOP_AVX 8
> > > > > -
> > > > >  	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
> > > > >  	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
> > > > >  			0, rxq->mbuf_initializer);
> > > > > @@ -603,10 +608,6 @@ _ice_recv_raw_pkts_vec_avx2(struct
> > > > ice_rx_queue
> > > > > *rxq, struct rte_mbuf **rx_pkts,
> > > > >  	return received;
> > > > >  }
> > > > >
> > > > > -/*
> > > > > - * Notice:
> > > > > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > > > > - */
> > > > >  uint16_t
> > > > >  ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
> > > > >  		       uint16_t nb_pkts)
> > > > > @@ -616,8 +617,6 @@ ice_recv_pkts_vec_avx2(void *rx_queue,
> > > > > struct rte_mbuf **rx_pkts,
> > > > >
> > > > >  /**
> > > > >   * vPMD receive routine that reassembles single burst of 32
> > > > > scattered packets
> > > > > - * Notice:
> > > > > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > > > >   */
> > > >
> > > > Why we need to remove this? is it still true for this function?
> > > >
> > >
> > > The reason is that this comment is in the calling function "
> > > _ice_recv_raw_pkts_vec_avx2" which process the related thing, no
> > > need to add it more and more in the caller function.
> >
> > I think you remove related comment from the calling function also :)
> >
> > Also I think better to keep this even it's a little bit duplicate,
> > that help people to understand the internal logic
> >
> > >
> > > > >  static uint16_t
> > > > >  ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct
> > > > > rte_mbuf **rx_pkts, @@ -626,6 +625,9 @@
> > > > ice_recv_scattered_burst_vec_avx2(void
> > > > > *rx_queue, struct rte_mbuf **rx_pkts,
> > > > >  	struct ice_rx_queue *rxq = rx_queue;
> > > > >  	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
> > > > >
> > > > > +	/* split_flags only can support max of ICE_VPMD_RX_BURST */
> > > > > +	nb_pkts = RTE_MIN(nb_pkts, ICE_VPMD_RX_BURST);
> > > >
> > > > Is this necessary?  the only consumer of this function is
> > > > ice_recv_scattered_pkts_vec_avx2, I think nb_pkts <=
> > > > ICE_VPMD_RX_BURST it already be guaranteed.
> > >
> > > The reason is that we remove "nb_pkts <= ICE_VPMD_RX_BURST" and in
> > > this function split_flags have a limit for ICE_VPMD_RX_BURST, so a
> > > checking is need in the function.
> >
> > Can't get this, could tell me is there any case that nb_pkts >
> > ICE_VPMD_RX_BURST?
> >
> 
> I know we just set the hard value here and only one case usage, but I think only
> the caller know what would be the input param, but the calling should not know
> the input param will be, even there is no any caller but the calling still need to
> be complete.

It's in data path where performance is sensitive and also this is just an internal function, we know all the detail, so skip unnecessary route is reasonable, 
to avoid bugs and give necessary warning for future scale, I think RTE_ASSERT is the right way.
> 
> >
> > >
> > > > > +
> > > > >  	/* get some new buffers */
> > > > >  	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx2(rxq, rx_pkts,
> > > > nb_pkts,
> > > > >  						       split_flags);
> > > > > @@ -657,9 +659,6 @@ ice_recv_scattered_burst_vec_avx2(void
> > > > *rx_queue,
> > > > > struct rte_mbuf **rx_pkts,
> > > > >
> > > > >  /**
> > > > >   * vPMD receive routine that reassembles scattered packets.
> > > > > - * Main receive routine that can handle arbitrary burst sizes
> > > > > - * Notice:
> > > > > - * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > > > >   */
> > > >
> > > > Why we need to remove this? isn't it the main routine that be able
> > > > to handle arbitrary burst size?
> > > >
> > >
> > > The question is why we need to said the arbitrary sizes if we
> > > process and return what we could receive packet for maximum? It is
> > > not only useless comment but also maybe bring some confuse I think.
> >
> > Yes arbitrary size description can be removed, as this is assumed to
> > be the default behavior.
> > But the description for nb_pkts should still be kept.
> >
> > >
> > > > Btw, I will suggest all AVX2 changes can be in a separate patch,
> > > > because this looks like some code clean and fix.
> > > > its not related with the main purpose of the patch set.
> > >
> > > I consider it and ask any objection before, so totally I am not
> > > disagree on separate it, but I think if  the purpose of the patch
> > > set is to clean some misleading for vec(sse/avx) burst, it could
> > > still be on a set even separate it to patch.
> >
> > I will not be insist on patch separate, but if you separate them, some
> > of fixes can be merged early and no need to wait for those part need more
> review.
> 
> Ok, seems that there still something discuss on the code cleaning patch, let me
> separate it for better review.
  

Patch

diff --git a/drivers/net/ice/ice_rxtx.h b/drivers/net/ice/ice_rxtx.h
index 2fdcfb7d0..3ef5f300d 100644
--- a/drivers/net/ice/ice_rxtx.h
+++ b/drivers/net/ice/ice_rxtx.h
@@ -35,6 +35,7 @@ 
 #define ICE_MAX_RX_BURST            ICE_RXQ_REARM_THRESH
 #define ICE_TX_MAX_FREE_BUF_SZ      64
 #define ICE_DESCS_PER_LOOP          4
+#define ICE_DESCS_PER_LOOP_AVX	    8
 
 #define ICE_FDIR_PKT_LEN	512
 
diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c b/drivers/net/ice/ice_rxtx_vec_avx2.c
index be50677c2..843e4f32a 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx2.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
@@ -29,7 +29,7 @@  ice_rxq_rearm(struct ice_rx_queue *rxq)
 			__m128i dma_addr0;
 
 			dma_addr0 = _mm_setzero_si128();
-			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
+			for (i = 0; i < ICE_DESCS_PER_LOOP_AVX; i++) {
 				rxep[i].mbuf = &rxq->fake_mbuf;
 				_mm_store_si128((__m128i *)&rxdp[i].read,
 						dma_addr0);
@@ -132,12 +132,17 @@  ice_rxq_rearm(struct ice_rx_queue *rxq)
 	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
 }
 
+/**
+ * vPMD raw receive routine, only accept(nb_pkts >= ICE_DESCS_PER_LOOP_AVX)
+ *
+ * Notice:
+ * - nb_pkts < ICE_DESCS_PER_LOOP_AVX, just return no packet
+ * - floor align nb_pkts to a ICE_DESCS_PER_LOOP_AVX power-of-two
+ */
 static inline uint16_t
 _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 			    uint16_t nb_pkts, uint8_t *split_packet)
 {
-#define ICE_DESCS_PER_LOOP_AVX 8
-
 	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
 			0, rxq->mbuf_initializer);
@@ -603,10 +608,6 @@  _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return received;
 }
 
-/**
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- */
 uint16_t
 ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 		       uint16_t nb_pkts)
@@ -616,8 +617,6 @@  ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 /**
  * vPMD receive routine that reassembles single burst of 32 scattered packets
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
  */
 static uint16_t
 ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
@@ -626,6 +625,9 @@  ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 	struct ice_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
 
+	/* split_flags only can support max of ICE_VPMD_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, ICE_VPMD_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx2(rxq, rx_pkts, nb_pkts,
 						       split_flags);
@@ -657,9 +659,6 @@  ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 
 /**
  * vPMD receive routine that reassembles scattered packets.
- * Main receive routine that can handle arbitrary burst sizes
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
  */
 uint16_t
 ice_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
diff --git a/drivers/net/ice/ice_rxtx_vec_sse.c b/drivers/net/ice/ice_rxtx_vec_sse.c
index 382ef31f3..c03e24092 100644
--- a/drivers/net/ice/ice_rxtx_vec_sse.c
+++ b/drivers/net/ice/ice_rxtx_vec_sse.c
@@ -205,10 +205,11 @@  ice_rx_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
 }
 
 /**
+ * vPMD raw receive routine, only accept(nb_pkts >= ICE_DESCS_PER_LOOP)
+ *
  * Notice:
  * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- *   numbers of DD bits
+ * - floor align nb_pkts to a ICE_DESCS_PER_LOOP power-of-two
  */
 static inline uint16_t
 _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
@@ -264,9 +265,6 @@  _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
 						 0x0000000200000002LL);
 
-	/* nb_pkts shall be less equal than ICE_MAX_RX_BURST */
-	nb_pkts = RTE_MIN(nb_pkts, ICE_MAX_RX_BURST);
-
 	/* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP */
 	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP);
 
@@ -441,12 +439,6 @@  _ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
 	return nb_pkts_recd;
 }
 
-/**
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- *   numbers of DD bits
- */
 uint16_t
 ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 		  uint16_t nb_pkts)
@@ -454,19 +446,19 @@  ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 	return _ice_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
 }
 
-/* vPMD receive routine that reassembles scattered packets
- * Notice:
- * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
- * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
- *   numbers of DD bits
+/**
+ * vPMD receive routine that reassembles single burst of 32 scattered packets
  */
-uint16_t
-ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
-			    uint16_t nb_pkts)
+static uint16_t
+ice_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			     uint16_t nb_pkts)
 {
 	struct ice_rx_queue *rxq = rx_queue;
 	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
 
+	/* split_flags only can support max of ICE_VPMD_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, ICE_VPMD_RX_BURST);
+
 	/* get some new buffers */
 	uint16_t nb_bufs = _ice_recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
 						  split_flags);
@@ -496,6 +488,32 @@  ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 					     &split_flags[i]);
 }
 
+/**
+ * vPMD receive routine that reassembles scattered packets.
+ */
+uint16_t
+ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			    uint16_t nb_pkts)
+{
+	uint16_t retval = 0;
+
+	while (nb_pkts > ICE_VPMD_RX_BURST) {
+		uint16_t burst;
+
+		burst = ice_recv_scattered_burst_vec(rx_queue,
+						     rx_pkts + retval,
+						     ICE_VPMD_RX_BURST);
+		retval += burst;
+		nb_pkts -= burst;
+		if (burst < ICE_VPMD_RX_BURST)
+			return retval;
+	}
+
+	return retval + ice_recv_scattered_burst_vec(rx_queue,
+						     rx_pkts + retval,
+						     nb_pkts);
+}
+
 static inline void
 ice_vtx1(volatile struct ice_tx_desc *txdp, struct rte_mbuf *pkt,
 	 uint64_t flags)