[v3] net/mlx5: relaxed ordering for multi-packet RQ buffer refcnt

Message ID 1592900807-13289-1-git-send-email-phil.yang@arm.com (mailing list archive)
State Superseded, archived
Delegated to: Raslan Darawsheh
Series: [v3] net/mlx5: relaxed ordering for multi-packet RQ buffer refcnt

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/travis-robot success Travis build: passed
ci/iol-nxp-Performance success Performance Testing PASS
ci/Intel-compilation success Compilation OK
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-testing success Testing PASS

Commit Message

Phil Yang June 23, 2020, 8:26 a.m. UTC
  Use C11 atomics with explicit ordering instead of the rte_atomic ops,
which enforce unnecessary barriers on aarch64.

Signed-off-by: Phil Yang <phil.yang@arm.com>
---
v3:
Split from the patchset:
http://patchwork.dpdk.org/cover/68159/

 drivers/net/mlx5/mlx5_rxq.c  |  2 +-
 drivers/net/mlx5/mlx5_rxtx.c | 16 +++++++++-------
 drivers/net/mlx5/mlx5_rxtx.h |  2 +-
 3 files changed, 11 insertions(+), 9 deletions(-)
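
The difference between the two APIs can be sketched in isolation as follows (a minimal standalone example; refcnt and the helper names are illustrative, not part of the patch). The legacy rte_atomic16 helpers are built on __sync builtins, which imply a full barrier, while the C11 __atomic builtins take an explicit memory-order argument:

#include <stdint.h>

static uint16_t refcnt;

/* Roughly what rte_atomic16_add_return() does: a __sync builtin,
 * which implies a full barrier. */
static inline uint16_t
refcnt_inc_legacy(void)
{
	return __sync_add_and_fetch(&refcnt, 1);
}

/* The C11 builtin with explicit ordering: atomicity only, no barrier. */
static inline uint16_t
refcnt_inc_relaxed(void)
{
	return __atomic_add_fetch(&refcnt, 1, __ATOMIC_RELAXED);
}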
  

Comments

Phil Yang July 13, 2020, 3:02 a.m. UTC | #1
Hi,

We are also converting other components to C11 atomics.
Your insight would be much appreciated.

Thanks,
Phil Yang

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Phil Yang
> Sent: Tuesday, June 23, 2020 4:27 PM
> To: dev@dpdk.org
> Cc: matan@mellanox.com; shahafs@mellanox.com;
> viacheslavo@mellanox.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; drc@linux.vnet.ibm.com; nd
> <nd@arm.com>
> Subject: [dpdk-dev] [PATCH v3] net/mlx5: relaxed ordering for multi-packet
> RQ buffer refcnt
> 
> Use c11 atomics with explicit ordering instead of the rte_atomic ops
> which enforce unnecessary barriers on aarch64.
> 
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> ---
> v3:
> Split from the patchset:
> http://patchwork.dpdk.org/cover/68159/
> 
>  drivers/net/mlx5/mlx5_rxq.c  |  2 +-
>  drivers/net/mlx5/mlx5_rxtx.c | 16 +++++++++-------
>  drivers/net/mlx5/mlx5_rxtx.h |  2 +-
>  3 files changed, 11 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
> index dda0073..7f487f1 100644
> --- a/drivers/net/mlx5/mlx5_rxq.c
> +++ b/drivers/net/mlx5/mlx5_rxq.c
> @@ -1545,7 +1545,7 @@ mlx5_mprq_buf_init(struct rte_mempool *mp,
> void *opaque_arg,
> 
>  	memset(_m, 0, sizeof(*buf));
>  	buf->mp = mp;
> -	rte_atomic16_set(&buf->refcnt, 1);
> +	__atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
>  	for (j = 0; j != strd_n; ++j) {
>  		shinfo = &buf->shinfos[j];
>  		shinfo->free_cb = mlx5_mprq_buf_free_cb;
> diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
> index e4106bf..f0eda88 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -1595,10 +1595,11 @@ mlx5_mprq_buf_free_cb(void *addr
> __rte_unused, void *opaque)
>  {
>  	struct mlx5_mprq_buf *buf = opaque;
> 
> -	if (rte_atomic16_read(&buf->refcnt) == 1) {
> +	if (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) == 1) {
>  		rte_mempool_put(buf->mp, buf);
> -	} else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) {
> -		rte_atomic16_set(&buf->refcnt, 1);
> +	} else if (unlikely(__atomic_sub_fetch(&buf->refcnt, 1,
> +					       __ATOMIC_RELAXED) == 0)) {
> +		__atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
>  		rte_mempool_put(buf->mp, buf);
>  	}
>  }
> @@ -1678,7 +1679,8 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct
> rte_mbuf **pkts, uint16_t pkts_n)
> 
>  		if (consumed_strd == strd_n) {
>  			/* Replace WQE only if the buffer is still in use. */
> -			if (rte_atomic16_read(&buf->refcnt) > 1) {
> +			if (__atomic_load_n(&buf->refcnt,
> +					    __ATOMIC_RELAXED) > 1) {
>  				mprq_buf_replace(rxq, rq_ci & wq_mask,
> strd_n);
>  				/* Release the old buffer. */
>  				mlx5_mprq_buf_free(buf);
> @@ -1790,9 +1792,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct
> rte_mbuf **pkts, uint16_t pkts_n)
>  			void *buf_addr;
> 
>  			/* Increment the refcnt of the whole chunk. */
> -			rte_atomic16_add_return(&buf->refcnt, 1);
> -			MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf-
> >refcnt) <=
> -				    strd_n + 1);
> +			__atomic_add_fetch(&buf->refcnt, 1,
> __ATOMIC_ACQUIRE);
> +			MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
> +				    __ATOMIC_RELAXED) <= strd_n + 1);
>  			buf_addr = RTE_PTR_SUB(addr,
> RTE_PKTMBUF_HEADROOM);
>  			/*
>  			 * MLX5 device doesn't use iova but it is necessary in
> a
> diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> index 26621ff..0fc15f3 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.h
> +++ b/drivers/net/mlx5/mlx5_rxtx.h
> @@ -78,7 +78,7 @@ struct rxq_zip {
>  /* Multi-Packet RQ buffer header. */
>  struct mlx5_mprq_buf {
>  	struct rte_mempool *mp;
> -	rte_atomic16_t refcnt; /* Atomically accessed refcnt. */
> +	uint16_t refcnt; /* Atomically accessed refcnt. */
>  	uint8_t pad[RTE_PKTMBUF_HEADROOM]; /* Headroom for the first
> packet. */
>  	struct rte_mbuf_ext_shared_info shinfos[];
>  	/*
> --
> 2.7.4
  
Alexander Kozyrev July 20, 2020, 11:21 p.m. UTC | #2
Hi Phil Yang, we noticed that this patch gives us a 10% performance degradation on ARM.
x86 seems to be unaffected, though. Do you know what may be the reason for this behavior?

Regards,
Alex

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Phil Yang
> Sent: Sunday, July 12, 2020 23:02
> To: Matan Azrad <matan@mellanox.com>; Shahaf Shuler
> <shahafs@mellanox.com>; Slava Ovsiienko <viacheslavo@mellanox.com>
> Cc: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>;
> drc@linux.vnet.ibm.com; nd <nd@arm.com>; Phil Yang <Phil.Yang@arm.com>;
> dev@dpdk.org; nd <nd@arm.com>
> Subject: Re: [dpdk-dev] [PATCH v3] net/mlx5: relaxed ordering for multi-packet
> RQ buffer refcnt
> 
> Hi,
> 
> We are also doing C11 atomics converting for other components.
> Your insight would be much appreciated.
> 
> Thanks,
> Phil Yang
> 
> > -----Original Message-----
> > From: dev <dev-bounces@dpdk.org> On Behalf Of Phil Yang
> > Sent: Tuesday, June 23, 2020 4:27 PM
> > To: dev@dpdk.org
> > Cc: matan@mellanox.com; shahafs@mellanox.com;
> > viacheslavo@mellanox.com; Honnappa Nagarahalli
> > <Honnappa.Nagarahalli@arm.com>; drc@linux.vnet.ibm.com; nd
> > <nd@arm.com>
> > Subject: [dpdk-dev] [PATCH v3] net/mlx5: relaxed ordering for
> > multi-packet RQ buffer refcnt
> >
> > Use c11 atomics with explicit ordering instead of the rte_atomic ops
> > which enforce unnecessary barriers on aarch64.
> >
> > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > ---
> > v3:
> > Split from the patchset:
> > http://patchwork.dpdk.org/cover/68159/
> >
> >  drivers/net/mlx5/mlx5_rxq.c  |  2 +-
> >  drivers/net/mlx5/mlx5_rxtx.c | 16 +++++++++-------
> > drivers/net/mlx5/mlx5_rxtx.h |  2 +-
> >  3 files changed, 11 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
> > index dda0073..7f487f1 100644
> > --- a/drivers/net/mlx5/mlx5_rxq.c
> > +++ b/drivers/net/mlx5/mlx5_rxq.c
> > @@ -1545,7 +1545,7 @@ mlx5_mprq_buf_init(struct rte_mempool *mp, void
> > *opaque_arg,
> >
> >  	memset(_m, 0, sizeof(*buf));
> >  	buf->mp = mp;
> > -	rte_atomic16_set(&buf->refcnt, 1);
> > +	__atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
> >  	for (j = 0; j != strd_n; ++j) {
> >  		shinfo = &buf->shinfos[j];
> >  		shinfo->free_cb = mlx5_mprq_buf_free_cb; diff --git
> > a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index
> > e4106bf..f0eda88 100644
> > --- a/drivers/net/mlx5/mlx5_rxtx.c
> > +++ b/drivers/net/mlx5/mlx5_rxtx.c
> > @@ -1595,10 +1595,11 @@ mlx5_mprq_buf_free_cb(void *addr
> __rte_unused,
> > void *opaque)  {
> >  	struct mlx5_mprq_buf *buf = opaque;
> >
> > -	if (rte_atomic16_read(&buf->refcnt) == 1) {
> > +	if (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) == 1) {
> >  		rte_mempool_put(buf->mp, buf);
> > -	} else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) {
> > -		rte_atomic16_set(&buf->refcnt, 1);
> > +	} else if (unlikely(__atomic_sub_fetch(&buf->refcnt, 1,
> > +					       __ATOMIC_RELAXED) == 0)) {
> > +		__atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
> >  		rte_mempool_put(buf->mp, buf);
> >  	}
> >  }
> > @@ -1678,7 +1679,8 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct
> > rte_mbuf **pkts, uint16_t pkts_n)
> >
> >  		if (consumed_strd == strd_n) {
> >  			/* Replace WQE only if the buffer is still in use. */
> > -			if (rte_atomic16_read(&buf->refcnt) > 1) {
> > +			if (__atomic_load_n(&buf->refcnt,
> > +					    __ATOMIC_RELAXED) > 1) {
> >  				mprq_buf_replace(rxq, rq_ci & wq_mask,
> strd_n);
> >  				/* Release the old buffer. */
> >  				mlx5_mprq_buf_free(buf);
> > @@ -1790,9 +1792,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct
> > rte_mbuf **pkts, uint16_t pkts_n)
> >  			void *buf_addr;
> >
> >  			/* Increment the refcnt of the whole chunk. */
> > -			rte_atomic16_add_return(&buf->refcnt, 1);
> > -			MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf-
> > >refcnt) <=
> > -				    strd_n + 1);
> > +			__atomic_add_fetch(&buf->refcnt, 1,
> > __ATOMIC_ACQUIRE);
> > +			MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
> > +				    __ATOMIC_RELAXED) <= strd_n + 1);
> >  			buf_addr = RTE_PTR_SUB(addr,
> > RTE_PKTMBUF_HEADROOM);
> >  			/*
> >  			 * MLX5 device doesn't use iova but it is necessary in a
> diff
> > --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> > index 26621ff..0fc15f3 100644
> > --- a/drivers/net/mlx5/mlx5_rxtx.h
> > +++ b/drivers/net/mlx5/mlx5_rxtx.h
> > @@ -78,7 +78,7 @@ struct rxq_zip {
> >  /* Multi-Packet RQ buffer header. */
> >  struct mlx5_mprq_buf {
> >  	struct rte_mempool *mp;
> > -	rte_atomic16_t refcnt; /* Atomically accessed refcnt. */
> > +	uint16_t refcnt; /* Atomically accessed refcnt. */
> >  	uint8_t pad[RTE_PKTMBUF_HEADROOM]; /* Headroom for the first
> packet.
> > */
> >  	struct rte_mbuf_ext_shared_info shinfos[];
> >  	/*
> > --
> > 2.7.4
  
Phil Yang July 21, 2020, 1:55 a.m. UTC | #3
Alexander Kozyrev <akozyrev@mellanox.com> writes:

<...>
> Subject: RE: [dpdk-dev] [PATCH v3] net/mlx5: relaxed ordering for multi-
> packet RQ buffer refcnt
> 
> Hi Phil Yang, we noticed that this patch gives us 10% of performance
> degradation on ARM.
> x86 seems to be unaffected though. Do you know what may be the reason
> of this behavior?

Hi Alexander,

Thanks for your feedback.
This patch removed some expensive memory barriers on aarch64, so it should improve performance.
I am not sure of the root cause of this degradation yet; I will start the investigation. We can profile this issue together.
Could you share your test case (including your testbed configuration) with us?

Thanks,
Phil
> 
> Regards,
> Alex
> 
> > -----Original Message-----
> > From: dev <dev-bounces@dpdk.org> On Behalf Of Phil Yang
> > Sent: Sunday, July 12, 2020 23:02
> > To: Matan Azrad <matan@mellanox.com>; Shahaf Shuler
> > <shahafs@mellanox.com>; Slava Ovsiienko <viacheslavo@mellanox.com>
> > Cc: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>;
> > drc@linux.vnet.ibm.com; nd <nd@arm.com>; Phil Yang
> <Phil.Yang@arm.com>;
> > dev@dpdk.org; nd <nd@arm.com>
> > Subject: Re: [dpdk-dev] [PATCH v3] net/mlx5: relaxed ordering for multi-
> packet
> > RQ buffer refcnt
> >
> > Hi,
> >
> > We are also doing C11 atomics converting for other components.
> > Your insight would be much appreciated.
> >
> > Thanks,
> > Phil Yang
> >
> > > -----Original Message-----
> > > From: dev <dev-bounces@dpdk.org> On Behalf Of Phil Yang
> > > Sent: Tuesday, June 23, 2020 4:27 PM
> > > To: dev@dpdk.org
> > > Cc: matan@mellanox.com; shahafs@mellanox.com;
> > > viacheslavo@mellanox.com; Honnappa Nagarahalli
> > > <Honnappa.Nagarahalli@arm.com>; drc@linux.vnet.ibm.com; nd
> > > <nd@arm.com>
> > > Subject: [dpdk-dev] [PATCH v3] net/mlx5: relaxed ordering for
> > > multi-packet RQ buffer refcnt
> > >
> > > Use c11 atomics with explicit ordering instead of the rte_atomic ops
> > > which enforce unnecessary barriers on aarch64.
> > >
> > > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > > ---
> > > v3:
> > > Split from the patchset:
> > > http://patchwork.dpdk.org/cover/68159/
> > >
> > >  drivers/net/mlx5/mlx5_rxq.c  |  2 +-
> > >  drivers/net/mlx5/mlx5_rxtx.c | 16 +++++++++-------
> > > drivers/net/mlx5/mlx5_rxtx.h |  2 +-
> > >  3 files changed, 11 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
> > > index dda0073..7f487f1 100644
> > > --- a/drivers/net/mlx5/mlx5_rxq.c
> > > +++ b/drivers/net/mlx5/mlx5_rxq.c
> > > @@ -1545,7 +1545,7 @@ mlx5_mprq_buf_init(struct rte_mempool *mp,
> void
> > > *opaque_arg,
> > >
> > >  	memset(_m, 0, sizeof(*buf));
> > >  	buf->mp = mp;
> > > -	rte_atomic16_set(&buf->refcnt, 1);
> > > +	__atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
> > >  	for (j = 0; j != strd_n; ++j) {
> > >  		shinfo = &buf->shinfos[j];
> > >  		shinfo->free_cb = mlx5_mprq_buf_free_cb; diff --git
> > > a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index
> > > e4106bf..f0eda88 100644
> > > --- a/drivers/net/mlx5/mlx5_rxtx.c
> > > +++ b/drivers/net/mlx5/mlx5_rxtx.c
> > > @@ -1595,10 +1595,11 @@ mlx5_mprq_buf_free_cb(void *addr
> > __rte_unused,
> > > void *opaque)  {
> > >  	struct mlx5_mprq_buf *buf = opaque;
> > >
> > > -	if (rte_atomic16_read(&buf->refcnt) == 1) {
> > > +	if (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) == 1) {
> > >  		rte_mempool_put(buf->mp, buf);
> > > -	} else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) {
> > > -		rte_atomic16_set(&buf->refcnt, 1);
> > > +	} else if (unlikely(__atomic_sub_fetch(&buf->refcnt, 1,
> > > +					       __ATOMIC_RELAXED) == 0)) {
> > > +		__atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
> > >  		rte_mempool_put(buf->mp, buf);
> > >  	}
> > >  }
> > > @@ -1678,7 +1679,8 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct
> > > rte_mbuf **pkts, uint16_t pkts_n)
> > >
> > >  		if (consumed_strd == strd_n) {
> > >  			/* Replace WQE only if the buffer is still in use. */
> > > -			if (rte_atomic16_read(&buf->refcnt) > 1) {
> > > +			if (__atomic_load_n(&buf->refcnt,
> > > +					    __ATOMIC_RELAXED) > 1) {
> > >  				mprq_buf_replace(rxq, rq_ci & wq_mask,
> > strd_n);
> > >  				/* Release the old buffer. */
> > >  				mlx5_mprq_buf_free(buf);
> > > @@ -1790,9 +1792,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct
> > > rte_mbuf **pkts, uint16_t pkts_n)
> > >  			void *buf_addr;
> > >
> > >  			/* Increment the refcnt of the whole chunk. */
> > > -			rte_atomic16_add_return(&buf->refcnt, 1);
> > > -			MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf-
> > > >refcnt) <=
> > > -				    strd_n + 1);
> > > +			__atomic_add_fetch(&buf->refcnt, 1,
> > > __ATOMIC_ACQUIRE);
> > > +			MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
> > > +				    __ATOMIC_RELAXED) <= strd_n + 1);
> > >  			buf_addr = RTE_PTR_SUB(addr,
> > > RTE_PKTMBUF_HEADROOM);
> > >  			/*
> > >  			 * MLX5 device doesn't use iova but it is necessary in
> a
> > diff
> > > --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> > > index 26621ff..0fc15f3 100644
> > > --- a/drivers/net/mlx5/mlx5_rxtx.h
> > > +++ b/drivers/net/mlx5/mlx5_rxtx.h
> > > @@ -78,7 +78,7 @@ struct rxq_zip {
> > >  /* Multi-Packet RQ buffer header. */
> > >  struct mlx5_mprq_buf {
> > >  	struct rte_mempool *mp;
> > > -	rte_atomic16_t refcnt; /* Atomically accessed refcnt. */
> > > +	uint16_t refcnt; /* Atomically accessed refcnt. */
> > >  	uint8_t pad[RTE_PKTMBUF_HEADROOM]; /* Headroom for the first
> > packet.
> > > */
> > >  	struct rte_mbuf_ext_shared_info shinfos[];
> > >  	/*
> > > --
> > > 2.7.4
  
Alexander Kozyrev July 21, 2020, 3:58 a.m. UTC | #4
> Phil Yang <Phil.Yang@arm.com> writes:
> 
> <...>
> > Subject: RE: [dpdk-dev] [PATCH v3] net/mlx5: relaxed ordering for
> > multi- packet RQ buffer refcnt
> >
> > Hi Phil Yang, we noticed that this patch gives us 10% of performance
> > degradation on ARM.
> > x86 seems to be unaffected though. Do you know what may be the reason
> > of this behavior?
> 
> Hi Alexander,
> 
> Thanks for your feedback.
> This patch removed some expensive memory barriers on aarch64, it should get
> better performance.
> I am not sure the root cause of this degradation now, I will start the
> investigation. We can profiling this issue together.
> Could you share your test case(including your testbed configuration) with us?
> 
> Thanks,
> Phil

I'm surprised too, Phil, but it looks like it is actually making things worse. I used ConnectX-6 Dx on aarch64:
Linux dragon71-bf 5.4.31-mlnx.15.ge938819 #1 SMP PREEMPT Thu Jul 2 17:01:15 IDT 2020 aarch64 aarch64 aarch64 GNU/Linux
The traffic generator sends 60-byte packets and the DUT executes the following command:
arm64-bluefield-linuxapp-gcc/build/app/test-pmd/testpmd -n 4  -w 0000:03:00.1,mprq_en=1,rxqs_min_mprq=1  -w 0000:03:00.0,mprq_en=1,rxqs_min_mprq=1 -c 0xe  -- --burst=64 --mbcache=512 -i  --nb-cores=1  --rxq=1 --txq=1 --txd=256 --rxd=256  --auto-start --rss-udp
Without the patch I'm getting 3.2 Mpps, and only 2.9 Mpps when the patch is applied.

Regards,
Alex
  
Honnappa Nagarahalli July 21, 2020, 4:03 a.m. UTC | #5
<snip>

> > > Subject: RE: [dpdk-dev] [PATCH v3] net/mlx5: relaxed ordering for
> > > multi- packet RQ buffer refcnt
> > >
> > > Hi Phil Yang, we noticed that this patch gives us 10% of performance
> > > degradation on ARM.
> > > x86 seems to be unaffected though. Do you know what may be the
> > > reason of this behavior?
> >
> > Hi Alexander,
> >
> > Thanks for your feedback.
> > This patch removed some expensive memory barriers on aarch64, it
> > should get better performance.
> > I am not sure the root cause of this degradation now, I will start the
> > investigation. We can profiling this issue together.
> > Could you share your test case(including your testbed configuration) with
> us?
> >
> > Thanks,
> > Phil
> 
> I'm surprised too, Phil, but looks like it is actually making things worse. I used
> Connect-X 6DX on aarch64:
> Linux dragon71-bf 5.4.31-mlnx.15.ge938819 #1 SMP PREEMPT Thu Jul 2
> 17:01:15 IDT 2020 aarch64 aarch64 aarch64 GNU/Linux Traffic generator
> sends 60 bytes packets and DUT executes the following command:
> arm64-bluefield-linuxapp-gcc/build/app/test-pmd/testpmd -n 4  -w
> 0000:03:00.1,mprq_en=1,rxqs_min_mprq=1  -w
> 0000:03:00.0,mprq_en=1,rxqs_min_mprq=1 -c 0xe  -- --burst=64 --
> mbcache=512 -i  --nb-cores=1  --rxq=1 --txq=1 --txd=256 --rxd=256  --auto-
> start --rss-udp Without a patch I'm getting 3.2mpps, and only 2.9mpps when
> the patch is applied.
You are running on A72 cores, is that correct?

> 
> Regards,
> Alex
  
Alexander Kozyrev July 21, 2020, 4:11 a.m. UTC | #6
<snip>
> 
> > > > Subject: RE: [dpdk-dev] [PATCH v3] net/mlx5: relaxed ordering for
> > > > multi- packet RQ buffer refcnt
> > > >
> > > > Hi Phil Yang, we noticed that this patch gives us 10% of
> > > > performance degradation on ARM.
> > > > x86 seems to be unaffected though. Do you know what may be the
> > > > reason of this behavior?
> > >
> > > Hi Alexander,
> > >
> > > Thanks for your feedback.
> > > This patch removed some expensive memory barriers on aarch64, it
> > > should get better performance.
> > > I am not sure the root cause of this degradation now, I will start
> > > the investigation. We can profiling this issue together.
> > > Could you share your test case(including your testbed configuration)
> > > with
> > us?
> > >
> > > Thanks,
> > > Phil
> >
> > I'm surprised too, Phil, but looks like it is actually making things
> > worse. I used Connect-X 6DX on aarch64:
> > Linux dragon71-bf 5.4.31-mlnx.15.ge938819 #1 SMP PREEMPT Thu Jul 2
> > 17:01:15 IDT 2020 aarch64 aarch64 aarch64 GNU/Linux Traffic generator
> > sends 60 bytes packets and DUT executes the following command:
> > arm64-bluefield-linuxapp-gcc/build/app/test-pmd/testpmd -n 4  -w
> > 0000:03:00.1,mprq_en=1,rxqs_min_mprq=1  -w
> > 0000:03:00.0,mprq_en=1,rxqs_min_mprq=1 -c 0xe  -- --burst=64 --
> > mbcache=512 -i  --nb-cores=1  --rxq=1 --txq=1 --txd=256 --rxd=256
> > --auto- start --rss-udp Without a patch I'm getting 3.2mpps, and only
> > 2.9mpps when the patch is applied.
> You are running on A72 cores, is that correct?

Correct, cat /proc/cpuinfo
processor       : 0
BogoMIPS        : 312.50
Features        : fp asimd evtstrm crc32 cpuid
CPU implementer : 0x41
CPU architecture: 8
CPU variant     : 0x0
CPU part        : 0xd08
CPU revision    : 3
  
Phil Yang July 22, 2020, 12:06 p.m. UTC | #7
Alexander Kozyrev <akozyrev@mellanox.com> writes:

<snip>
> >
> > > > > Subject: RE: [dpdk-dev] [PATCH v3] net/mlx5: relaxed ordering for
> > > > > multi- packet RQ buffer refcnt
> > > > >
> > > > > Hi Phil Yang, we noticed that this patch gives us 10% of
> > > > > performance degradation on ARM.
> > > > > x86 seems to be unaffected though. Do you know what may be the
> > > > > reason of this behavior?
> > > >
> > > > Hi Alexander,
> > > >
> > > > Thanks for your feedback.
> > > > This patch removed some expensive memory barriers on aarch64, it
> > > > should get better performance.
> > > > I am not sure the root cause of this degradation now, I will start
> > > > the investigation. We can profiling this issue together.
> > > > Could you share your test case(including your testbed configuration)
> > > > with
> > > us?
<...>
> > >
> > > I'm surprised too, Phil, but looks like it is actually making things
> > > worse. I used Connect-X 6DX on aarch64:
> > > Linux dragon71-bf 5.4.31-mlnx.15.ge938819 #1 SMP PREEMPT Thu Jul 2
> > > 17:01:15 IDT 2020 aarch64 aarch64 aarch64 GNU/Linux Traffic generator
> > > sends 60 bytes packets and DUT executes the following command:
> > > arm64-bluefield-linuxapp-gcc/build/app/test-pmd/testpmd -n 4  -w
> > > 0000:03:00.1,mprq_en=1,rxqs_min_mprq=1  -w
> > > 0000:03:00.0,mprq_en=1,rxqs_min_mprq=1 -c 0xe  -- --burst=64 --
> > > mbcache=512 -i  --nb-cores=1  --rxq=1 --txq=1 --txd=256 --rxd=256
> > > --auto- start --rss-udp Without a patch I'm getting 3.2mpps, and only
> > > 2.9mpps when the patch is applied.
> > You are running on A72 cores, is that correct?
> 
> Correct, cat /proc/cpuinfo
> processor       : 0
> BogoMIPS        : 312.50
> Features        : fp asimd evtstrm crc32 cpuid
> CPU implementer : 0x41
> CPU architecture: 8
> CPU variant     : 0x0
> CPU part        : 0xd08
> CPU revision    : 3

Thanks a lot for your input, Alex.
With your test command line, I remeasured this patch on two different aarch64 machines and both showed some performance improvement.

SOC#1. On ThunderX2 (with LSE support), I see a 7.6% throughput improvement.
NIC: ConnectX-6 / driver: mlx5_core version: 5.0-1.0.0.0 / firmware-version: 20.27.1016 (MT_0000000224)

SOC#2. On N1SDP (I disabled LSE to generate A72-like instructions), I also see a slight (about 1%~2%) throughput improvement.
NIC: ConnectX-5 / driver: mlx5_core / version: 5.0-2.1.8 / firmware-version: 16.27.2008 (MT_0000000090)

Without LSE (i.e., the A72 and SOC#2 cases), it uses the 'Exclusive' (load/store-exclusive) mechanism to achieve atomicity.
For example, it generates the instructions below for __atomic_add_fetch:
__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_ACQUIRE);
   70118:       f94037e3        ldr     x3, [sp, #104]
   7011c:       91002060        add     x0, x3, #0x8
   70120:       485ffc02        ldaxrh  w2, [x0]
   70124:       11000442        add     w2, w2, #0x1
   70128:       48057c02        stxrh   w5, w2, [x0]
   7012c:       35ffffa5        cbnz    w5, 70120 <mlx5_rx_burst_mprq+0xb48>
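
A minimal snippet one could compile both ways to see this difference (cnt and inc_acquire are illustrative, not mlx5 code): built with -march=armv8-a it produces an exclusive ldaxrh/stxrh loop like the one above, while with LSE enabled (e.g. -march=armv8.1-a) it can compile to a single atomic instruction such as ldaddah.

#include <stdint.h>

/* Acquire-ordered increment: LL/SC loop without LSE,
 * single far atomic instruction with LSE. */
uint16_t
inc_acquire(uint16_t *cnt)
{
	return __atomic_add_fetch(cnt, 1, __ATOMIC_ACQUIRE);
}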

In general, I think this patch will not lead to a sharp decline in performance. 
Maybe you can try other testbeds?

Thanks,
Phil
  
Honnappa Nagarahalli July 23, 2020, 4:47 a.m. UTC | #8
Hi Alexander,
	Thank you for testing this patch. A few comments below.

<snip>

> > Subject: Re: [dpdk-dev] [PATCH v3] net/mlx5: relaxed ordering for
> > multi-packet RQ buffer refcnt
> >
> > Hi,
> >
> > We are also doing C11 atomics converting for other components.
> > Your insight would be much appreciated.
> >
> > Thanks,
> > Phil Yang
> >
> > > -----Original Message-----
> > > From: dev <dev-bounces@dpdk.org> On Behalf Of Phil Yang
> > > Sent: Tuesday, June 23, 2020 4:27 PM
> > > To: dev@dpdk.org
> > > Cc: matan@mellanox.com; shahafs@mellanox.com;
> > > viacheslavo@mellanox.com; Honnappa Nagarahalli
> > > <Honnappa.Nagarahalli@arm.com>; drc@linux.vnet.ibm.com; nd
> > > <nd@arm.com>
> > > Subject: [dpdk-dev] [PATCH v3] net/mlx5: relaxed ordering for
> > > multi-packet RQ buffer refcnt
> > >
> > > Use c11 atomics with explicit ordering instead of the rte_atomic ops
> > > which enforce unnecessary barriers on aarch64.
> > >
> > > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > > ---
> > > v3:
> > > Split from the patchset:
> > > http://patchwork.dpdk.org/cover/68159/
> > >
> > >  drivers/net/mlx5/mlx5_rxq.c  |  2 +-  drivers/net/mlx5/mlx5_rxtx.c
> > > | 16 +++++++++------- drivers/net/mlx5/mlx5_rxtx.h |  2 +-
> > >  3 files changed, 11 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/drivers/net/mlx5/mlx5_rxq.c
> > > b/drivers/net/mlx5/mlx5_rxq.c index dda0073..7f487f1 100644
> > > --- a/drivers/net/mlx5/mlx5_rxq.c
> > > +++ b/drivers/net/mlx5/mlx5_rxq.c
> > > @@ -1545,7 +1545,7 @@ mlx5_mprq_buf_init(struct rte_mempool *mp,
> > > void *opaque_arg,
> > >
> > >  	memset(_m, 0, sizeof(*buf));
> > >  	buf->mp = mp;
> > > -	rte_atomic16_set(&buf->refcnt, 1);
> > > +	__atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
> > >  	for (j = 0; j != strd_n; ++j) {
> > >  		shinfo = &buf->shinfos[j];
> > >  		shinfo->free_cb = mlx5_mprq_buf_free_cb; diff --git
> > > a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index
> > > e4106bf..f0eda88 100644
> > > --- a/drivers/net/mlx5/mlx5_rxtx.c
> > > +++ b/drivers/net/mlx5/mlx5_rxtx.c
> > > @@ -1595,10 +1595,11 @@ mlx5_mprq_buf_free_cb(void *addr
> > __rte_unused,
> > > void *opaque)  {
> > >  	struct mlx5_mprq_buf *buf = opaque;
> > >
> > > -	if (rte_atomic16_read(&buf->refcnt) == 1) {
> > > +	if (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) == 1) {
> > >  		rte_mempool_put(buf->mp, buf);
> > > -	} else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) {
> > > -		rte_atomic16_set(&buf->refcnt, 1);
> > > +	} else if (unlikely(__atomic_sub_fetch(&buf->refcnt, 1,
> > > +					       __ATOMIC_RELAXED) == 0)) {
> > > +		__atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
> > >  		rte_mempool_put(buf->mp, buf);
> > >  	}
> > >  }
> > > @@ -1678,7 +1679,8 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct
> > > rte_mbuf **pkts, uint16_t pkts_n)
> > >
> > >  		if (consumed_strd == strd_n) {
> > >  			/* Replace WQE only if the buffer is still in use. */
> > > -			if (rte_atomic16_read(&buf->refcnt) > 1) {
> > > +			if (__atomic_load_n(&buf->refcnt,
> > > +					    __ATOMIC_RELAXED) > 1) {
> > >  				mprq_buf_replace(rxq, rq_ci & wq_mask,
> > strd_n);
> > >  				/* Release the old buffer. */
> > >  				mlx5_mprq_buf_free(buf);
> > > @@ -1790,9 +1792,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct
> > > rte_mbuf **pkts, uint16_t pkts_n)
> > >  			void *buf_addr;
> > >
> > >  			/* Increment the refcnt of the whole chunk. */
> > > -			rte_atomic16_add_return(&buf->refcnt, 1);
rte_atomic16_add_return includes a full barrier along with the atomic operation. But is a full barrier required here? For example, __atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED) will offer atomicity, but no barrier. Would that be enough?

> > > -			MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf-
> > > >refcnt) <=
> > > -				    strd_n + 1);
> > > +			__atomic_add_fetch(&buf->refcnt, 1,
> > > __ATOMIC_ACQUIRE);

Can you replace just the above line with the following lines and test it?

__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);
__atomic_thread_fence(__ATOMIC_ACQ_REL);

This should make the generated code the same as before this patch. Let me know if you would prefer us to re-spin the patch instead (for testing).
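
Put together as a compilable sketch (refcnt is an illustrative stand-in for buf->refcnt, not the mlx5 structure), the suggested experiment looks like this next to the ordering currently in the patch:

#include <stdint.h>

static uint16_t refcnt;

/* Ordering as submitted in v3. */
static inline void
inc_v3(void)
{
	__atomic_add_fetch(&refcnt, 1, __ATOMIC_ACQUIRE);
}

/* Suggested experiment: relaxed atomicity plus an explicit fence,
 * which should generate the same code as the old full-barrier
 * rte_atomic16_add_return(). */
static inline void
inc_fenced(void)
{
	__atomic_add_fetch(&refcnt, 1, __ATOMIC_RELAXED);
	__atomic_thread_fence(__ATOMIC_ACQ_REL);
}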

> > > +			MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
> > > +				    __ATOMIC_RELAXED) <= strd_n + 1);
> > >  			buf_addr = RTE_PTR_SUB(addr,
> > > RTE_PKTMBUF_HEADROOM);
> > >  			/*
> > >  			 * MLX5 device doesn't use iova but it is necessary in a
> > diff
> > > --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> > > index 26621ff..0fc15f3 100644
> > > --- a/drivers/net/mlx5/mlx5_rxtx.h
> > > +++ b/drivers/net/mlx5/mlx5_rxtx.h
> > > @@ -78,7 +78,7 @@ struct rxq_zip {
> > >  /* Multi-Packet RQ buffer header. */  struct mlx5_mprq_buf {
> > >  	struct rte_mempool *mp;
> > > -	rte_atomic16_t refcnt; /* Atomically accessed refcnt. */
> > > +	uint16_t refcnt; /* Atomically accessed refcnt. */
> > >  	uint8_t pad[RTE_PKTMBUF_HEADROOM]; /* Headroom for the first
> > packet.
> > > */
> > >  	struct rte_mbuf_ext_shared_info shinfos[];
> > >  	/*
> > > --
> > > 2.7.4
  
Phil Yang July 23, 2020, 6:11 a.m. UTC | #9
<snip>
> 
> > > Subject: Re: [dpdk-dev] [PATCH v3] net/mlx5: relaxed ordering for
> > > multi-packet RQ buffer refcnt
> > >
> > > Hi,
> > >
> > > We are also doing C11 atomics converting for other components.
> > > Your insight would be much appreciated.
> > >
> > > Thanks,
> > > Phil Yang
> > >
> > > > -----Original Message-----
> > > > From: dev <dev-bounces@dpdk.org> On Behalf Of Phil Yang
> > > > Sent: Tuesday, June 23, 2020 4:27 PM
> > > > To: dev@dpdk.org
> > > > Cc: matan@mellanox.com; shahafs@mellanox.com;
> > > > viacheslavo@mellanox.com; Honnappa Nagarahalli
> > > > <Honnappa.Nagarahalli@arm.com>; drc@linux.vnet.ibm.com; nd
> > > > <nd@arm.com>
> > > > Subject: [dpdk-dev] [PATCH v3] net/mlx5: relaxed ordering for
> > > > multi-packet RQ buffer refcnt
> > > >
> > > > Use c11 atomics with explicit ordering instead of the rte_atomic ops
> > > > which enforce unnecessary barriers on aarch64.
> > > >
> > > > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > > > ---
<...>
> > > >
> > > >  drivers/net/mlx5/mlx5_rxq.c  |  2 +-  drivers/net/mlx5/mlx5_rxtx.c
> > > > | 16 +++++++++------- drivers/net/mlx5/mlx5_rxtx.h |  2 +-
> > > >  3 files changed, 11 insertions(+), 9 deletions(-)
> > > >
> > > > diff --git a/drivers/net/mlx5/mlx5_rxq.c
> > > > b/drivers/net/mlx5/mlx5_rxq.c index dda0073..7f487f1 100644
> > > > --- a/drivers/net/mlx5/mlx5_rxq.c
> > > > +++ b/drivers/net/mlx5/mlx5_rxq.c
> > > > @@ -1545,7 +1545,7 @@ mlx5_mprq_buf_init(struct rte_mempool
> *mp,
> > > > void *opaque_arg,
> > > >
> > > >  memset(_m, 0, sizeof(*buf));
> > > >  buf->mp = mp;
> > > > -rte_atomic16_set(&buf->refcnt, 1);
> > > > +__atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
> > > >  for (j = 0; j != strd_n; ++j) {
> > > >  shinfo = &buf->shinfos[j];
> > > >  shinfo->free_cb = mlx5_mprq_buf_free_cb; diff --git
> > > > a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index
> > > > e4106bf..f0eda88 100644
> > > > --- a/drivers/net/mlx5/mlx5_rxtx.c
> > > > +++ b/drivers/net/mlx5/mlx5_rxtx.c
> > > > @@ -1595,10 +1595,11 @@ mlx5_mprq_buf_free_cb(void *addr
> > > __rte_unused,
> > > > void *opaque)  {
> > > >  struct mlx5_mprq_buf *buf = opaque;
> > > >
> > > > -if (rte_atomic16_read(&buf->refcnt) == 1) {
> > > > +if (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) == 1) {
> > > >  rte_mempool_put(buf->mp, buf);
> > > > -} else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) {
> > > > -rte_atomic16_set(&buf->refcnt, 1);
> > > > +} else if (unlikely(__atomic_sub_fetch(&buf->refcnt, 1,
> > > > +       __ATOMIC_RELAXED) == 0)) {
> > > > +__atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
> > > >  rte_mempool_put(buf->mp, buf);
> > > >  }
> > > >  }
> > > > @@ -1678,7 +1679,8 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct
> > > > rte_mbuf **pkts, uint16_t pkts_n)
> > > >
> > > >  if (consumed_strd == strd_n) {
> > > >  /* Replace WQE only if the buffer is still in use. */
> > > > -if (rte_atomic16_read(&buf->refcnt) > 1) {
> > > > +if (__atomic_load_n(&buf->refcnt,
> > > > +    __ATOMIC_RELAXED) > 1) {
> > > >  mprq_buf_replace(rxq, rq_ci & wq_mask,
> > > strd_n);
> > > >  /* Release the old buffer. */
> > > >  mlx5_mprq_buf_free(buf);
> > > > @@ -1790,9 +1792,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct
> > > > rte_mbuf **pkts, uint16_t pkts_n)
> > > >  void *buf_addr;
> > > >
> > > >  /* Increment the refcnt of the whole chunk. */
> > > > -rte_atomic16_add_return(&buf->refcnt, 1);
> rte_atomic16_add_return includes a full barrier along with atomic operation.
> But is full barrier required here? For ex: __atomic_add_fetch(&buf->refcnt, 1,
> __ATOMIC_RELAXED) will offer atomicity, but no barrier. Would that be
> enough?
> 
> > > > -MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf-
> > > > >refcnt) <=
> > > > -    strd_n + 1);
> > > > +__atomic_add_fetch(&buf->refcnt, 1,
> > > > __ATOMIC_ACQUIRE);

The atomic load in MLX5_ASSERT() accesses the same memory location as the previous __atomic_add_fetch() does.
They will access this memory location in program order when MLX5_PMD_DEBUG is enabled, so the ACQUIRE barrier in __atomic_add_fetch() becomes unnecessary.
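
A minimal sketch of that argument (refcnt is an illustrative stand-in for buf->refcnt):

#include <assert.h>
#include <stdint.h>

static uint16_t refcnt;

/* Both accesses target the same atomic object, so the load is ordered
 * after the add on this thread by coherence even with relaxed
 * ordering; no acquire barrier is needed for the assertion to observe
 * the increment. */
static inline void
inc_and_assert(uint16_t bound)
{
	__atomic_add_fetch(&refcnt, 1, __ATOMIC_RELAXED);
	assert(__atomic_load_n(&refcnt, __ATOMIC_RELAXED) <= bound);
}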

By changing it to RELAXED ordering, this patch got a 7.6% performance improvement on N1 (configured to generate A72-like instructions).

Could you please also try it on your testbed, Alex?

> 
> Can you replace just the above line with the following lines and test it?
> 
> __atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);
> __atomic_thread_fence(__ATOMIC_ACQ_REL);
> 
> This should make the generated code same as before this patch. Let me
> know if you would prefer us to re-spin the patch instead (for testing).
> 
> > > > +MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
> > > > +    __ATOMIC_RELAXED) <= strd_n + 1);
> > > >  buf_addr = RTE_PTR_SUB(addr,
> > > > RTE_PKTMBUF_HEADROOM);
> > > >  /*
> > > >   * MLX5 device doesn't use iova but it is necessary in a
> > > diff
> > > > --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> > > > index 26621ff..0fc15f3 100644
> > > > --- a/drivers/net/mlx5/mlx5_rxtx.h
> > > > +++ b/drivers/net/mlx5/mlx5_rxtx.h
> > > > @@ -78,7 +78,7 @@ struct rxq_zip {
> > > >  /* Multi-Packet RQ buffer header. */  struct mlx5_mprq_buf {
> > > >  struct rte_mempool *mp;
> > > > -rte_atomic16_t refcnt; /* Atomically accessed refcnt. */
> > > > +uint16_t refcnt; /* Atomically accessed refcnt. */
> > > >  uint8_t pad[RTE_PKTMBUF_HEADROOM]; /* Headroom for the first
> > > packet.
> > > > */
> > > >  struct rte_mbuf_ext_shared_info shinfos[];
> > > >  /*
> > > > --
> > > > 2.7.4
>
  
Alexander Kozyrev July 23, 2020, 4:53 p.m. UTC | #10
> <snip>
> >
> > > > Subject: Re: [dpdk-dev] [PATCH v3] net/mlx5: relaxed ordering for
> > > > multi-packet RQ buffer refcnt
> > > >
> > > > Hi,
> > > >
> > > > We are also doing C11 atomics converting for other components.
> > > > Your insight would be much appreciated.
> > > >
> > > > Thanks,
> > > > Phil Yang
> > > >
> > > > > -----Original Message-----
> > > > > From: dev <dev-bounces@dpdk.org> On Behalf Of Phil Yang
> > > > > Sent: Tuesday, June 23, 2020 4:27 PM
> > > > > To: dev@dpdk.org
> > > > > Cc: matan@mellanox.com; shahafs@mellanox.com;
> > > > > viacheslavo@mellanox.com; Honnappa Nagarahalli
> > > > > <Honnappa.Nagarahalli@arm.com>; drc@linux.vnet.ibm.com; nd
> > > > > <nd@arm.com>
> > > > > Subject: [dpdk-dev] [PATCH v3] net/mlx5: relaxed ordering for
> > > > > multi-packet RQ buffer refcnt
> > > > >
> > > > > Use c11 atomics with explicit ordering instead of the rte_atomic
> > > > > ops which enforce unnecessary barriers on aarch64.
> > > > >
> > > > > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > > > > ---
> <...>
> > > > >
> > > > >  drivers/net/mlx5/mlx5_rxq.c  |  2 +-
> > > > > drivers/net/mlx5/mlx5_rxtx.c
> > > > > | 16 +++++++++------- drivers/net/mlx5/mlx5_rxtx.h |  2 +-
> > > > >  3 files changed, 11 insertions(+), 9 deletions(-)
> > > > >
> > > > > diff --git a/drivers/net/mlx5/mlx5_rxq.c
> > > > > b/drivers/net/mlx5/mlx5_rxq.c index dda0073..7f487f1 100644
> > > > > --- a/drivers/net/mlx5/mlx5_rxq.c
> > > > > +++ b/drivers/net/mlx5/mlx5_rxq.c
> > > > > @@ -1545,7 +1545,7 @@ mlx5_mprq_buf_init(struct rte_mempool
> > *mp,
> > > > > void *opaque_arg,
> > > > >
> > > > >  memset(_m, 0, sizeof(*buf));
> > > > >  buf->mp = mp;
> > > > > -rte_atomic16_set(&buf->refcnt, 1);
> > > > > +__atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
> > > > >  for (j = 0; j != strd_n; ++j) {  shinfo = &buf->shinfos[j];
> > > > > shinfo->free_cb = mlx5_mprq_buf_free_cb; diff --git
> > > > > a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
> > > > > index
> > > > > e4106bf..f0eda88 100644
> > > > > --- a/drivers/net/mlx5/mlx5_rxtx.c
> > > > > +++ b/drivers/net/mlx5/mlx5_rxtx.c
> > > > > @@ -1595,10 +1595,11 @@ mlx5_mprq_buf_free_cb(void *addr
> > > > __rte_unused,
> > > > > void *opaque)  {
> > > > >  struct mlx5_mprq_buf *buf = opaque;
> > > > >
> > > > > -if (rte_atomic16_read(&buf->refcnt) == 1) {
> > > > > +if (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) == 1) {
> > > > >  rte_mempool_put(buf->mp, buf);
> > > > > -} else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) {
> > > > > -rte_atomic16_set(&buf->refcnt, 1);
> > > > > +} else if (unlikely(__atomic_sub_fetch(&buf->refcnt, 1,
> > > > > +       __ATOMIC_RELAXED) == 0)) {
> > > > > +__atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
> > > > >  rte_mempool_put(buf->mp, buf);
> > > > >  }
> > > > >  }
> > > > > @@ -1678,7 +1679,8 @@ mlx5_rx_burst_mprq(void *dpdk_rxq,
> struct
> > > > > rte_mbuf **pkts, uint16_t pkts_n)
> > > > >
> > > > >  if (consumed_strd == strd_n) {
> > > > >  /* Replace WQE only if the buffer is still in use. */ -if
> > > > > (rte_atomic16_read(&buf->refcnt) > 1) {
> > > > > +if (__atomic_load_n(&buf->refcnt,
> > > > > +    __ATOMIC_RELAXED) > 1) {
> > > > >  mprq_buf_replace(rxq, rq_ci & wq_mask,
> > > > strd_n);
> > > > >  /* Release the old buffer. */
> > > > >  mlx5_mprq_buf_free(buf);
> > > > > @@ -1790,9 +1792,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq,
> struct
> > > > > rte_mbuf **pkts, uint16_t pkts_n)  void *buf_addr;
> > > > >
> > > > >  /* Increment the refcnt of the whole chunk. */
> > > > > -rte_atomic16_add_return(&buf->refcnt, 1);
> > rte_atomic16_add_return includes a full barrier along with atomic
> operation.
> > But is full barrier required here? For ex:
> > __atomic_add_fetch(&buf->refcnt, 1,
> > __ATOMIC_RELAXED) will offer atomicity, but no barrier. Would that be
> > enough?
> >
> > > > > -MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf-
> > > > > >refcnt) <=
> > > > > -    strd_n + 1);
> > > > > +__atomic_add_fetch(&buf->refcnt, 1,
> > > > > __ATOMIC_ACQUIRE);
> 
> The atomic load in MLX5_ASSERT() accesses the same memory space as the
> previous __atomic_add_fetch() does.
> They will access this memory space in the program order when we enabled
> MLX5_PMD_DEBUG. So the ACQUIRE barrier in __atomic_add_fetch()
> becomes unnecessary.
> 
> By changing it to RELAXED ordering, this patch got 7.6% performance
> improvement on N1 (making it generate A72 alike instructions).
> 
> Could you please also try it on your testbed, Alex?

The situation got better with this modification; here are the results:
 - no patch:       3.0 Mpps, CPU cycles/packet = 51.52
 - original patch: 2.1 Mpps, CPU cycles/packet = 71.05
 - modified patch: 2.9 Mpps, CPU cycles/packet = 52.79
Also, I found that the degradation is there only when I enable burst stats.
Could you please turn on the following config options and see if you can reproduce this as well?
CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES=y
CONFIG_RTE_TEST_PMD_RECORD_BURST_STATS=y
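
For reference, a rough sketch of the kind of per-burst bookkeeping these options compile in (assumed shape only, not the actual testpmd code), which adds work to every RX/TX burst and can therefore perturb such measurements:

#include <stdint.h>

#define MAX_BURST 512	/* hypothetical bound */

/* Hypothetical burst-size histogram updated on every burst; the extra
 * per-burst loads and stores are the overhead being enabled here. */
struct burst_stats {
	uint64_t size_hist[MAX_BURST + 1];
};

static inline void
record_burst(struct burst_stats *st, uint16_t nb_pkts)
{
	if (nb_pkts <= MAX_BURST)
		st->size_hist[nb_pkts]++;
}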

> >
> > Can you replace just the above line with the following lines and test it?
> >
> > __atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);
> > __atomic_thread_fence(__ATOMIC_ACQ_REL);
> >
> > This should make the generated code same as before this patch. Let me
> > know if you would prefer us to re-spin the patch instead (for testing).
> >
> > > > > +MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
> > > > > +    __ATOMIC_RELAXED) <= strd_n + 1);
> > > > >  buf_addr = RTE_PTR_SUB(addr,
> > > > > RTE_PKTMBUF_HEADROOM);
> > > > >  /*
> > > > >   * MLX5 device doesn't use iova but it is necessary in a
> > > > diff
> > > > > --git a/drivers/net/mlx5/mlx5_rxtx.h
> > > > > b/drivers/net/mlx5/mlx5_rxtx.h index 26621ff..0fc15f3 100644
> > > > > --- a/drivers/net/mlx5/mlx5_rxtx.h
> > > > > +++ b/drivers/net/mlx5/mlx5_rxtx.h
> > > > > @@ -78,7 +78,7 @@ struct rxq_zip {
> > > > >  /* Multi-Packet RQ buffer header. */  struct mlx5_mprq_buf {
> > > > > struct rte_mempool *mp; -rte_atomic16_t refcnt; /* Atomically
> > > > > accessed refcnt. */
> > > > > +uint16_t refcnt; /* Atomically accessed refcnt. */
> > > > >  uint8_t pad[RTE_PKTMBUF_HEADROOM]; /* Headroom for the first
> > > > packet.
> > > > > */
> > > > >  struct rte_mbuf_ext_shared_info shinfos[];
> > > > >  /*
> > > > > --
> > > > > 2.7.4
> >
  
Phil Yang July 27, 2020, 2:52 p.m. UTC | #11
Alexander Kozyrev <akozyrev@mellanox.com> writes:

<snip>

> > > > > > @@ -1790,9 +1792,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq,
> > struct
> > > > > > rte_mbuf **pkts, uint16_t pkts_n)  void *buf_addr;
> > > > > >
> > > > > >  /* Increment the refcnt of the whole chunk. */
> > > > > > -rte_atomic16_add_return(&buf->refcnt, 1);
> > > rte_atomic16_add_return includes a full barrier along with atomic
> > operation.
> > > But is full barrier required here? For ex:
> > > __atomic_add_fetch(&buf->refcnt, 1,
> > > __ATOMIC_RELAXED) will offer atomicity, but no barrier. Would that be
> > > enough?
> > >
> > > > > > -MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf-
> > > > > > >refcnt) <=
> > > > > > -    strd_n + 1);
> > > > > > +__atomic_add_fetch(&buf->refcnt, 1,
> > > > > > __ATOMIC_ACQUIRE);
> >
> > The atomic load in MLX5_ASSERT() accesses the same memory space as the
> > previous __atomic_add_fetch() does.
> > They will access this memory space in the program order when we enabled
> > MLX5_PMD_DEBUG. So the ACQUIRE barrier in __atomic_add_fetch()
> > becomes unnecessary.
> >
> > By changing it to RELAXED ordering, this patch got 7.6% performance
> > improvement on N1 (making it generate A72 alike instructions).
> >
> > Could you please also try it on your testbed, Alex?
> 
> Situation got better with this modification, here are the results:
>  - no patch:             3.0 Mpps CPU cycles/packet=51.52
>  - original patch:    2.1 Mpps CPU cycles/packet=71.05
>  - modified patch: 2.9 Mpps CPU cycles/packet=52.79
> Also, I found that the degradation is there only in case I enable bursts stats.


Great! So this patch will not hurt the normal datapath performance.


> Could you please turn on the following config options and see if you can
> reproduce this as well?
> CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES=y
> CONFIG_RTE_TEST_PMD_RECORD_BURST_STATS=y

Thanks, Alex. Some updates.

A slight (about 1%) throughput degradation was detected after we enabled these two config options on the N1 SoC.

If we look inside the perf stat results, with this patch both mlx5_rx_burst and mlx5_tx_burst consume fewer CPU cycles than the original code.
However, __memcpy_generic takes more cycles. I think that might be the reason for the CPU-cycles-per-packet increment after applying this patch.

Original code:
98.07%--pkt_burst_io_forward
        |
        |--44.53%--__memcpy_generic
        |
        |--35.85%--mlx5_rx_burst_mprq
        |
        |--15.94%--mlx5_tx_burst_none_empw
        |          |
        |          |--7.32%--mlx5_tx_handle_completion.isra.0
        |          |
        |           --0.50%--__memcpy_generic
        |
         --1.14%--memcpy@plt

Use C11 with RELAXED ordering:
99.36%--pkt_burst_io_forward
        |
        |--47.40%--__memcpy_generic
        |
        |--34.62%--mlx5_rx_burst_mprq
        |
        |--15.55%--mlx5_tx_burst_none_empw
        |          |
        |           --7.08%--mlx5_tx_handle_completion.isra.0
        |
         --1.17%--memcpy@plt

BTW, none of the atomic operations in this patch are the hotspot.

> 
> > >
> > > Can you replace just the above line with the following lines and test it?
> > >
> > > __atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);
> > > __atomic_thread_fence(__ATOMIC_ACQ_REL);
> > >
> > > This should make the generated code same as before this patch. Let me
> > > know if you would prefer us to re-spin the patch instead (for testing).
> > >
> > > > > > +MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
> > > > > > +    __ATOMIC_RELAXED) <= strd_n + 1);
> > > > > >  buf_addr = RTE_PTR_SUB(addr,
> > > > > > RTE_PKTMBUF_HEADROOM);
> > > > > >  /*
> > > > > >   * MLX5 device doesn't use iova but it is necessary in a
> > > > > diff
> > > > > > --git a/drivers/net/mlx5/mlx5_rxtx.h
> > > > > > b/drivers/net/mlx5/mlx5_rxtx.h index 26621ff..0fc15f3 100644
> > > > > > --- a/drivers/net/mlx5/mlx5_rxtx.h
> > > > > > +++ b/drivers/net/mlx5/mlx5_rxtx.h
<snip>
> > >
  
Alexander Kozyrev Aug. 6, 2020, 2:43 a.m. UTC | #12
> Phil Yang <Phil.Yang@arm.com> writes:
> 
> <snip>
> 
> > > > > > > @@ -1790,9 +1792,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq,
> > > struct
> > > > > > > rte_mbuf **pkts, uint16_t pkts_n)  void *buf_addr;
> > > > > > >
> > > > > > >  /* Increment the refcnt of the whole chunk. */
> > > > > > > -rte_atomic16_add_return(&buf->refcnt, 1);
> > > > rte_atomic16_add_return includes a full barrier along with atomic
> > > operation.
> > > > But is full barrier required here? For ex:
> > > > __atomic_add_fetch(&buf->refcnt, 1,
> > > > __ATOMIC_RELAXED) will offer atomicity, but no barrier. Would that
> > > > be enough?
> > > >
> > > > > > > -MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf-
> > > > > > > >refcnt) <=
> > > > > > > -    strd_n + 1);
> > > > > > > +__atomic_add_fetch(&buf->refcnt, 1,
> > > > > > > __ATOMIC_ACQUIRE);
> > >
> > > The atomic load in MLX5_ASSERT() accesses the same memory space as
> > > the previous __atomic_add_fetch() does.
> > > They will access this memory space in the program order when we
> > > enabled MLX5_PMD_DEBUG. So the ACQUIRE barrier in
> > > __atomic_add_fetch() becomes unnecessary.
> > >
> > > By changing it to RELAXED ordering, this patch got 7.6% performance
> > > improvement on N1 (making it generate A72 alike instructions).
> > >
> > > Could you please also try it on your testbed, Alex?
> >
> > Situation got better with this modification, here are the results:
> >  - no patch:             3.0 Mpps CPU cycles/packet=51.52
> >  - original patch:    2.1 Mpps CPU cycles/packet=71.05
> >  - modified patch: 2.9 Mpps CPU cycles/packet=52.79 Also, I found that
> > the degradation is there only in case I enable bursts stats.
> 
> 
> Great! So this patch will not hurt the normal datapath performance.
> 
> 
> > Could you please turn on the following config options and see if you
> > can reproduce this as well?
> > CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES=y
> > CONFIG_RTE_TEST_PMD_RECORD_BURST_STATS=y
> 
> Thanks, Alex. Some updates.
> 
> Slightly (about 1%) throughput degradation was detected after we enabled
> these two config options on N1 SoC.
> 
> If we look insight the perf stats results, with this patch, both mlx5_rx_burst
> and mlx5_tx_burst consume fewer CPU cycles than the original code.
> However, __memcpy_generic takes more cycles. I think that might be the
> reason for CPU cycles per packet increment after applying this patch.
> 
> Original code:
> 98.07%--pkt_burst_io_forward
>         |
>         |--44.53%--__memcpy_generic
>         |
>         |--35.85%--mlx5_rx_burst_mprq
>         |
>         |--15.94%--mlx5_tx_burst_none_empw
>         |          |
>         |          |--7.32%--mlx5_tx_handle_completion.isra.0
>         |          |
>         |           --0.50%--__memcpy_generic
>         |
>          --1.14%--memcpy@plt
> 
> Use C11 with RELAXED ordering:
> 99.36%--pkt_burst_io_forward
>         |
>         |--47.40%--__memcpy_generic
>         |
>         |--34.62%--mlx5_rx_burst_mprq
>         |
>         |--15.55%--mlx5_tx_burst_none_empw
>         |          |
>         |           --7.08%--mlx5_tx_handle_completion.isra.0
>         |
>          --1.17%--memcpy@plt
> 
> BTW, all the atomic operations in this patch are not the hotspot.

Phil, we are seeing a much worse degradation on our ARM platform, unfortunately.
I don't think the discrepancy in memcpy can explain this behavior,
since your patch is not touching that area of code. Let me collect some perf stats on our side.

> 
> >
> > > >
> > > > Can you replace just the above line with the following lines and test it?
> > > >
> > > > __atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);
> > > > __atomic_thread_fence(__ATOMIC_ACQ_REL);
> > > >
> > > > This should make the generated code same as before this patch. Let
> > > > me know if you would prefer us to re-spin the patch instead (for
> testing).
> > > >
> > > > > > > +MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
> > > > > > > +    __ATOMIC_RELAXED) <= strd_n + 1);
> > > > > > >  buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
> > > > > > >  /*
> > > > > > >   * MLX5 device doesn't use iova but it is necessary in a
> > > > > > diff
> > > > > > > --git a/drivers/net/mlx5/mlx5_rxtx.h
> > > > > > > b/drivers/net/mlx5/mlx5_rxtx.h index 26621ff..0fc15f3 100644
> > > > > > > --- a/drivers/net/mlx5/mlx5_rxtx.h
> > > > > > > +++ b/drivers/net/mlx5/mlx5_rxtx.h
> <snip>
> > > >
  
Honnappa Nagarahalli Aug. 11, 2020, 5:20 a.m. UTC | #13
<snip>

> >
> > > > > > > > @@ -1790,9 +1792,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq,
> > > > struct
> > > > > > > > rte_mbuf **pkts, uint16_t pkts_n)  void *buf_addr;
> > > > > > > >
> > > > > > > >  /* Increment the refcnt of the whole chunk. */
> > > > > > > > -rte_atomic16_add_return(&buf->refcnt, 1);
> > > > > rte_atomic16_add_return includes a full barrier along with
> > > > > atomic
> > > > operation.
> > > > > But is full barrier required here? For ex:
> > > > > __atomic_add_fetch(&buf->refcnt, 1,
> > > > > __ATOMIC_RELAXED) will offer atomicity, but no barrier. Would
> > > > > that be enough?
> > > > >
> > > > > > > > -MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf-
> > > > > > > > >refcnt) <=
> > > > > > > > -    strd_n + 1);
> > > > > > > > +__atomic_add_fetch(&buf->refcnt, 1,
> > > > > > > > __ATOMIC_ACQUIRE);
> > > >
> > > > The atomic load in MLX5_ASSERT() accesses the same memory space as
> > > > the previous __atomic_add_fetch() does.
> > > > They will access this memory space in the program order when we
> > > > enabled MLX5_PMD_DEBUG. So the ACQUIRE barrier in
> > > > __atomic_add_fetch() becomes unnecessary.
> > > >
> > > > By changing it to RELAXED ordering, this patch got 7.6%
> > > > performance improvement on N1 (making it generate A72 alike
> instructions).
> > > >
> > > > Could you please also try it on your testbed, Alex?
> > >
> > > Situation got better with this modification, here are the results:
> > >  - no patch:             3.0 Mpps CPU cycles/packet=51.52
> > >  - original patch:    2.1 Mpps CPU cycles/packet=71.05
> > >  - modified patch: 2.9 Mpps CPU cycles/packet=52.79 Also, I found
> > > that the degradation is there only in case I enable bursts stats.
> >
> >
> > Great! So this patch will not hurt the normal datapath performance.
> >
> >
> > > Could you please turn on the following config options and see if you
> > > can reproduce this as well?
> > > CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES=y
> > > CONFIG_RTE_TEST_PMD_RECORD_BURST_STATS=y
> >
> > Thanks, Alex. Some updates.
> >
> > Slightly (about 1%) throughput degradation was detected after we
> > enabled these two config options on N1 SoC.
> >
> > If we look insight the perf stats results, with this patch, both
> > mlx5_rx_burst and mlx5_tx_burst consume fewer CPU cycles than the
> original code.
> > However, __memcpy_generic takes more cycles. I think that might be the
> > reason for CPU cycles per packet increment after applying this patch.
> >
> > Original code:
> > 98.07%--pkt_burst_io_forward
> >         |
> >         |--44.53%--__memcpy_generic
> >         |
> >         |--35.85%--mlx5_rx_burst_mprq
> >         |
> >         |--15.94%--mlx5_tx_burst_none_empw
> >         |          |
> >         |          |--7.32%--mlx5_tx_handle_completion.isra.0
> >         |          |
> >         |           --0.50%--__memcpy_generic
> >         |
> >          --1.14%--memcpy@plt
> >
> > Use C11 with RELAXED ordering:
> > 99.36%--pkt_burst_io_forward
> >         |
> >         |--47.40%--__memcpy_generic
> >         |
> >         |--34.62%--mlx5_rx_burst_mprq
> >         |
> >         |--15.55%--mlx5_tx_burst_none_empw
> >         |          |
> >         |           --7.08%--mlx5_tx_handle_completion.isra.0
> >         |
> >          --1.17%--memcpy@plt
> >
> > BTW, all the atomic operations in this patch are not the hotspot.
> 
> Phil, we are seeing much worse degradation on our ARM platform
> unfortunately.
> I don't think that discrepancy in memcpy can explain this behavior.
> Your patch is not touching this area of code. Let me collect some perf stat on
> our side.
Are you testing the patch as-is, or have you made the changes that were discussed in the thread?

> 
> >
> > >
> > > > >
> > > > > Can you replace just the above line with the following lines and test it?
> > > > >
> > > > > __atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);
> > > > > __atomic_thread_fence(__ATOMIC_ACQ_REL);
> > > > >
> > > > > This should make the generated code same as before this patch.
> > > > > Let me know if you would prefer us to re-spin the patch instead
> > > > > (for
> > testing).
> > > > >
> > > > > > > > +MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
> > > > > > > > +    __ATOMIC_RELAXED) <= strd_n + 1);
> > > > > > > >  buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
> > > > > > > >  /*
> > > > > > > >   * MLX5 device doesn't use iova but it is necessary in a
> > > > > > > diff
> > > > > > > > --git a/drivers/net/mlx5/mlx5_rxtx.h
> > > > > > > > b/drivers/net/mlx5/mlx5_rxtx.h index 26621ff..0fc15f3
> > > > > > > > 100644
> > > > > > > > --- a/drivers/net/mlx5/mlx5_rxtx.h
> > > > > > > > +++ b/drivers/net/mlx5/mlx5_rxtx.h
> > <snip>
> > > > >
  
Alexander Kozyrev Sept. 2, 2020, 9:52 p.m. UTC | #14
> <snip>
> 
> > >
> > > > > > > > > @@ -1790,9 +1792,9 @@ mlx5_rx_burst_mprq(void
> *dpdk_rxq,
> > > > > struct
> > > > > > > > > rte_mbuf **pkts, uint16_t pkts_n)  void *buf_addr;
> > > > > > > > >
> > > > > > > > >  /* Increment the refcnt of the whole chunk. */
> > > > > > > > > -rte_atomic16_add_return(&buf->refcnt, 1);
> > > > > > rte_atomic16_add_return includes a full barrier along with
> > > > > > atomic
> > > > > operation.
> > > > > > But is full barrier required here? For ex:
> > > > > > __atomic_add_fetch(&buf->refcnt, 1,
> > > > > > __ATOMIC_RELAXED) will offer atomicity, but no barrier. Would
> > > > > > that be enough?
> > > > > >
> > > > > > > > > -MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf-
> > > > > > > > > >refcnt) <=
> > > > > > > > > -    strd_n + 1);
> > > > > > > > > +__atomic_add_fetch(&buf->refcnt, 1,
> > > > > > > > > __ATOMIC_ACQUIRE);
> > > > >
> > > > > The atomic load in MLX5_ASSERT() accesses the same memory space
> > > > > as the previous __atomic_add_fetch() does.
> > > > > They will access this memory space in the program order when we
> > > > > enabled MLX5_PMD_DEBUG. So the ACQUIRE barrier in
> > > > > __atomic_add_fetch() becomes unnecessary.
> > > > >
> > > > > By changing it to RELAXED ordering, this patch got 7.6%
> > > > > performance improvement on N1 (making it generate A72 alike
> > instructions).
> > > > >
> > > > > Could you please also try it on your testbed, Alex?
> > > >
> > > > Situation got better with this modification; here are the results:
> > > >  - no patch:             3.0 Mpps CPU cycles/packet=51.52
> > > >  - original patch:    2.1 Mpps CPU cycles/packet=71.05
> > > >  - modified patch: 2.9 Mpps CPU cycles/packet=52.79
> > > > Also, I found that the degradation is there only when I enable burst stats.
> > >
> > >
> > > Great! So this patch will not hurt the normal datapath performance.
> > >
> > >
> > > > Could you please turn on the following config options and see if
> > > > you can reproduce this as well?
> > > > CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES=y
> > > > CONFIG_RTE_TEST_PMD_RECORD_BURST_STATS=y
> > >
> > > Thanks, Alex. Some updates.
> > >
> > > A slight (about 1%) throughput degradation was detected after we
> > > enabled these two config options on the N1 SoC.
> > >
> > > If we look inside the perf stat results, with this patch both
> > > mlx5_rx_burst and mlx5_tx_burst consume fewer CPU cycles than the
> > > original code.
> > > However, __memcpy_generic takes more cycles. I think that might be
> > > the reason for the CPU cycles per packet increment after applying this patch.
> > >
> > > Original code:
> > > 98.07%--pkt_burst_io_forward
> > >         |
> > >         |--44.53%--__memcpy_generic
> > >         |
> > >         |--35.85%--mlx5_rx_burst_mprq
> > >         |
> > >         |--15.94%--mlx5_tx_burst_none_empw
> > >         |          |
> > >         |          |--7.32%--mlx5_tx_handle_completion.isra.0
> > >         |          |
> > >         |           --0.50%--__memcpy_generic
> > >         |
> > >          --1.14%--memcpy@plt
> > >
> > > Use C11 with RELAXED ordering:
> > > 99.36%--pkt_burst_io_forward
> > >         |
> > >         |--47.40%--__memcpy_generic
> > >         |
> > >         |--34.62%--mlx5_rx_burst_mprq
> > >         |
> > >         |--15.55%--mlx5_tx_burst_none_empw
> > >         |          |
> > >         |           --7.08%--mlx5_tx_handle_completion.isra.0
> > >         |
> > >          --1.17%--memcpy@plt
> > >
> > > BTW, none of the atomic operations in this patch are the hotspot.
> >
> > Phil, we are seeing much worse degradation on our ARM platform
> > unfortunately.
> > I don't think that discrepancy in memcpy can explain this behavior.
> > Your patch is not touching this area of code. Let me collect some perf
> > stat on our side.
> Are you testing the patch as is or have you made the changes that were
> discussed in the thread?
> 

Yes, I made the changes you suggested. It really gets better with them.
Could you please respin the patch to make sure I got it right in my environment?

> >
> > >
> > > >
> > > > > >
> > > > > > Can you replace just the above line with the following lines and test it?
> > > > > >
> > > > > > __atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);
> > > > > > __atomic_thread_fence(__ATOMIC_ACQ_REL);
> > > > > >
> > > > > > This should make the generated code same as before this patch.
> > > > > > Let me know if you would prefer us to re-spin the patch
> > > > > > instead (for testing).
> > > > > >
> > > > > > > > > +MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
> > > > > > > > > +    __ATOMIC_RELAXED) <= strd_n + 1);
> > > > > > > > >  buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
> > > > > > > > >  /*
> > > > > > > > >   * MLX5 device doesn't use iova but it is necessary in a
> > > > > > > > > diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> > > > > > > > > index 26621ff..0fc15f3 100644
> > > > > > > > > --- a/drivers/net/mlx5/mlx5_rxtx.h
> > > > > > > > > +++ b/drivers/net/mlx5/mlx5_rxtx.h
> > > <snip>
> > > > > >
  
Phil Yang Sept. 3, 2020, 2:55 a.m. UTC | #15
<snip>
> >
> > > >
> > > > > > > > > > @@ -1790,9 +1792,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
> > > > > > > > > >  void *buf_addr;
> > > > > > > > > >
> > > > > > > > > >  /* Increment the refcnt of the whole chunk. */
> > > > > > > > > > -rte_atomic16_add_return(&buf->refcnt, 1);
> > > > > > > rte_atomic16_add_return includes a full barrier along with the
> > > > > > > atomic operation. But is a full barrier required here? For ex:
> > > > > > > __atomic_add_fetch(&buf->refcnt, 1,
> > > > > > > __ATOMIC_RELAXED) will offer atomicity, but no barrier. Would
> > > > > > > that be enough?
> > > > > > >
> > > > > > > > > > -MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
> > > > > > > > > > -    strd_n + 1);
> > > > > > > > > > +__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_ACQUIRE);
> > > > > >
> > > > > > The atomic load in MLX5_ASSERT() accesses the same memory
> > > > > > location as the previous __atomic_add_fetch() does.
> > > > > > They access this location in program order when
> > > > > > MLX5_PMD_DEBUG is enabled, so the ACQUIRE barrier in
> > > > > > __atomic_add_fetch() becomes unnecessary.
> > > > > >
> > > > > > By changing it to RELAXED ordering, this patch got a 7.6%
> > > > > > performance improvement on N1 (making it generate A72-like
> > > > > > instructions).
> > > > > >
> > > > > > Could you please also try it on your testbed, Alex?
> > > > >
> > > > > Situation got better with this modification; here are the results:
> > > > >  - no patch:             3.0 Mpps CPU cycles/packet=51.52
> > > > >  - original patch:    2.1 Mpps CPU cycles/packet=71.05
> > > > >  - modified patch: 2.9 Mpps CPU cycles/packet=52.79
> > > > > Also, I found that the degradation is there only when I enable burst stats.
> > > >
> > > >
> > > > Great! So this patch will not hurt the normal datapath performance.
> > > >
> > > >
> > > > > Could you please turn on the following config options and see if
> > > > > you can reproduce this as well?
> > > > > CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES=y
> > > > > CONFIG_RTE_TEST_PMD_RECORD_BURST_STATS=y
> > > >
> > > > Thanks, Alex. Some updates.
> > > >
> > > > A slight (about 1%) throughput degradation was detected after we
> > > > enabled these two config options on the N1 SoC.
> > > >
> > > > If we look inside the perf stat results, with this patch both
> > > > mlx5_rx_burst and mlx5_tx_burst consume fewer CPU cycles than the
> > > > original code.
> > > > However, __memcpy_generic takes more cycles. I think that might be
> > > > the reason for the CPU cycles per packet increment after applying this
> > > > patch.
> > > >
> > > > Original code:
> > > > 98.07%--pkt_burst_io_forward
> > > >         |
> > > >         |--44.53%--__memcpy_generic
> > > >         |
> > > >         |--35.85%--mlx5_rx_burst_mprq
> > > >         |
> > > >         |--15.94%--mlx5_tx_burst_none_empw
> > > >         |          |
> > > >         |          |--7.32%--mlx5_tx_handle_completion.isra.0
> > > >         |          |
> > > >         |           --0.50%--__memcpy_generic
> > > >         |
> > > >          --1.14%--memcpy@plt
> > > >
> > > > Use C11 with RELAXED ordering:
> > > > 99.36%--pkt_burst_io_forward
> > > >         |
> > > >         |--47.40%--__memcpy_generic
> > > >         |
> > > >         |--34.62%--mlx5_rx_burst_mprq
> > > >         |
> > > >         |--15.55%--mlx5_tx_burst_none_empw
> > > >         |          |
> > > >         |           --7.08%--mlx5_tx_handle_completion.isra.0
> > > >         |
> > > >          --1.17%--memcpy@plt
> > > >
> > > > BTW, none of the atomic operations in this patch are the hotspot.
> > >
> > > Phil, we are seeing much worse degradation on our ARM platform
> > > unfortunately.
> > > I don't think that discrepancy in memcpy can explain this behavior.
> > > Your patch is not touching this area of code. Let me collect some perf
> > > stat on our side.
> > Are you testing the patch as is or have you made the changes that were
> > discussed in the thread?
> >
> 
> Yes, I made the changes you suggested. It really gets better with them.
> Could you please respin the patch to make sure I got it right in my
> environment?

Thanks, Alex.
Please check the new version here.
http://patchwork.dpdk.org/patch/76335/
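
The gist of the change there is dropping the ACQUIRE ordering on the
increment, since RELAXED is sufficient (a minimal sketch; please see the
link above for the exact hunk):

	/* RELAXED is enough here: the assert below reads the same atomic
	 * object, and accesses to the same memory location are always
	 * ordered by coherence, whatever memory order is requested.
	 */
	__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);
	MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
		    __ATOMIC_RELAXED) <= strd_n + 1);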


> 
> > >
> > > >
> > > > >
> > > > > > >
> > > > > > > Can you replace just the above line with the following lines and test it?
> > > > > > >
> > > > > > > __atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);
> > > > > > > __atomic_thread_fence(__ATOMIC_ACQ_REL);
> > > > > > >
> > > > > > > This should make the generated code same as before this patch.
> > > > > > > Let me know if you would prefer us to re-spin the patch
> > > > > > > instead (for testing).
> > > > > > >
> > > > > > > > > > +MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
> > > > > > > > > > +    __ATOMIC_RELAXED) <= strd_n + 1);
> > > > > > > > > >  buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
> > > > > > > > > >  /*
> > > > > > > > > >   * MLX5 device doesn't use iova but it is necessary in a
> > > > > > > > > > diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> > > > > > > > > > index 26621ff..0fc15f3 100644
> > > > > > > > > > --- a/drivers/net/mlx5/mlx5_rxtx.h
> > > > > > > > > > +++ b/drivers/net/mlx5/mlx5_rxtx.h
> > > > <snip>
> > > > > > >
  
Alexander Kozyrev Sept. 9, 2020, 1:29 p.m. UTC | #16
> <snip>
> > >
> > > > >
> > > > > > > > > > > @@ -1790,9 +1792,9 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
> > > > > > > > > > >  void *buf_addr;
> > > > > > > > > > >
> > > > > > > > > > >  /* Increment the refcnt of the whole chunk. */
> > > > > > > > > > > -rte_atomic16_add_return(&buf->refcnt, 1);
> > > > > > > > rte_atomic16_add_return includes a full barrier along with the
> > > > > > > > atomic operation. But is a full barrier required here? For ex:
> > > > > > > > __atomic_add_fetch(&buf->refcnt, 1,
> > > > > > > > __ATOMIC_RELAXED) will offer atomicity, but no barrier.
> > > > > > > > Would that be enough?
> > > > > > > >
> > > > > > > > > > > -MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
> > > > > > > > > > > -    strd_n + 1);
> > > > > > > > > > > +__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_ACQUIRE);
> > > > > > >
> > > > > > > The atomic load in MLX5_ASSERT() accesses the same memory
> > > > > > > location as the previous __atomic_add_fetch() does.
> > > > > > > They access this location in program order when
> > > > > > > MLX5_PMD_DEBUG is enabled, so the ACQUIRE barrier in
> > > > > > > __atomic_add_fetch() becomes unnecessary.
> > > > > > >
> > > > > > > By changing it to RELAXED ordering, this patch got a 7.6%
> > > > > > > performance improvement on N1 (making it generate A72-like
> > > > > > > instructions).
> > > > > > >
> > > > > > > Could you please also try it on your testbed, Alex?
> > > > > >
> > > > > > Situation got better with this modification; here are the results:
> > > > > >  - no patch:             3.0 Mpps CPU cycles/packet=51.52
> > > > > >  - original patch:    2.1 Mpps CPU cycles/packet=71.05
> > > > > >  - modified patch: 2.9 Mpps CPU cycles/packet=52.79
> > > > > > Also, I found that the degradation is there only when I enable burst stats.
> > > > >
> > > > >
> > > > > Great! So this patch will not hurt the normal datapath performance.
> > > > >
> > > > >
> > > > > > Could you please turn on the following config options and see
> > > > > > if you can reproduce this as well?
> > > > > > CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES=y
> > > > > > CONFIG_RTE_TEST_PMD_RECORD_BURST_STATS=y
> > > > >
> > > > > Thanks, Alex. Some updates.
> > > > >
> > > > > A slight (about 1%) throughput degradation was detected after we
> > > > > enabled these two config options on the N1 SoC.
> > > > >
> > > > > If we look inside the perf stat results, with this patch both
> > > > > mlx5_rx_burst and mlx5_tx_burst consume fewer CPU cycles than
> > > > > the original code.
> > > > > However, __memcpy_generic takes more cycles. I think that might
> > > > > be the reason for the CPU cycles per packet increment after applying
> > > > > this patch.
> > > > >
> > > > > Original code:
> > > > > 98.07%--pkt_burst_io_forward
> > > > >         |
> > > > >         |--44.53%--__memcpy_generic
> > > > >         |
> > > > >         |--35.85%--mlx5_rx_burst_mprq
> > > > >         |
> > > > >         |--15.94%--mlx5_tx_burst_none_empw
> > > > >         |          |
> > > > >         |          |--7.32%--mlx5_tx_handle_completion.isra.0
> > > > >         |          |
> > > > >         |           --0.50%--__memcpy_generic
> > > > >         |
> > > > >          --1.14%--memcpy@plt
> > > > >
> > > > > Use C11 with RELAXED ordering:
> > > > > 99.36%--pkt_burst_io_forward
> > > > >         |
> > > > >         |--47.40%--__memcpy_generic
> > > > >         |
> > > > >         |--34.62%--mlx5_rx_burst_mprq
> > > > >         |
> > > > >         |--15.55%--mlx5_tx_burst_none_empw
> > > > >         |          |
> > > > >         |           --7.08%--mlx5_tx_handle_completion.isra.0
> > > > >         |
> > > > >          --1.17%--memcpy@plt
> > > > >
> > > > > BTW, none of the atomic operations in this patch are the hotspot.
> > > >
> > > > Phil, we are seeing much worse degradation on our ARM platform
> > > > unfortunately.
> > > > I don't think that discrepancy in memcpy can explain this behavior.
> > > > Your patch is not touching this area of code. Let me collect some
> > > > perf stat on our side.
> > > Are you testing the patch as is or have you made the changes that
> > > were discussed in the thread?
> > >
> >
> > Yes, I made the changes you suggested. It really gets better with them.
> > Could you please respin the patch to make sure I got it right in my
> > environment?
> 
> Thanks, Alex.
> Please check the new version here.
> http://patchwork.dpdk.org/patch/76335/

This patch is definitely better; I do not see a degradation anymore, thank you.

Acked-by: Alexander Kozyrev <akozyrev@nvidia.com>

> 
> >
> > > >
> > > > >
> > > > > >
> > > > > > > >
> > > > > > > > Can you replace just the above line with the following
> > > > > > > > lines and test it?
> > > > > > > >
> > > > > > > > __atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_RELAXED);
> > > > > > > > __atomic_thread_fence(__ATOMIC_ACQ_REL);
> > > > > > > >
> > > > > > > > This should make the generated code same as before this patch.
> > > > > > > > Let me know if you would prefer us to re-spin the patch
> > > > > > > > instead (for testing).
> > > > > > > >
> > > > > > > > > > > +MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
> > > > > > > > > > > +    __ATOMIC_RELAXED) <= strd_n + 1);
> > > > > > > > > > >  buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
> > > > > > > > > > >  /*
> > > > > > > > > > >   * MLX5 device doesn't use iova but it is necessary in a
> > > > > > > > > > > diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> > > > > > > > > > > index 26621ff..0fc15f3 100644
> > > > > > > > > > > --- a/drivers/net/mlx5/mlx5_rxtx.h
> > > > > > > > > > > +++ b/drivers/net/mlx5/mlx5_rxtx.h
> > > > > <snip>
> > > > > > > >
  
Honnappa Nagarahalli Sept. 10, 2020, 1:34 a.m. UTC | #17
<snip>

> > > > > Phil, we are seeing much worse degradation on our ARM platform
> > > > > unfortunately.
> > > > > I don't think that discrepancy in memcpy can explain this behavior.
> > > > > Your patch is not touching this area of code. Let me collect
> > > > > some perf stat on our side.
> > > > Are you testing the patch as is or have you made the changes that
> > > > were discussed in the thread?
> > > >
> > >
> > > Yes, I made the changes you suggested. It really gets better with them.
> > > Could you please respin the patch to make sure I got it right in my
> > > environment?
> >
> > Thanks, Alex.
> > Please check the new version here.
> >
> > http://patchwork.dpdk.org/patch/76335/
> 
> This patch is definitely better, do not see a degradation anymore, thank you.
> 
> Acked-by: Alexander Kozyrev <akozyrev@nvidia.com>

Can you please add your Acked-by tag in v4 (Phil provided the link above)?

<snip>
  

Patch

diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index dda0073..7f487f1 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1545,7 +1545,7 @@  mlx5_mprq_buf_init(struct rte_mempool *mp, void *opaque_arg,
 
 	memset(_m, 0, sizeof(*buf));
 	buf->mp = mp;
-	rte_atomic16_set(&buf->refcnt, 1);
+	__atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
 	for (j = 0; j != strd_n; ++j) {
 		shinfo = &buf->shinfos[j];
 		shinfo->free_cb = mlx5_mprq_buf_free_cb;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index e4106bf..f0eda88 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1595,10 +1595,11 @@  mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque)
 {
 	struct mlx5_mprq_buf *buf = opaque;
 
-	if (rte_atomic16_read(&buf->refcnt) == 1) {
+	if (__atomic_load_n(&buf->refcnt, __ATOMIC_RELAXED) == 1) {
 		rte_mempool_put(buf->mp, buf);
-	} else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) {
-		rte_atomic16_set(&buf->refcnt, 1);
+	} else if (unlikely(__atomic_sub_fetch(&buf->refcnt, 1,
+					       __ATOMIC_RELAXED) == 0)) {
+		__atomic_store_n(&buf->refcnt, 1, __ATOMIC_RELAXED);
 		rte_mempool_put(buf->mp, buf);
 	}
 }
@@ -1678,7 +1679,8 @@  mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 
 		if (consumed_strd == strd_n) {
 			/* Replace WQE only if the buffer is still in use. */
-			if (rte_atomic16_read(&buf->refcnt) > 1) {
+			if (__atomic_load_n(&buf->refcnt,
+					    __ATOMIC_RELAXED) > 1) {
 				mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n);
 				/* Release the old buffer. */
 				mlx5_mprq_buf_free(buf);
@@ -1790,9 +1792,9 @@  mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			void *buf_addr;
 
 			/* Increment the refcnt of the whole chunk. */
-			rte_atomic16_add_return(&buf->refcnt, 1);
-			MLX5_ASSERT((uint16_t)rte_atomic16_read(&buf->refcnt) <=
-				    strd_n + 1);
+			__atomic_add_fetch(&buf->refcnt, 1, __ATOMIC_ACQUIRE);
+			MLX5_ASSERT(__atomic_load_n(&buf->refcnt,
+				    __ATOMIC_RELAXED) <= strd_n + 1);
 			buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
 			/*
 			 * MLX5 device doesn't use iova but it is necessary in a
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 26621ff..0fc15f3 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -78,7 +78,7 @@  struct rxq_zip {
 /* Multi-Packet RQ buffer header. */
 struct mlx5_mprq_buf {
 	struct rte_mempool *mp;
-	rte_atomic16_t refcnt; /* Atomically accessed refcnt. */
+	uint16_t refcnt; /* Atomically accessed refcnt. */
 	uint8_t pad[RTE_PKTMBUF_HEADROOM]; /* Headroom for the first packet. */
 	struct rte_mbuf_ext_shared_info shinfos[];
 	/*
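
For readers skimming the diff, the refcnt lifecycle above reduces to
roughly the following standalone sketch (hypothetical names such as
struct chunk, chunk_get()/chunk_put() and recycle(); the driver itself
uses buf->refcnt and rte_mempool_put()):

	#include <stdint.h>

	struct chunk {
		uint16_t refcnt; /* Atomically accessed refcnt. */
	};

	/* Placeholder for returning the chunk to its pool. */
	static void recycle(struct chunk *c) { (void)c; }

	static void chunk_get(struct chunk *c)
	{
		/* Plain atomic increment; the GCC __atomic builtins operate
		 * on an ordinary uint16_t, so no rte_atomic16_t wrapper is
		 * needed.
		 */
		__atomic_add_fetch(&c->refcnt, 1, __ATOMIC_RELAXED);
	}

	static void chunk_put(struct chunk *c)
	{
		if (__atomic_load_n(&c->refcnt, __ATOMIC_RELAXED) == 1) {
			/* Sole owner: recycle the chunk directly. */
			recycle(c);
		} else if (__atomic_sub_fetch(&c->refcnt, 1,
					      __ATOMIC_RELAXED) == 0) {
			/* Last reference dropped: reset the count for the
			 * next user and recycle.
			 */
			__atomic_store_n(&c->refcnt, 1, __ATOMIC_RELAXED);
			recycle(c);
		}
	}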