[v2,2/3] test/distributor: replace sync builtins with atomic builtins

Message ID 1553856998-25394-3-git-send-email-phil.yang@arm.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series example and test cases optimizations |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Phil Yang March 29, 2019, 10:56 a.m. UTC
  '__sync' built-in functions are deprecated, should use the '__atomic'
built-in instead. the sync built-in functions are full barriers, while
atomic built-in functions offer less restrictive one-way barriers,
which help performance.

Here is the example test result on TX2:
sudo ./arm64-armv8a-linuxapp-gcc/app/test -l 112-139 \
-n 4 --socket-mem=1024,1024 -- -i
RTE>>distributor_perf_autotest

*** distributor_perf_autotest without this patch ***
  

Comments

Honnappa Nagarahalli April 1, 2019, 4:24 p.m. UTC | #1
> 
> diff --git a/app/test/test_distributor.c b/app/test/test_distributor.c index
> 98919ec..ddab08d 100644
> --- a/app/test/test_distributor.c
> +++ b/app/test/test_distributor.c
> @@ -62,9 +62,14 @@ handle_work(void *arg)
>  	struct worker_params *wp = arg;
>  	struct rte_distributor *db = wp->dist;
>  	unsigned int count = 0, num = 0;
> -	unsigned int id = __sync_fetch_and_add(&worker_idx, 1);
>  	int i;
> 
> +#ifdef RTE_USE_C11_MEM_MODEL
> +	unsigned int id = __atomic_fetch_add(&worker_idx, 1,
> +__ATOMIC_RELAXED); #else
> +	unsigned int id = __sync_fetch_and_add(&worker_idx, 1); #endif
> +
I suggest we remove the conditional compilation and just keep the __atomic_xxx calls as this is test code. More over the distributor library does not have a C11 version of the library (assuming that using __atomic_xxx does not impact performance on other platforms negatively). This applies to other instances in this patch.

>  	for (i = 0; i < 8; i++)
>  		buf[i] = NULL;
>  	num = rte_distributor_get_pkt(db, id, buf, buf, num); @@ -270,7
> +275,12 @@ handle_work_with_free_mbufs(void *arg)
>  	unsigned int count = 0;
>  	unsigned int i;
>  	unsigned int num = 0;
> +
> +#ifdef RTE_USE_C11_MEM_MODEL
> +	unsigned int id = __atomic_fetch_add(&worker_idx, 1,
> +__ATOMIC_RELAXED); #else
>  	unsigned int id = __sync_fetch_and_add(&worker_idx, 1);
> +#endif
> 
>  	for (i = 0; i < 8; i++)
>  		buf[i] = NULL;
> @@ -343,7 +353,13 @@ handle_work_for_shutdown_test(void *arg)
>  	unsigned int total = 0;
>  	unsigned int i;
>  	unsigned int returned = 0;
> +
> +#ifdef RTE_USE_C11_MEM_MODEL
> +	const unsigned int id = __atomic_fetch_add(&worker_idx, 1,
> +			__ATOMIC_RELAXED);
> +#else
>  	const unsigned int id = __sync_fetch_and_add(&worker_idx, 1);
> +#endif
> 
>  	num = rte_distributor_get_pkt(d, id, buf, buf, num);
> 
> diff --git a/app/test/test_distributor_perf.c b/app/test/test_distributor_perf.c
> index edf1998..9367460 100644
> --- a/app/test/test_distributor_perf.c
> +++ b/app/test/test_distributor_perf.c
> @@ -111,9 +111,14 @@ handle_work(void *arg)
>  	unsigned int count = 0;
>  	unsigned int num = 0;
>  	int i;
> -	unsigned int id = __sync_fetch_and_add(&worker_idx, 1);
>  	struct rte_mbuf *buf[8] __rte_cache_aligned;
> 
> +#ifdef RTE_USE_C11_MEM_MODEL
> +	unsigned int id = __atomic_fetch_add(&worker_idx, 1,
> +__ATOMIC_RELAXED); #else
> +	unsigned int id = __sync_fetch_and_add(&worker_idx, 1); #endif
> +
>  	for (i = 0; i < 8; i++)
>  		buf[i] = NULL;
> 
> --
> 2.7.4
  
Phil Yang April 2, 2019, 3:43 a.m. UTC | #2
> -----Original Message-----
> From: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Sent: Tuesday, April 2, 2019 12:24 AM
> To: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>; dev@dpdk.org;
> thomas@monjalon.net
> Cc: david.hunt@intel.com; reshma.pattan@intel.com; Gavin Hu (Arm
> Technology China) <Gavin.Hu@arm.com>; Phil Yang (Arm Technology China)
> <Phil.Yang@arm.com>; nd <nd@arm.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>
> Subject: RE: [PATCH v2 2/3] test/distributor: replace sync builtins with atomic
> builtins
> 
> >
> > diff --git a/app/test/test_distributor.c b/app/test/test_distributor.c
> > index 98919ec..ddab08d 100644
> > --- a/app/test/test_distributor.c
> > +++ b/app/test/test_distributor.c
> > @@ -62,9 +62,14 @@ handle_work(void *arg)
> >  	struct worker_params *wp = arg;
> >  	struct rte_distributor *db = wp->dist;
> >  	unsigned int count = 0, num = 0;
> > -	unsigned int id = __sync_fetch_and_add(&worker_idx, 1);
> >  	int i;
> >
> > +#ifdef RTE_USE_C11_MEM_MODEL
> > +	unsigned int id = __atomic_fetch_add(&worker_idx, 1,
> > +__ATOMIC_RELAXED); #else
> > +	unsigned int id = __sync_fetch_and_add(&worker_idx, 1); #endif
> > +
> I suggest we remove the conditional compilation and just keep the
> __atomic_xxx calls as this is test code. More over the distributor library does
> not have a C11 version of the library (assuming that using __atomic_xxx does
> not impact performance on other platforms negatively). This applies to other
> instances in this patch.
Hi Honnappa,

Thanks for your comments.
Agree. I don't think __atomic_xxx will cause performance degradation on other platforms.  Remove the conditional compilation will make the code more clear.

Thanks,
Phil Yang

> 
> >  	for (i = 0; i < 8; i++)
> >  		buf[i] = NULL;
> >  	num = rte_distributor_get_pkt(db, id, buf, buf, num); @@ -270,7
> > +275,12 @@ handle_work_with_free_mbufs(void *arg)
> >  	unsigned int count = 0;
> >  	unsigned int i;
> >  	unsigned int num = 0;
> > +
> > +#ifdef RTE_USE_C11_MEM_MODEL
> > +	unsigned int id = __atomic_fetch_add(&worker_idx, 1,
> > +__ATOMIC_RELAXED); #else
> >  	unsigned int id = __sync_fetch_and_add(&worker_idx, 1);
> > +#endif
> >
> >  	for (i = 0; i < 8; i++)
> >  		buf[i] = NULL;
> > @@ -343,7 +353,13 @@ handle_work_for_shutdown_test(void *arg)
> >  	unsigned int total = 0;
> >  	unsigned int i;
> >  	unsigned int returned = 0;
> > +
> > +#ifdef RTE_USE_C11_MEM_MODEL
> > +	const unsigned int id = __atomic_fetch_add(&worker_idx, 1,
> > +			__ATOMIC_RELAXED);
> > +#else
> >  	const unsigned int id = __sync_fetch_and_add(&worker_idx, 1);
> > +#endif
> >
> >  	num = rte_distributor_get_pkt(d, id, buf, buf, num);
> >
> > diff --git a/app/test/test_distributor_perf.c
> > b/app/test/test_distributor_perf.c
> > index edf1998..9367460 100644
> > --- a/app/test/test_distributor_perf.c
> > +++ b/app/test/test_distributor_perf.c
> > @@ -111,9 +111,14 @@ handle_work(void *arg)
> >  	unsigned int count = 0;
> >  	unsigned int num = 0;
> >  	int i;
> > -	unsigned int id = __sync_fetch_and_add(&worker_idx, 1);
> >  	struct rte_mbuf *buf[8] __rte_cache_aligned;
> >
> > +#ifdef RTE_USE_C11_MEM_MODEL
> > +	unsigned int id = __atomic_fetch_add(&worker_idx, 1,
> > +__ATOMIC_RELAXED); #else
> > +	unsigned int id = __sync_fetch_and_add(&worker_idx, 1); #endif
> > +
> >  	for (i = 0; i < 8; i++)
> >  		buf[i] = NULL;
> >
> > --
> > 2.7.4
  

Patch

==== Cache line switch test ===
Time for 33554432 iterations = 1519202730 ticks
Ticks per iteration = 45

*** distributor_perf_autotest with this patch ***
==== Cache line switch test ===
Time for 33554432 iterations = 1251715496 ticks
Ticks per iteration = 37

Less ticks needed for the cache line switch test. It got 17% of
performance improvement.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Dharmik Thakkar <dharmik.thakkar@arm.com>
---
 app/test/test_distributor.c      | 18 +++++++++++++++++-
 app/test/test_distributor_perf.c |  7 ++++++-
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/app/test/test_distributor.c b/app/test/test_distributor.c
index 98919ec..ddab08d 100644
--- a/app/test/test_distributor.c
+++ b/app/test/test_distributor.c
@@ -62,9 +62,14 @@  handle_work(void *arg)
 	struct worker_params *wp = arg;
 	struct rte_distributor *db = wp->dist;
 	unsigned int count = 0, num = 0;
-	unsigned int id = __sync_fetch_and_add(&worker_idx, 1);
 	int i;
 
+#ifdef RTE_USE_C11_MEM_MODEL
+	unsigned int id = __atomic_fetch_add(&worker_idx, 1, __ATOMIC_RELAXED);
+#else
+	unsigned int id = __sync_fetch_and_add(&worker_idx, 1);
+#endif
+
 	for (i = 0; i < 8; i++)
 		buf[i] = NULL;
 	num = rte_distributor_get_pkt(db, id, buf, buf, num);
@@ -270,7 +275,12 @@  handle_work_with_free_mbufs(void *arg)
 	unsigned int count = 0;
 	unsigned int i;
 	unsigned int num = 0;
+
+#ifdef RTE_USE_C11_MEM_MODEL
+	unsigned int id = __atomic_fetch_add(&worker_idx, 1, __ATOMIC_RELAXED);
+#else
 	unsigned int id = __sync_fetch_and_add(&worker_idx, 1);
+#endif
 
 	for (i = 0; i < 8; i++)
 		buf[i] = NULL;
@@ -343,7 +353,13 @@  handle_work_for_shutdown_test(void *arg)
 	unsigned int total = 0;
 	unsigned int i;
 	unsigned int returned = 0;
+
+#ifdef RTE_USE_C11_MEM_MODEL
+	const unsigned int id = __atomic_fetch_add(&worker_idx, 1,
+			__ATOMIC_RELAXED);
+#else
 	const unsigned int id = __sync_fetch_and_add(&worker_idx, 1);
+#endif
 
 	num = rte_distributor_get_pkt(d, id, buf, buf, num);
 
diff --git a/app/test/test_distributor_perf.c b/app/test/test_distributor_perf.c
index edf1998..9367460 100644
--- a/app/test/test_distributor_perf.c
+++ b/app/test/test_distributor_perf.c
@@ -111,9 +111,14 @@  handle_work(void *arg)
 	unsigned int count = 0;
 	unsigned int num = 0;
 	int i;
-	unsigned int id = __sync_fetch_and_add(&worker_idx, 1);
 	struct rte_mbuf *buf[8] __rte_cache_aligned;
 
+#ifdef RTE_USE_C11_MEM_MODEL
+	unsigned int id = __atomic_fetch_add(&worker_idx, 1, __ATOMIC_RELAXED);
+#else
+	unsigned int id = __sync_fetch_and_add(&worker_idx, 1);
+#endif
+
 	for (i = 0; i < 8; i++)
 		buf[i] = NULL;