[v1] test/ring: ring perf test case enhancement

Message ID 1545305634-81288-1-git-send-email-gavin.hu@arm.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series: [v1] test/ring: ring perf test case enhancement

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK
ci/mellanox-Performance-Testing success Performance Testing PASS
ci/intel-Performance-Testing success Performance Testing PASS

Commit Message

Gavin Hu Dec. 20, 2018, 11:33 a.m. UTC
  From: Joyce Kong <joyce.kong@arm.com>

Run the ring perf test on all available cores to really verify MPMC operations.
The old way of running on a pair of cores is not enough for MPMC rings. We
used this test case for ring optimization and it was really helpful for
measuring the ring performance in a multi-core environment.

Suggested-by: Gavin Hu <gavin.hu@arm.com>
Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Ruifeng Wang <Ruifeng.Wang@arm.com>
Reviewed-by: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
Reviewed-by: Dharmik Thakkar <Dharmik.Thakkar@arm.com>
Reviewed-by: Ola Liljedahl <Ola.Liljedahl@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
 test/test/test_ring_perf.c | 82 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 80 insertions(+), 2 deletions(-)
  

Comments

Thomas Monjalon Dec. 20, 2018, 11:40 a.m. UTC | #1
+Cc Olivier, maintainer of the ring library.

20/12/2018 12:33, Gavin Hu:
> From: Joyce Kong <joyce.kong@arm.com>
> 
> Run the ring perf test on all available cores to really verify MPMC operations.
> The old way of running on a pair of cores is not enough for MPMC rings. We
> used this test case for ring optimization and it was really helpful for
> measuring the ring performance in a multi-core environment.
> 
> Suggested-by: Gavin Hu <gavin.hu@arm.com>
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Ruifeng Wang <Ruifeng.Wang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Reviewed-by: Dharmik Thakkar <Dharmik.Thakkar@arm.com>
> Reviewed-by: Ola Liljedahl <Ola.Liljedahl@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> ---
>  test/test/test_ring_perf.c | 82 ++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 80 insertions(+), 2 deletions(-)
> 
> diff --git a/test/test/test_ring_perf.c b/test/test/test_ring_perf.c
> index ebb3939..819d119 100644
> --- a/test/test/test_ring_perf.c
> +++ b/test/test/test_ring_perf.c
> @@ -20,12 +20,17 @@
>   *  * Empty ring dequeue
>   *  * Enqueue/dequeue of bursts in 1 threads
>   *  * Enqueue/dequeue of bursts in 2 threads
> + *  * Enqueue/dequeue of bursts in all available threads
>   */
>  
>  #define RING_NAME "RING_PERF"
>  #define RING_SIZE 4096
>  #define MAX_BURST 32
>  
> +#ifndef ARRAY_SIZE
> +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
> +#endif
> +
>  /*
>   * the sizes to enqueue and dequeue in testing
>   * (marked volatile so they won't be seen as compile-time constants)
> @@ -248,9 +253,78 @@ run_on_core_pair(struct lcore_pair *cores, struct rte_ring *r,
>  	}
>  }
>  
> +static rte_atomic32_t synchro;
> +static uint64_t queue_count[RTE_MAX_LCORE] = {0};
> +
> +#define TIME_MS 100
> +
> +static int
> +load_loop_fn(void *p)
> +{
> +	uint64_t time_diff = 0;
> +	uint64_t begin = 0;
> +	uint64_t hz = rte_get_timer_hz();
> +	uint64_t lcount = 0;
> +	const unsigned int lcore = rte_lcore_id();
> +	struct thread_params *params = p;
> +	void *burst[MAX_BURST] = {0};
> +
> +	/* wait synchro for slaves */
> +	if (lcore != rte_get_master_lcore())
> +		while (rte_atomic32_read(&synchro) == 0)
> +			rte_pause();
> +
> +	begin = rte_get_timer_cycles();
> +	while (time_diff < hz * TIME_MS / 1000) {
> +		rte_ring_mp_enqueue_bulk(params->r, burst, params->size, NULL);
> +		rte_ring_mc_dequeue_bulk(params->r, burst, params->size, NULL);
> +		lcount++;
> +		time_diff = rte_get_timer_cycles() - begin;
> +	}
> +	queue_count[lcore] = lcount;
> +	return 0;
> +}
> +
> +static int
> +run_on_all_cores(struct rte_ring *r)
> +{
> +	uint64_t total = 0;
> +	struct thread_params param = {0};
> +	unsigned int i, c;
> +	for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
> +		printf("\nBulk enq/dequeue count on size %u\n", bulk_sizes[i]);
> +		param.size = bulk_sizes[i];
> +		param.r = r;
> +
> +		/* clear synchro and start slaves */
> +		rte_atomic32_set(&synchro, 0);
> +		if (rte_eal_mp_remote_launch(load_loop_fn,
> +				&param, SKIP_MASTER) < 0)
> +			return -1;
> +
> +		/* start synchro and launch test on master */
> +		rte_atomic32_set(&synchro, 1);
> +		load_loop_fn(&param);
> +
> +		rte_eal_mp_wait_lcore();
> +
> +		RTE_LCORE_FOREACH(c) {
> +			printf("Core [%u] count = %"PRIu64"\n",
> +					c, queue_count[c]);
> +			total += queue_count[c];
> +		}
> +
> +		printf("Total count (size: %u): %"PRIu64"\n", bulk_sizes[i],
> +						total);
> +	}
> +
> +	return 0;
> +}
> +
>  /*
> - * Test function that determines how long an enqueue + dequeue of a single item
> - * takes on a single lcore. Result is for comparison with the bulk enq+deq.
> + * Test function that determines how long an enqueue + dequeue of a single
> + * item takes on a single lcore. Result is for comparison with the bulk
> + * enq+deq.
>   */
>  static void
>  test_single_enqueue_dequeue(struct rte_ring *r)
> @@ -394,6 +468,10 @@ test_ring_perf(void)
>  		printf("\n### Testing using two NUMA nodes ###\n");
>  		run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
>  	}
> +
> +	printf("\n### Testing using all slave nodes ###\n");
> +	run_on_all_cores(r);
> +
>  	rte_ring_free(r);
>  	return 0;
>  }
>
  
Honnappa Nagarahalli Dec. 20, 2018, 9:03 p.m. UTC | #2
> 
> +Cc Olivier, maintainer of the ring library.
> 
> 20/12/2018 12:33, Gavin Hu:
> > From: Joyce Kong <joyce.kong@arm.com>
> >
> > Run the ring perf test on all available cores to really verify MPMC operations.
> > The old way of running on a pair of cores is not enough for MPMC
> > rings. We used this test case for ring optimization and it was really
> > helpful for measuring the ring performance in a multi-core environment.
IMO, the last sentence does not convey quantifiable information. I suggest taking that out or replacing it with something that explains how it is useful.

> >
> > Suggested-by: Gavin Hu <gavin.hu@arm.com>
> > Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> > Reviewed-by: Ruifeng Wang <Ruifeng.Wang@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> > Reviewed-by: Dharmik Thakkar <Dharmik.Thakkar@arm.com>
> > Reviewed-by: Ola Liljedahl <Ola.Liljedahl@arm.com>
> > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > ---
> >  test/test/test_ring_perf.c | 82 ++++++++++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 80 insertions(+), 2 deletions(-)
> >
> > diff --git a/test/test/test_ring_perf.c b/test/test/test_ring_perf.c
> > index ebb3939..819d119 100644
> > --- a/test/test/test_ring_perf.c
> > +++ b/test/test/test_ring_perf.c
> > @@ -20,12 +20,17 @@
> >   *  * Empty ring dequeue
> >   *  * Enqueue/dequeue of bursts in 1 threads
> >   *  * Enqueue/dequeue of bursts in 2 threads
> > + *  * Enqueue/dequeue of bursts in all available threads
> >   */
> >
> >  #define RING_NAME "RING_PERF"
> >  #define RING_SIZE 4096
> >  #define MAX_BURST 32
> >
> > +#ifndef ARRAY_SIZE
> > +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
> > +#endif
> > +
> >  /*
> >   * the sizes to enqueue and dequeue in testing
> >   * (marked volatile so they won't be seen as compile-time constants)
> > @@ -248,9 +253,78 @@ run_on_core_pair(struct lcore_pair *cores, struct rte_ring *r,
> >  	}
> >  }
> >
> > +static rte_atomic32_t synchro;
> > +static uint64_t queue_count[RTE_MAX_LCORE] = {0};
> > +
> > +#define TIME_MS 100
> > +
> > +static int
> > +load_loop_fn(void *p)
> > +{
> > +	uint64_t time_diff = 0;
> > +	uint64_t begin = 0;
> > +	uint64_t hz = rte_get_timer_hz();
> > +	uint64_t lcount = 0;
> > +	const unsigned int lcore = rte_lcore_id();
> > +	struct thread_params *params = p;
> > +	void *burst[MAX_BURST] = {0};
> > +
> > +	/* wait synchro for slaves */
> > +	if (lcore != rte_get_master_lcore())
> > +		while (rte_atomic32_read(&synchro) == 0)
> > +			rte_pause();
> > +
> > +	begin = rte_get_timer_cycles();
> > +	while (time_diff < hz * TIME_MS / 1000) {
> > +		rte_ring_mp_enqueue_bulk(params->r, burst, params->size, NULL);
> > +		rte_ring_mc_dequeue_bulk(params->r, burst, params->size, NULL);
> > +		lcount++;
> > +		time_diff = rte_get_timer_cycles() - begin;
> > +	}
IMO, the method of measurement should be changed to reduce the overhead of reading the cycles.
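
For illustration, one common way to cut that overhead is to run a fixed
number of iterations and read the timer only twice, outside the loop. A
minimal sketch of that approach (the TEST_LOOPS constant and the reuse of
queue_count for the per-op average are hypothetical, not part of this patch):

	#define TEST_LOOPS 1000000	/* hypothetical fixed iteration count */

	static int
	load_loop_fn(void *p)
	{
		struct thread_params *params = p;
		void *burst[MAX_BURST] = {0};
		uint64_t i, begin, cycles;
		const unsigned int lcore = rte_lcore_id();

		/* wait for synchro as in the patch ... */

		/* the timer is read only twice, outside the measured loop */
		begin = rte_get_timer_cycles();
		for (i = 0; i < TEST_LOOPS; i++) {
			rte_ring_mp_enqueue_bulk(params->r, burst, params->size, NULL);
			rte_ring_mc_dequeue_bulk(params->r, burst, params->size, NULL);
		}
		cycles = rte_get_timer_cycles() - begin;

		/* store the average cycles per enqueue+dequeue pair instead
		 * of a raw loop count (illustrative only) */
		queue_count[lcore] = cycles / TEST_LOOPS;
		return 0;
	}

This trades the fixed wall-clock duration (TIME_MS) for a fixed amount of
work, which also makes the per-core results directly comparable.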

> > +	queue_count[lcore] = lcount;
> > +	return 0;
> > +}
> > +
> > +static int
> > +run_on_all_cores(struct rte_ring *r)
> > +{
> > +	uint64_t total = 0;
> > +	struct thread_params param = {0};
> > +	unsigned int i, c;
> > +	for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
> > +		printf("\nBulk enq/dequeue count on size %u\n", bulk_sizes[i]);
> > +		param.size = bulk_sizes[i];
> > +		param.r = r;
> > +
> > +		/* clear synchro and start slaves */
> > +		rte_atomic32_set(&synchro, 0);
> > +		if (rte_eal_mp_remote_launch(load_loop_fn,
> > +				&param, SKIP_MASTER) < 0)
> > +			return -1;
> > +
> > +		/* start synchro and launch test on master */
> > +		rte_atomic32_set(&synchro, 1);
> > +		load_loop_fn(&param);
> > +
> > +		rte_eal_mp_wait_lcore();
> > +
> > +		RTE_LCORE_FOREACH(c) {
> > +			printf("Core [%u] count = %"PRIu64"\n",
> > +					c, queue_count[c]);
> > +			total += queue_count[c];
> > +		}
> > +
> > +		printf("Total count (size: %u): %"PRIu64"\n", bulk_sizes[i],
> > +						total);
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> >  /*
> > - * Test function that determines how long an enqueue + dequeue of a single item
> > - * takes on a single lcore. Result is for comparison with the bulk enq+deq.
> > + * Test function that determines how long an enqueue + dequeue of a single
> > + * item takes on a single lcore. Result is for comparison with the bulk
> > + * enq+deq.
> >   */
> >  static void
> >  test_single_enqueue_dequeue(struct rte_ring *r)
> > @@ -394,6 +468,10 @@ test_ring_perf(void)
> >  		printf("\n### Testing using two NUMA nodes ###\n");
> >  		run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
> >  	}
> > +
> > +	printf("\n### Testing using all slave nodes ###\n");
> > +	run_on_all_cores(r);
> > +
> >  	rte_ring_free(r);
> >  	return 0;
> >  }
> >
> 
  
Jerin Jacob Kollanukkaran Dec. 27, 2018, 12:30 p.m. UTC | #3
On Thu, 2018-12-20 at 19:33 +0800, Gavin Hu wrote:
> From: Joyce Kong <joyce.kong@arm.com>
> 
> Run the ring perf test on all available cores to really verify MPMC operations.
> The old way of running on a pair of cores is not enough for MPMC rings. We
> used this test case for ring optimization and it was really helpful for
> measuring the ring performance in a multi-core environment.
> 
> Suggested-by: Gavin Hu <gavin.hu@arm.com>
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Ruifeng Wang <Ruifeng.Wang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Reviewed-by: Dharmik Thakkar <Dharmik.Thakkar@arm.com>
> Reviewed-by: Ola Liljedahl <Ola.Liljedahl@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> ---
>  test/test/test_ring_perf.c | 82 ++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 80 insertions(+), 2 deletions(-)
> 
> diff --git a/test/test/test_ring_perf.c b/test/test/test_ring_perf.c
> index ebb3939..819d119 100644
> --- a/test/test/test_ring_perf.c
> +++ b/test/test/test_ring_perf.c
> @@ -20,12 +20,17 @@
>   *  * Empty ring dequeue
>   *  * Enqueue/dequeue of bursts in 1 threads
>   *  * Enqueue/dequeue of bursts in 2 threads
> + *  * Enqueue/dequeue of bursts in all available threads
>   */
>  
>  #define RING_NAME "RING_PERF"
>  #define RING_SIZE 4096
>  #define MAX_BURST 32
>  
> +#ifndef ARRAY_SIZE
> +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

Use RTE_DIM instead.
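
For reference, RTE_DIM() is provided by <rte_common.h> and expands to the
same sizeof division, so the local ARRAY_SIZE macro and its #ifndef guard
can be dropped entirely. A sketch of the substitution:

	#include <rte_common.h>	/* RTE_DIM(a) == sizeof(a) / sizeof((a)[0]) */

	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
		param.size = bulk_sizes[i];
		param.r = r;
		/* ... */
	}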


> +#endif
> +
>  /*
>   * the sizes to enqueue and dequeue in testing
>   * (marked volatile so they won't be seen as compile-time constants)
> @@ -248,9 +253,78 @@ run_on_core_pair(struct lcore_pair *cores,
> struct rte_ring *r,
>  	}
>  }
>  
> +static rte_atomic32_t synchro;
> +static uint64_t queue_count[RTE_MAX_LCORE] = {0};

Do we need explicit {0} for this static global variable?
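
(It is not strictly needed: C guarantees that objects with static storage
duration are zero-initialized, so the shorter form is equivalent:)

	static uint64_t queue_count[RTE_MAX_LCORE];	/* zeroed per the C standard */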

> +
> +#define TIME_MS 100
> +
> +static int
> +load_loop_fn(void *p)
> +{
> +	uint64_t time_diff = 0;
> +	uint64_t begin = 0;
> +	uint64_t hz = rte_get_timer_hz();
> +	uint64_t lcount = 0;
> +	const unsigned int lcore = rte_lcore_id();
> +	struct thread_params *params = p;
> +	void *burst[MAX_BURST] = {0};
> +
> +	/* wait synchro for slaves */
> +	if (lcore != rte_get_master_lcore())
> +		while (rte_atomic32_read(&synchro) == 0)
> +			rte_pause();
> +
> +	begin = rte_get_timer_cycles();
> +	while (time_diff < hz * TIME_MS / 1000) {
> +		rte_ring_mp_enqueue_bulk(params->r, burst, params->size, NULL);
> +		rte_ring_mc_dequeue_bulk(params->r, burst, params->size, NULL);
> +		lcount++;
> +		time_diff = rte_get_timer_cycles() - begin;
> +	}
> +	queue_count[lcore] = lcount;
> +	return 0;
> +}
> +
> +static int
> +run_on_all_cores(struct rte_ring *r)
> +{
> +	uint64_t total = 0;
> +	struct thread_params param = {0};

Try to use memset() here. Some versions of clang complain about the
{0} initializer semantics.
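
A minimal sketch of that change (adding #include <string.h> if it is not
already pulled in by the existing headers):

	struct thread_params param;
	unsigned int i, c;

	memset(&param, 0, sizeof(param));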
  

Patch

diff --git a/test/test/test_ring_perf.c b/test/test/test_ring_perf.c
index ebb3939..819d119 100644
--- a/test/test/test_ring_perf.c
+++ b/test/test/test_ring_perf.c
@@ -20,12 +20,17 @@ 
  *  * Empty ring dequeue
  *  * Enqueue/dequeue of bursts in 1 threads
  *  * Enqueue/dequeue of bursts in 2 threads
+ *  * Enqueue/dequeue of bursts in all available threads
  */
 
 #define RING_NAME "RING_PERF"
 #define RING_SIZE 4096
 #define MAX_BURST 32
 
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
 /*
  * the sizes to enqueue and dequeue in testing
  * (marked volatile so they won't be seen as compile-time constants)
@@ -248,9 +253,78 @@  run_on_core_pair(struct lcore_pair *cores, struct rte_ring *r,
 	}
 }
 
+static rte_atomic32_t synchro;
+static uint64_t queue_count[RTE_MAX_LCORE] = {0};
+
+#define TIME_MS 100
+
+static int
+load_loop_fn(void *p)
+{
+	uint64_t time_diff = 0;
+	uint64_t begin = 0;
+	uint64_t hz = rte_get_timer_hz();
+	uint64_t lcount = 0;
+	const unsigned int lcore = rte_lcore_id();
+	struct thread_params *params = p;
+	void *burst[MAX_BURST] = {0};
+
+	/* wait synchro for slaves */
+	if (lcore != rte_get_master_lcore())
+		while (rte_atomic32_read(&synchro) == 0)
+			rte_pause();
+
+	begin = rte_get_timer_cycles();
+	while (time_diff < hz * TIME_MS / 1000) {
+		rte_ring_mp_enqueue_bulk(params->r, burst, params->size, NULL);
+		rte_ring_mc_dequeue_bulk(params->r, burst, params->size, NULL);
+		lcount++;
+		time_diff = rte_get_timer_cycles() - begin;
+	}
+	queue_count[lcore] = lcount;
+	return 0;
+}
+
+static int
+run_on_all_cores(struct rte_ring *r)
+{
+	uint64_t total = 0;
+	struct thread_params param = {0};
+	unsigned int i, c;
+	for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
+		printf("\nBulk enq/dequeue count on size %u\n", bulk_sizes[i]);
+		param.size = bulk_sizes[i];
+		param.r = r;
+
+		/* clear synchro and start slaves */
+		rte_atomic32_set(&synchro, 0);
+		if (rte_eal_mp_remote_launch(load_loop_fn,
+				&param, SKIP_MASTER) < 0)
+			return -1;
+
+		/* start synchro and launch test on master */
+		rte_atomic32_set(&synchro, 1);
+		load_loop_fn(&param);
+
+		rte_eal_mp_wait_lcore();
+
+		RTE_LCORE_FOREACH(c) {
+			printf("Core [%u] count = %"PRIu64"\n",
+					c, queue_count[c]);
+			total += queue_count[c];
+		}
+
+		printf("Total count (size: %u): %"PRIu64"\n", bulk_sizes[i],
+						total);
+	}
+
+	return 0;
+}
+
 /*
- * Test function that determines how long an enqueue + dequeue of a single item
- * takes on a single lcore. Result is for comparison with the bulk enq+deq.
+ * Test function that determines how long an enqueue + dequeue of a single
+ * item takes on a single lcore. Result is for comparison with the bulk
+ * enq+deq.
  */
 static void
 test_single_enqueue_dequeue(struct rte_ring *r)
@@ -394,6 +468,10 @@  test_ring_perf(void)
 		printf("\n### Testing using two NUMA nodes ###\n");
 		run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
 	}
+
+	printf("\n### Testing using all slave nodes ###\n");
+	run_on_all_cores(r);
+
 	rte_ring_free(r);
 	return 0;
 }