Message ID | 1545305634-81288-1-git-send-email-gavin.hu@arm.com (mailing list archive)
---|---
State | Superseded, archived
Delegated to | Thomas Monjalon
Series | [v1] test/ring: ring perf test case enhancement
Context | Check | Description |
---|---|---
ci/checkpatch | success | coding style OK |
ci/Intel-compilation | success | Compilation OK |
ci/mellanox-Performance-Testing | success | Performance Testing PASS |
ci/intel-Performance-Testing | success | Performance Testing PASS |
+Cc Olivier, maintainer of the ring library.

20/12/2018 12:33, Gavin Hu:
> From: Joyce Kong <joyce.kong@arm.com>
>
> Run ring perf test on all available cores to really verify MPMC operations.
> The old way of running on a pair of cores is not enough for MPMC rings. We
> used this test case for ring optimization and it was really helpful for
> measuring the ring performance in multi-core environment.
>
> Suggested-by: Gavin Hu <gavin.hu@arm.com>
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Ruifeng Wang <Ruifeng.Wang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Reviewed-by: Dharmik Thakkar <Dharmik.Thakkar@arm.com>
> Reviewed-by: Ola Liljedahl <Ola.Liljedahl@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> ---
>  test/test/test_ring_perf.c | 82 ++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 80 insertions(+), 2 deletions(-)
> +Cc Olivier, maintainer of the ring library.
>
> 20/12/2018 12:33, Gavin Hu:
> > From: Joyce Kong <joyce.kong@arm.com>
> >
> > Run ring perf test on all available cores to really verify MPMC operations.
> > The old way of running on a pair of cores is not enough for MPMC
> > rings. We used this test case for ring optimization and it was really
> > helpful for measuring the ring performance in multi-core environment.

IMO, the last sentence does not convey quantifiable information. I suggest
taking that out or replacing it with something that explains how it is useful.

> > +	begin = rte_get_timer_cycles();
> > +	while (time_diff < hz * TIME_MS / 1000) {
> > +		rte_ring_mp_enqueue_bulk(params->r, burst, params->size, NULL);
> > +		rte_ring_mc_dequeue_bulk(params->r, burst, params->size, NULL);
> > +		lcount++;
> > +		time_diff = rte_get_timer_cycles() - begin;
> > +	}

IMO, the method of measurement should be changed to reduce the overhead of
reading the cycles.
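A possible way to address the overhead noted above is to run a fixed number of
iterations and read the timer only before and after the loop, so the
per-iteration cycle read disappears from the measured path. The sketch below is
illustrative only and is not part of the posted patch; TEST_ITERATIONS is an
assumed constant, and the printed cycles-per-operation figure is just one way
the result could be reported:

/*
 * Illustrative sketch only (not the posted patch): run an assumed, fixed
 * number of iterations and read the timer just twice instead of once per
 * loop iteration.
 */
#define TEST_ITERATIONS 1000000	/* hypothetical value, not from the patch */

static int
load_loop_fn(void *p)
{
	uint64_t i;
	uint64_t begin, cycles;
	const unsigned int lcore = rte_lcore_id();
	struct thread_params *params = p;
	void *burst[MAX_BURST] = {0};

	/* wait synchro for slaves */
	if (lcore != rte_get_master_lcore())
		while (rte_atomic32_read(&synchro) == 0)
			rte_pause();

	begin = rte_get_timer_cycles();
	for (i = 0; i < TEST_ITERATIONS; i++) {
		rte_ring_mp_enqueue_bulk(params->r, burst, params->size, NULL);
		rte_ring_mc_dequeue_bulk(params->r, burst, params->size, NULL);
	}
	/* single timer read after the loop, so it no longer dominates */
	cycles = rte_get_timer_cycles() - begin;
	printf("Core [%u]: %.2f cycles per bulk enq+deq pair\n",
			lcore, (double)cycles / (double)TEST_ITERATIONS);
	queue_count[lcore] = TEST_ITERATIONS;
	return 0;
}

With a fixed iteration count the loop condition no longer depends on the timer,
so the measured cost is dominated by the ring operations themselves.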
On Thu, 2018-12-20 at 19:33 +0800, Gavin Hu wrote:
> +#ifndef ARRAY_SIZE
> +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

Use RTE_DIM instead.

> +#endif
> +
> +static rte_atomic32_t synchro;
> +static uint64_t queue_count[RTE_MAX_LCORE] = {0};

Do we need explicit {0} for this static global variable?

> +static int
> +run_on_all_cores(struct rte_ring *r)
> +{
> +	uint64_t total = 0;
> +	struct thread_params param = {0};

Try to use memset here. Some versions of clang complain about {0} semantics.
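For reference, both suggestions correspond to things DPDK already provides:
RTE_DIM() in rte_common.h is the stock array-size macro, and a memset() avoids
the {0} initializer some clang versions warn about; the explicit "= {0}" on the
static queue_count array could also simply be dropped, since static objects are
zero-initialized anyway. Below is a rough sketch of run_on_all_cores() with
those changes applied; it is illustrative only, not the posted patch, and the
two #include lines may already be covered by the file's existing headers:

#include <string.h>		/* memset() */
#include <rte_common.h>		/* RTE_DIM() */

static int
run_on_all_cores(struct rte_ring *r)
{
	uint64_t total = 0;
	struct thread_params param;
	unsigned int i, c;

	/* memset instead of "= {0}", which some clang versions warn about */
	memset(&param, 0, sizeof(param));

	/* RTE_DIM() instead of a locally defined ARRAY_SIZE macro */
	for (i = 0; i < RTE_DIM(bulk_sizes); i++) {
		printf("\nBulk enq/dequeue count on size %u\n", bulk_sizes[i]);
		param.size = bulk_sizes[i];
		param.r = r;

		/* clear synchro and start slaves */
		rte_atomic32_set(&synchro, 0);
		if (rte_eal_mp_remote_launch(load_loop_fn,
				&param, SKIP_MASTER) < 0)
			return -1;

		/* start synchro and launch test on master */
		rte_atomic32_set(&synchro, 1);
		load_loop_fn(&param);

		rte_eal_mp_wait_lcore();

		RTE_LCORE_FOREACH(c) {
			printf("Core [%u] count = %"PRIu64"\n",
					c, queue_count[c]);
			total += queue_count[c];
		}

		printf("Total count (size: %u): %"PRIu64"\n",
				bulk_sizes[i], total);
	}

	return 0;
}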
diff --git a/test/test/test_ring_perf.c b/test/test/test_ring_perf.c
index ebb3939..819d119 100644
--- a/test/test/test_ring_perf.c
+++ b/test/test/test_ring_perf.c
@@ -20,12 +20,17 @@
  * * Empty ring dequeue
  * * Enqueue/dequeue of bursts in 1 threads
  * * Enqueue/dequeue of bursts in 2 threads
+ * * Enqueue/dequeue of bursts in all available threads
  */
 
 #define RING_NAME "RING_PERF"
 #define RING_SIZE 4096
 #define MAX_BURST 32
 
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
 /*
  * the sizes to enqueue and dequeue in testing
  * (marked volatile so they won't be seen as compile-time constants)
@@ -248,9 +253,78 @@ run_on_core_pair(struct lcore_pair *cores, struct rte_ring *r,
 	}
 }
 
+static rte_atomic32_t synchro;
+static uint64_t queue_count[RTE_MAX_LCORE] = {0};
+
+#define TIME_MS 100
+
+static int
+load_loop_fn(void *p)
+{
+	uint64_t time_diff = 0;
+	uint64_t begin = 0;
+	uint64_t hz = rte_get_timer_hz();
+	uint64_t lcount = 0;
+	const unsigned int lcore = rte_lcore_id();
+	struct thread_params *params = p;
+	void *burst[MAX_BURST] = {0};
+
+	/* wait synchro for slaves */
+	if (lcore != rte_get_master_lcore())
+		while (rte_atomic32_read(&synchro) == 0)
+			rte_pause();
+
+	begin = rte_get_timer_cycles();
+	while (time_diff < hz * TIME_MS / 1000) {
+		rte_ring_mp_enqueue_bulk(params->r, burst, params->size, NULL);
+		rte_ring_mc_dequeue_bulk(params->r, burst, params->size, NULL);
+		lcount++;
+		time_diff = rte_get_timer_cycles() - begin;
+	}
+	queue_count[lcore] = lcount;
+	return 0;
+}
+
+static int
+run_on_all_cores(struct rte_ring *r)
+{
+	uint64_t total = 0;
+	struct thread_params param = {0};
+	unsigned int i, c;
+	for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) {
+		printf("\nBulk enq/dequeue count on size %u\n", bulk_sizes[i]);
+		param.size = bulk_sizes[i];
+		param.r = r;
+
+		/* clear synchro and start slaves */
+		rte_atomic32_set(&synchro, 0);
+		if (rte_eal_mp_remote_launch(load_loop_fn,
+			&param, SKIP_MASTER) < 0)
+			return -1;
+
+		/* start synchro and launch test on master */
+		rte_atomic32_set(&synchro, 1);
+		load_loop_fn(&param);
+
+		rte_eal_mp_wait_lcore();
+
+		RTE_LCORE_FOREACH(c) {
+			printf("Core [%u] count = %"PRIu64"\n",
+				c, queue_count[c]);
+			total += queue_count[c];
+		}
+
+		printf("Total count (size: %u): %"PRIu64"\n", bulk_sizes[i],
+			total);
+	}
+
+	return 0;
+}
+
 /*
- * Test function that determines how long an enqueue + dequeue of a single item
- * takes on a single lcore. Result is for comparison with the bulk enq+deq.
+ * Test function that determines how long an enqueue + dequeue of a single
+ * item takes on a single lcore. Result is for comparison with the bulk
+ * enq+deq.
  */
 static void
 test_single_enqueue_dequeue(struct rte_ring *r)
@@ -394,6 +468,10 @@ test_ring_perf(void)
 		printf("\n### Testing using two NUMA nodes ###\n");
 		run_on_core_pair(&cores, r, enqueue_bulk, dequeue_bulk);
 	}
+
+	printf("\n### Testing using all slave nodes ###\n");
+	run_on_all_cores(r);
+
 	rte_ring_free(r);
 	return 0;
 }