[1/2] test/mcslock: move performance test to perf tests

Message ID 1584936978-11899-1-git-send-email-phil.yang@arm.com (mailing list archive)
State Superseded, archived
Delegated to: David Marchand
Series [1/2] test/mcslock: move performance test to perf tests

Checks

Context                      Check    Description
ci/checkpatch                success  coding style OK
ci/iol-intel-Performance     success  Performance Testing PASS
ci/iol-mellanox-Performance  success  Performance Testing PASS
ci/iol-testing               success  Testing PASS
ci/Intel-compilation         success  Compilation OK

Commit Message

Phil Yang March 23, 2020, 4:16 a.m. UTC
  The MCS lock performance test takes more than 10 seconds and leads
to a meson test timeout on some platforms. Move the performance test
into the perf tests.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
 MAINTAINERS                  |   1 +
 app/test/Makefile            |   1 +
 app/test/autotest_data.py    |   6 +++
 app/test/meson.build         |   2 +
 app/test/test_mcslock.c      |  88 -------------------------------
 app/test/test_mcslock_perf.c | 121 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 131 insertions(+), 88 deletions(-)
 create mode 100644 app/test/test_mcslock_perf.c
  

Comments

Phil Yang July 23, 2020, 2:36 a.m. UTC | #1
Hi Aaron,

It seems Travis CI cannot catch this timeout issue.
But local tests record these two cases as TIMEOUT under the default timeout configuration.

1. The test results on an x86 server (72 lcores online @ 2.60GHz) are:
a. Default setting:
$ sudo meson test -C build_test --suite DPDK:fast-tests
   3/92 DPDK:fast-tests / atomic_autotest                TIMEOUT        10.32s
39/92 DPDK:fast-tests / mcslock_autotest               TIMEOUT        10.32s

b. Extended timeout:
$ sudo meson test -C build_test --suite DPDK:fast-tests -t 30
   3/92 DPDK:fast-tests / atomic_autotest                OK             173.01s
39/92 DPDK:fast-tests / mcslock_autotest               OK             12.19s

2. The test results on an aarch64 server (56 lcores online @ 2.0GHz) are:
a. Default setting:
$ sudo meson test -C build_test --suite DPDK:fast-tests
   3/92 DPDK:fast-tests / atomic_autotest       TIMEOUT 10.47 s
39/92 DPDK:fast-tests / mcslock_autotest      TIMEOUT 10.47 s

b. Extended timeout:
$ sudo meson test -C build_test --suite DPDK:fast-tests -t 60
   3/92 DPDK:fast-tests / atomic_autotest       OK      431.89 s
39/92 DPDK:fast-tests / mcslock_autotest      OK      21.25 s

So I think these two patches can resolve this issue. I'd love to hear your thoughts.
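
After the move, the perf case should still be runnable on demand via the
perf suite, e.g. with the same build directory as above:

$ sudo meson test -C build_test --suite DPDK:perf-tests mcslock_perf_autotest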

Thanks,
Phil

<snip>

  
Aaron Conole July 23, 2020, 3:14 p.m. UTC | #2
Phil Yang <Phil.Yang@arm.com> writes:

> Hi Aaron,
>
> It seems Travis CI cannot catch this timeout issue.

That's probably because we use -t 3 to multiply the timeout.  We needed
to do that because a few of the tests were on the cusp of failure in the
Travis environment, since it's a bit lower-powered.  On brawnier systems
they probably aren't an issue.
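
I.e., modulo the build directory name, the CI run amounts to roughly:

$ meson test -C build --suite DPDK:fast-tests -t 3

so a case has to run past ~30 seconds before Travis would flag it.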

> But local tests record these two cases as TIMEOUT under the default timeout configuration.
>
> 1. The test results on an x86 server (72 lcores online @ 2.60GHz) are:
> a. Default setting:
> $ sudo meson test -C build_test --suite DPDK:fast-tests
>    3/92 DPDK:fast-tests / atomic_autotest                TIMEOUT        10.32s
> 39/92 DPDK:fast-tests / mcslock_autotest               TIMEOUT        10.32s
>
> b. Extended timeout:
> $ sudo meson test -C build_test --suite DPDK:fast-tests -t 30
>    3/92 DPDK:fast-tests / atomic_autotest                OK             173.01s
> 39/92 DPDK:fast-tests / mcslock_autotest               OK             12.19s
>
> 2. The test results on an aarch64 server (56 lcores online @ 2.0GHz) are:
> a. Default setting:
> $ sudo meson test -C build_test --suite DPDK:fast-tests
>    3/92 DPDK:fast-tests / atomic_autotest       TIMEOUT 10.47 s
> 39/92 DPDK:fast-tests / mcslock_autotest      TIMEOUT 10.47 s
>
> b. Extended timeout:
> $ sudo meson test -C build_test --suite DPDK:fast-tests -t 60
>    3/92 DPDK:fast-tests / atomic_autotest       OK      431.89 s
> 39/92 DPDK:fast-tests / mcslock_autotest      OK      21.25 s

In all the cases, it seems the mcslock_autotest is not too far off, but
the atomic_autotest runs really long no matter what.

> So I think these two patches can resolve this issue. I'd love to hear your thoughts.

In general, I'm always happy to see perf tests put under the perf area
where they belong.  I don't have a strong opinion on this specific patch,
though.
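
FWIW, where a given case landed can be double-checked after the patch
with something like:

$ meson test -C build_test --list | grep mcslock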

  
Stephen Hemminger June 12, 2023, 9:23 p.m. UTC | #3
On Mon, 23 Mar 2020 12:16:17 +0800
Phil Yang <phil.yang@arm.com> wrote:

> The MCS lock performance test takes more than 10 seconds and leads
> to a meson test timeout on some platforms. Move the performance test
> into the perf tests.
> 
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> ---

No, fix the test.
There is no reason that mcslock should be in the perf section
when all the other lock types are not.
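
For comparison, the spinlock and ticketlock tests, which this perf loop
was derived from, still run in the fast suite:

$ meson test -C build --suite DPDK:fast-tests spinlock_autotest ticketlock_autotest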
  

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index db235c2..411bdeb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -247,6 +247,7 @@  MCSlock - EXPERIMENTAL
 M: Phil Yang <phil.yang@arm.com>
 F: lib/librte_eal/common/include/generic/rte_mcslock.h
 F: app/test/test_mcslock.c
+F: app/test/test_mcslock_perf.c
 
 Ticketlock
 M: Joyce Kong <joyce.kong@arm.com>
diff --git a/app/test/Makefile b/app/test/Makefile
index 1f080d1..97de3ac 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -65,6 +65,7 @@  SRCS-y += test_barrier.c
 SRCS-y += test_malloc.c
 SRCS-y += test_cycles.c
 SRCS-y += test_mcslock.c
+SRCS-y += test_mcslock_perf.c
 SRCS-y += test_spinlock.c
 SRCS-y += test_ticketlock.c
 SRCS-y += test_memory.c
diff --git a/app/test/autotest_data.py b/app/test/autotest_data.py
index 7b1d013..2a4619d 100644
--- a/app/test/autotest_data.py
+++ b/app/test/autotest_data.py
@@ -784,6 +784,12 @@ 
         "Func":    default_autotest,
         "Report":  None,
     },
+    {
+        "Name":    "MCS Lock performance autotest",
+        "Command": "mcslock_perf_autotest",
+        "Func":    default_autotest,
+        "Report":  None,
+    },
     #
     # Please always make sure that ring_perf is the last test!
     #
diff --git a/app/test/meson.build b/app/test/meson.build
index 0a2ce71..335a869 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -82,6 +82,7 @@  test_sources = files('commands.c',
 	'test_meter.c',
 	'test_metrics.c',
 	'test_mcslock.c',
+	'test_mcslock_perf.c',
 	'test_mp_secondary.c',
 	'test_per_lcore.c',
 	'test_pmd_perf.c',
@@ -270,6 +271,7 @@  perf_test_names = [
         'rand_perf_autotest',
         'hash_readwrite_perf_autotest',
         'hash_readwrite_lf_perf_autotest',
+	'mcslock_perf_autotest',
 ]
 
 driver_test_names = [
diff --git a/app/test/test_mcslock.c b/app/test/test_mcslock.c
index e9359df..15f9751 100644
--- a/app/test/test_mcslock.c
+++ b/app/test/test_mcslock.c
@@ -32,23 +32,16 @@ 
  *
  *   - The function takes the global lock, display something, then releases
  *     the global lock on each core.
- *
- * - A load test is carried out, with all cores attempting to lock a single
- *   lock multiple times.
  */
 
 RTE_DEFINE_PER_LCORE(rte_mcslock_t, _ml_me);
 RTE_DEFINE_PER_LCORE(rte_mcslock_t, _ml_try_me);
-RTE_DEFINE_PER_LCORE(rte_mcslock_t, _ml_perf_me);
 
 rte_mcslock_t *p_ml;
 rte_mcslock_t *p_ml_try;
-rte_mcslock_t *p_ml_perf;
 
 static unsigned int count;
 
-static rte_atomic32_t synchro;
-
 static int
 test_mcslock_per_core(__attribute__((unused)) void *arg)
 {
@@ -63,85 +56,8 @@  test_mcslock_per_core(__attribute__((unused)) void *arg)
 	return 0;
 }
 
-static uint64_t time_count[RTE_MAX_LCORE] = {0};
-
 #define MAX_LOOP 1000000
 
-static int
-load_loop_fn(void *func_param)
-{
-	uint64_t time_diff = 0, begin;
-	uint64_t hz = rte_get_timer_hz();
-	volatile uint64_t lcount = 0;
-	const int use_lock = *(int *)func_param;
-	const unsigned int lcore = rte_lcore_id();
-
-	/**< Per core me node. */
-	rte_mcslock_t ml_perf_me = RTE_PER_LCORE(_ml_perf_me);
-
-	/* wait synchro */
-	while (rte_atomic32_read(&synchro) == 0)
-		;
-
-	begin = rte_get_timer_cycles();
-	while (lcount < MAX_LOOP) {
-		if (use_lock)
-			rte_mcslock_lock(&p_ml_perf, &ml_perf_me);
-
-		lcount++;
-		if (use_lock)
-			rte_mcslock_unlock(&p_ml_perf, &ml_perf_me);
-	}
-	time_diff = rte_get_timer_cycles() - begin;
-	time_count[lcore] = time_diff * 1000000 / hz;
-	return 0;
-}
-
-static int
-test_mcslock_perf(void)
-{
-	unsigned int i;
-	uint64_t total = 0;
-	int lock = 0;
-	const unsigned int lcore = rte_lcore_id();
-
-	printf("\nTest with no lock on single core...\n");
-	rte_atomic32_set(&synchro, 1);
-	load_loop_fn(&lock);
-	printf("Core [%u] Cost Time = %"PRIu64" us\n",
-			lcore, time_count[lcore]);
-	memset(time_count, 0, sizeof(time_count));
-
-	printf("\nTest with lock on single core...\n");
-	lock = 1;
-	rte_atomic32_set(&synchro, 1);
-	load_loop_fn(&lock);
-	printf("Core [%u] Cost Time = %"PRIu64" us\n",
-			lcore, time_count[lcore]);
-	memset(time_count, 0, sizeof(time_count));
-
-	printf("\nTest with lock on %u cores...\n", (rte_lcore_count()));
-
-	rte_atomic32_set(&synchro, 0);
-	rte_eal_mp_remote_launch(load_loop_fn, &lock, SKIP_MASTER);
-
-	/* start synchro and launch test on master */
-	rte_atomic32_set(&synchro, 1);
-	load_loop_fn(&lock);
-
-	rte_eal_mp_wait_lcore();
-
-	RTE_LCORE_FOREACH(i) {
-		printf("Core [%u] Cost Time = %"PRIu64" us\n",
-				i, time_count[i]);
-		total += time_count[i];
-	}
-
-	printf("Total Cost Time = %"PRIu64" us\n", total);
-
-	return 0;
-}
-
 /*
  * Use rte_mcslock_trylock() to trylock a mcs lock object,
  * If it could not lock the object successfully, it would
@@ -240,10 +156,6 @@  test_mcslock(void)
 		ret = -1;
 	rte_mcslock_unlock(&p_ml, &ml_me);
 
-	/* mcs lock perf test */
-	if (test_mcslock_perf() < 0)
-		return -1;
-
 	return ret;
 }
 
diff --git a/app/test/test_mcslock_perf.c b/app/test/test_mcslock_perf.c
new file mode 100644
index 0000000..6948344
--- /dev/null
+++ b/app/test/test_mcslock_perf.c
@@ -0,0 +1,121 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Arm Limited
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/queue.h>
+
+#include <rte_common.h>
+#include <rte_memory.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_cycles.h>
+#include <rte_mcslock.h>
+#include <rte_atomic.h>
+
+#include "test.h"
+
+/*
+ * RTE MCS lock perf test
+ * ======================
+ *
+ * These tests are derived from spin lock perf test cases.
+ *
+ * - A load test is carried out, with all cores attempting to lock a single
+ *   lock multiple times.
+ */
+
+RTE_DEFINE_PER_LCORE(rte_mcslock_t, _ml_perf_me);
+rte_mcslock_t *p_ml_perf;
+
+static rte_atomic32_t synchro;
+static uint64_t time_count[RTE_MAX_LCORE] = {0};
+
+#define MAX_LOOP 1000000
+
+static int
+load_loop_fn(void *func_param)
+{
+	uint64_t time_diff = 0, begin;
+	uint64_t hz = rte_get_timer_hz();
+	volatile uint64_t lcount = 0;
+	const int use_lock = *(int *)func_param;
+	const unsigned int lcore = rte_lcore_id();
+
+	/**< Per core me node. */
+	rte_mcslock_t ml_perf_me = RTE_PER_LCORE(_ml_perf_me);
+
+	/* wait synchro */
+	while (rte_atomic32_read(&synchro) == 0)
+		;
+
+	begin = rte_get_timer_cycles();
+	while (lcount < MAX_LOOP) {
+		if (use_lock)
+			rte_mcslock_lock(&p_ml_perf, &ml_perf_me);
+
+		lcount++;
+		if (use_lock)
+			rte_mcslock_unlock(&p_ml_perf, &ml_perf_me);
+	}
+	time_diff = rte_get_timer_cycles() - begin;
+	time_count[lcore] = time_diff * 1000000 / hz;
+	return 0;
+}
+
+/*
+ * Test rte_eal_get_lcore_state() in addition to mcs locks
+ * as we have "waiting" then "running" lcores.
+ */
+static int
+test_mcslock_perf(void)
+{
+	unsigned int i;
+	uint64_t total = 0;
+	int lock = 0;
+	const unsigned int lcore = rte_lcore_id();
+
+	printf("\nTest with no lock on single core...\n");
+	rte_atomic32_set(&synchro, 1);
+	load_loop_fn(&lock);
+	printf("Core [%u] Cost Time = %"PRIu64" us\n",
+			lcore, time_count[lcore]);
+	memset(time_count, 0, sizeof(time_count));
+
+	printf("\nTest with lock on single core...\n");
+	lock = 1;
+	rte_atomic32_set(&synchro, 1);
+	load_loop_fn(&lock);
+	printf("Core [%u] Cost Time = %"PRIu64" us\n",
+			lcore, time_count[lcore]);
+	memset(time_count, 0, sizeof(time_count));
+
+	printf("\nTest with lock on %u cores...\n", (rte_lcore_count()));
+
+	rte_atomic32_set(&synchro, 0);
+	rte_eal_mp_remote_launch(load_loop_fn, &lock, SKIP_MASTER);
+
+	/* start synchro and launch test on master */
+	rte_atomic32_set(&synchro, 1);
+	load_loop_fn(&lock);
+
+	rte_eal_mp_wait_lcore();
+
+	RTE_LCORE_FOREACH(i) {
+		printf("Core [%u] Cost Time = %"PRIu64" us\n",
+				i, time_count[i]);
+		total += time_count[i];
+	}
+
+	printf("Total Cost Time = %"PRIu64" us\n", total);
+
+	return 0;
+}
+
+REGISTER_TEST_COMMAND(mcslock_perf_autotest, test_mcslock_perf);
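
With the patch applied, the new case is also reachable as an interactive
command in the unit test binary; the path depends on the build directory,
e.g.:

$ ./build_test/app/test/dpdk-test
RTE>>mcslock_perf_autotest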