diff mbox series

[v1,3/3] test/mcslock: add mcs queued lock unit test

Message ID	1559750328-22377-4-git-send-email-phil.yang@arm.com (mailing list archive)
State	Superseded, archived
Delegated to:	Thomas Monjalon
Headers	From: Phil Yang <phil.yang@arm.com> To: dev@dpdk.org Cc: thomas@monjalon.net, jerinj@marvell.com, hemant.agrawal@nxp.com, Honnappa.Nagarahalli@arm.com, gavin.hu@arm.com, phil.yang@arm.com, nd@arm.com Date: Wed, 5 Jun 2019 23:58:48 +0800 Message-Id: <1559750328-22377-4-git-send-email-phil.yang@arm.com> In-Reply-To: <1559750328-22377-1-git-send-email-phil.yang@arm.com> References: <1559750328-22377-1-git-send-email-phil.yang@arm.com> Subject: [dpdk-dev] [PATCH v1 3/3] test/mcslock: add mcs queued lock unit test Precedence: list Errors-To: dev-bounces@dpdk.org Sender: "dev" <dev-bounces@dpdk.org>
Series	MCS queued lock implementation \| [v1,0/3] MCS queued lock implementation [v1,1/3] eal/mcslock: add mcs queued lock implementation [v1,2/3] eal/mcslock: use generic msc queued lock on all arch [v1,3/3] test/mcslock: add mcs queued lock unit test

Checks

Context	Check	Description
ci/checkpatch	success	coding style OK
ci/Intel-compilation	success	Compilation OK

Commit Message

Phil Yang June 5, 2019, 3:58 p.m. UTC

  Unit test and perf test for MCS queued lock.

Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>

---
 MAINTAINERS                     |   1 +
 app/test/Makefile               |   1 +
 app/test/autotest_data.py       |   6 +
 app/test/autotest_test_funcs.py |  32 ++++++
 app/test/meson.build            |   2 +
 app/test/test_mcslock.c         | 248 ++++++++++++++++++++++++++++++++++++++++
 6 files changed, 290 insertions(+)
 create mode 100644 app/test/test_mcslock.c

Comments

Ananyev, Konstantin June 6, 2019, 1:42 p.m. UTC | #1

Hi,

> 
> Unit test and perf test for MCS queued lock.

Perf test is important of course, but first I think we need
more robust functional test to make sure that lock does work properly.
At least something like ticketlock test but probably with bigger number of iterations.
10K seems quite small here.
In fact with this one we'll have 3 lock methods, I think it makes sense to have 
one united UT framework for them, so only actual lock/unlock would be different.
Konstantin

> 
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> 
> ---
>  MAINTAINERS                     |   1 +
>  app/test/Makefile               |   1 +
>  app/test/autotest_data.py       |   6 +
>  app/test/autotest_test_funcs.py |  32 ++++++
>  app/test/meson.build            |   2 +
>  app/test/test_mcslock.c         | 248 ++++++++++++++++++++++++++++++++++++++++
>  6 files changed, 290 insertions(+)
>  create mode 100644 app/test/test_mcslock.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 1390238..33fdc8f 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -225,6 +225,7 @@ F: app/test/test_ticketlock.c
>  MCSlock - EXPERIMENTAL
>  M: Phil Yang <phil.yang@arm.com>
>  F: lib/librte_eal/common/include/generic/rte_mcslock.h
> +F: app/test/test_mcslock.c
> 
>  ARM v7
>  M: Jan Viktorin <viktorin@rehivetech.com>
> diff --git a/app/test/Makefile b/app/test/Makefile
> index 68d6b4f..be405cd 100644
> --- a/app/test/Makefile
> +++ b/app/test/Makefile
> @@ -64,6 +64,7 @@ SRCS-y += test_atomic.c
>  SRCS-y += test_barrier.c
>  SRCS-y += test_malloc.c
>  SRCS-y += test_cycles.c
> +SRCS-y += test_mcslock.c
>  SRCS-y += test_spinlock.c
>  SRCS-y += test_ticketlock.c
>  SRCS-y += test_memory.c
> diff --git a/app/test/autotest_data.py b/app/test/autotest_data.py
> index 0f2c9a7..68ca23d 100644
> --- a/app/test/autotest_data.py
> +++ b/app/test/autotest_data.py
> @@ -177,6 +177,12 @@
>          "Report":  None,
>      },
>      {
> +        "Name":    "MCSlock autotest",
> +        "Command": "mcslock_autotest",
> +        "Func":    mcslock_autotest,
> +        "Report":  None,
> +    },
> +    {
>          "Name":    "Byte order autotest",
>          "Command": "byteorder_autotest",
>          "Func":    default_autotest,
> diff --git a/app/test/autotest_test_funcs.py b/app/test/autotest_test_funcs.py
> index 31cc0f5..26688b7 100644
> --- a/app/test/autotest_test_funcs.py
> +++ b/app/test/autotest_test_funcs.py
> @@ -164,6 +164,38 @@ def ticketlock_autotest(child, test_name):
> 
>      return 0, "Success"
> 
> +def mcslock_autotest(child, test_name):
> +    i = 0
> +    ir = 0
> +    child.sendline(test_name)
> +    while True:
> +        index = child.expect(["Test OK",
> +                              "Test Failed",
> +                              "lcore ([0-9]*) state: ([0-1])"
> +                              "MCS lock taken on core ([0-9]*)",
> +                              "MCS lock released on core ([0-9]*)",
> +                              pexpect.TIMEOUT], timeout=5)
> +        # ok
> +        if index == 0:
> +            break
> +
> +        # message, check ordering
> +        elif index == 2:
> +            if int(child.match.groups()[0]) < i:
> +                return -1, "Fail [Bad order]"
> +            i = int(child.match.groups()[0])
> +        elif index == 3:
> +            if int(child.match.groups()[0]) < ir:
> +                return -1, "Fail [Bad order]"
> +            ir = int(child.match.groups()[0])
> +
> +        # fail
> +        elif index == 4:
> +            return -1, "Fail [Timeout]"
> +        elif index == 1:
> +            return -1, "Fail"
> +
> +    return 0, "Success"
> 
>  def logs_autotest(child, test_name):
>      child.sendline(test_name)
> diff --git a/app/test/meson.build b/app/test/meson.build
> index 83391ce..3f5f17a 100644
> --- a/app/test/meson.build
> +++ b/app/test/meson.build
> @@ -75,6 +75,7 @@ test_sources = files('commands.c',
>  	'test_memzone.c',
>  	'test_meter.c',
>  	'test_metrics.c',
> +	'test_mcslock.c',
>  	'test_mp_secondary.c',
>  	'test_pdump.c',
>  	'test_per_lcore.c',
> @@ -167,6 +168,7 @@ fast_parallel_test_names = [
>          'lpm6_autotest',
>          'malloc_autotest',
>          'mbuf_autotest',
> +        'mcslock_autotest',
>          'memcpy_autotest',
>          'memory_autotest',
>          'mempool_autotest',
> diff --git a/app/test/test_mcslock.c b/app/test/test_mcslock.c
> new file mode 100644
> index 0000000..a2274e5
> --- /dev/null
> +++ b/app/test/test_mcslock.c
> @@ -0,0 +1,248 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2019 Arm Limited
> + */
> +
> +#include <stdio.h>
> +#include <stdint.h>
> +#include <inttypes.h>
> +#include <string.h>
> +#include <unistd.h>
> +#include <sys/queue.h>
> +
> +#include <rte_common.h>
> +#include <rte_memory.h>
> +#include <rte_per_lcore.h>
> +#include <rte_launch.h>
> +#include <rte_eal.h>
> +#include <rte_lcore.h>
> +#include <rte_cycles.h>
> +#include <rte_mcslock.h>
> +#include <rte_atomic.h>
> +
> +#include "test.h"
> +
> +/*
> + * RTE MCS lock test
> + * =================
> + *
> + * These tests are derived from spin lock test cases.
> + *
> + * - The functional test takes all of these locks and launches the
> + *   ''test_mcslock_per_core()'' function on each core (except the master).
> + *
> + *   - The function takes the global lock, display something, then releases
> + *     the global lock on each core.
> + *
> + * - A load test is carried out, with all cores attempting to lock a single
> + *   lock multiple times.
> + */
> +#include <rte_per_lcore.h>
> +
> +RTE_DEFINE_PER_LCORE(rte_mcslock_t, _ml_me);
> +RTE_DEFINE_PER_LCORE(rte_mcslock_t, _ml_try_me);
> +RTE_DEFINE_PER_LCORE(rte_mcslock_t, _ml_perf_me);
> +
> +rte_mcslock_t *p_ml;
> +rte_mcslock_t *p_ml_try;
> +rte_mcslock_t *p_ml_perf;
> +
> +static unsigned int count;
> +
> +static rte_atomic32_t synchro;
> +
> +static int
> +test_mcslock_per_core(__attribute__((unused)) void *arg)
> +{
> +	/* Per core me node. */
> +	rte_mcslock_t ml_me = RTE_PER_LCORE(_ml_me);
> +
> +	rte_mcslock_lock(&p_ml, &ml_me);
> +	printf("MCS lock taken on core %u\n", rte_lcore_id());
> +	rte_mcslock_unlock(&p_ml, &ml_me);
> +	printf("MCS lock released on core %u\n", rte_lcore_id());
> +
> +	return 0;
> +}
> +
> +static uint64_t time_count[RTE_MAX_LCORE] = {0};
> +
> +#define MAX_LOOP 10000
> +
> +static int
> +load_loop_fn(void *func_param)
> +{
> +	uint64_t time_diff = 0, begin;
> +	uint64_t hz = rte_get_timer_hz();
> +	volatile uint64_t lcount = 0;
> +	const int use_lock = *(int *)func_param;
> +	const unsigned int lcore = rte_lcore_id();
> +
> +	/**< Per core me node. */
> +	rte_mcslock_t ml_perf_me = RTE_PER_LCORE(_ml_perf_me);
> +
> +	/* wait synchro */
> +	while (rte_atomic32_read(&synchro) == 0)
> +		;
> +
> +	begin = rte_get_timer_cycles();
> +	while (lcount < MAX_LOOP) {
> +		if (use_lock)
> +			rte_mcslock_lock(&p_ml_perf, &ml_perf_me);
> +
> +		lcount++;
> +		if (use_lock)
> +			rte_mcslock_unlock(&p_ml_perf, &ml_perf_me);
> +	}
> +	time_diff = rte_get_timer_cycles() - begin;
> +	time_count[lcore] = time_diff * 1000000 / hz;
> +	return 0;
> +}
> +
> +static int
> +test_mcslock_perf(void)
> +{
> +	unsigned int i;
> +	uint64_t total = 0;
> +	int lock = 0;
> +	const unsigned int lcore = rte_lcore_id();
> +
> +	printf("\nTest with no lock on single core...\n");
> +	rte_atomic32_set(&synchro, 1);
> +	load_loop_fn(&lock);
> +	printf("Core [%u] Cost Time = %"PRIu64" us\n",
> +			lcore, time_count[lcore]);
> +	memset(time_count, 0, sizeof(time_count));
> +
> +	printf("\nTest with lock on single core...\n");
> +	lock = 1;
> +	rte_atomic32_set(&synchro, 1);
> +	load_loop_fn(&lock);
> +	printf("Core [%u] Cost Time = %"PRIu64" us\n",
> +			lcore, time_count[lcore]);
> +	memset(time_count, 0, sizeof(time_count));
> +
> +	printf("\nTest with lock on %u cores...\n", (rte_lcore_count()-1));
> +
> +	rte_atomic32_set(&synchro, 0);
> +	rte_eal_mp_remote_launch(load_loop_fn, &lock, SKIP_MASTER);
> +	rte_atomic32_set(&synchro, 1);
> +
> +	rte_eal_mp_wait_lcore();
> +
> +	RTE_LCORE_FOREACH_SLAVE(i) {
> +		printf("Core [%u] Cost Time = %"PRIu64" us\n",
> +				i, time_count[i]);
> +		total += time_count[i];
> +	}
> +
> +	printf("Total Cost Time = %"PRIu64" us\n", total);
> +
> +	return 0;
> +}
> +
> +/*
> + * Use rte_mcslock_trylock() to trylock a mcs lock object,
> + * If it could not lock the object successfully, it would
> + * return immediately.
> + */
> +static int
> +test_mcslock_try(__attribute__((unused)) void *arg)
> +{
> +	/**< Per core me node. */
> +	rte_mcslock_t ml_me     = RTE_PER_LCORE(_ml_me);
> +	rte_mcslock_t ml_try_me = RTE_PER_LCORE(_ml_try_me);
> +
> +	/* Locked ml_try in the master lcore, so it should fail
> +	 * when trying to lock it in the slave lcore.
> +	 */
> +	if (rte_mcslock_trylock(&p_ml_try, &ml_try_me) == 0) {
> +		rte_mcslock_lock(&p_ml, &ml_me);
> +		count++;
> +		rte_mcslock_unlock(&p_ml, &ml_me);
> +	}
> +
> +	return 0;
> +}
> +
> +
> +/*
> + * Test rte_eal_get_lcore_state() in addition to mcs locks
> + * as we have "waiting" then "running" lcores.
> + */
> +static int
> +test_mcslock(void)
> +{
> +	int ret = 0;
> +	int i;
> +
> +	/* Define per core me node. */
> +	rte_mcslock_t ml_me     = RTE_PER_LCORE(_ml_me);
> +	rte_mcslock_t ml_try_me = RTE_PER_LCORE(_ml_try_me);
> +
> +	/*
> +	 * Test mcs lock & unlock on each core
> +	 */
> +
> +	/* slave cores should be waiting: print it */
> +	RTE_LCORE_FOREACH_SLAVE(i) {
> +		printf("lcore %d state: %d\n", i,
> +				(int) rte_eal_get_lcore_state(i));
> +	}
> +
> +	rte_mcslock_lock(&p_ml, &ml_me);
> +
> +	RTE_LCORE_FOREACH_SLAVE(i) {
> +		rte_eal_remote_launch(test_mcslock_per_core, NULL, i);
> +	}
> +
> +	/* slave cores should be busy: print it */
> +	RTE_LCORE_FOREACH_SLAVE(i) {
> +		printf("lcore %d state: %d\n", i,
> +				(int) rte_eal_get_lcore_state(i));
> +	}
> +
> +	rte_mcslock_unlock(&p_ml, &ml_me);
> +
> +	rte_eal_mp_wait_lcore();
> +
> +	/*
> +	 * Test if it could return immediately from try-locking a locked object.
> +	 * Here it will lock the mcs lock object first, then launch all the
> +	 * slave lcores to trylock the same mcs lock object.
> +	 * All the slave lcores should give up try-locking a locked object and
> +	 * return immediately, and then increase the "count" initialized with
> +	 * zero by one per times.
> +	 * We can check if the "count" is finally equal to the number of all
> +	 * slave lcores to see if the behavior of try-locking a locked
> +	 * mcslock object is correct.
> +	 */
> +	if (rte_mcslock_trylock(&p_ml_try, &ml_try_me) == 0)
> +		return -1;
> +
> +	count = 0;
> +	RTE_LCORE_FOREACH_SLAVE(i) {
> +		rte_eal_remote_launch(test_mcslock_try, NULL, i);
> +	}
> +	rte_mcslock_unlock(&p_ml_try, &ml_try_me);
> +	rte_eal_mp_wait_lcore();
> +
> +	/* Test is_locked API */
> +	if (rte_mcslock_is_locked(p_ml)) {
> +		printf("mcslock is locked but it should not be\n");
> +		return -1;
> +	}
> +
> +	/* Counting the locked times in each core */
> +	rte_mcslock_lock(&p_ml, &ml_me);
> +	if (count != (rte_lcore_count() - 1))
> +		ret = -1;
> +	rte_mcslock_unlock(&p_ml, &ml_me);
> +
> +	/* mcs lock perf test */
> +	if (test_mcslock_perf() < 0)
> +		return -1;
> +
> +	return ret;
> +}
> +
> +REGISTER_TEST_COMMAND(mcslock_autotest, test_mcslock);
> --
> 2.7.4

Honnappa Nagarahalli June 7, 2019, 5:27 a.m. UTC | #2

> Hi,
> 
> >
> > Unit test and perf test for MCS queued lock.
> 
> Perf test is important of course, but first I think we need more robust
> functional test to make sure that lock does work properly.
> At least something like ticketlock test but probably with bigger number of
> iterations.
> 10K seems quite small here.
> In fact with this one we'll have 3 lock methods, I think it makes sense to have
> one united UT framework for them, so only actual lock/unlock would be
> different.
+1

> Konstantin
> 
> >

<snip>

Phil Yang June 10, 2019, 4:36 p.m. UTC | #3

Hi,

> -----Original Message-----
> From: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Sent: Friday, June 7, 2019 1:27 PM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Phil Yang (Arm
> Technology China) <Phil.Yang@arm.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; jerinj@marvell.com; hemant.agrawal@nxp.com;
> Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; nd
> <nd@arm.com>; nd <nd@arm.com>
> Subject: RE: [dpdk-dev] [PATCH v1 3/3] test/mcslock: add mcs queued lock
> unit test
> 
> > Hi,
> >
> > >
> > > Unit test and perf test for MCS queued lock.
> >
> > Perf test is important of course, but first I think we need more
> > robust functional test to make sure that lock does work properly.
> > At least something like ticketlock test but probably with bigger
> > number of iterations.
> > 10K seems quite small here.
Yes. I agree. I think we should also consider the total time consuming of the test. As more cores are involved in the lock contention, more time is required to complete the test. 

> > In fact with this one we'll have 3 lock methods, I think it makes
> > sense to have one united UT framework for them, so only actual
> > lock/unlock would be different.
+1.  
Since the APIs of MCS lock is different with spinlock and ticket lock, so I am wondering that what will this united UT framework look like?
Will it be the same test case that integrates 3 sets of lock tests running with different cmd line?

> +1
> 
> > Konstantin
> >
> > >
> 
> <snip>

Thanks,
Phil Yang

diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 1390238..33fdc8f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -225,6 +225,7 @@  F: app/test/test_ticketlock.c
 MCSlock - EXPERIMENTAL
 M: Phil Yang <phil.yang@arm.com>
 F: lib/librte_eal/common/include/generic/rte_mcslock.h
+F: app/test/test_mcslock.c
 
 ARM v7
 M: Jan Viktorin <viktorin@rehivetech.com>
diff --git a/app/test/Makefile b/app/test/Makefile
index 68d6b4f..be405cd 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -64,6 +64,7 @@  SRCS-y += test_atomic.c
 SRCS-y += test_barrier.c
 SRCS-y += test_malloc.c
 SRCS-y += test_cycles.c
+SRCS-y += test_mcslock.c
 SRCS-y += test_spinlock.c
 SRCS-y += test_ticketlock.c
 SRCS-y += test_memory.c
diff --git a/app/test/autotest_data.py b/app/test/autotest_data.py
index 0f2c9a7..68ca23d 100644
--- a/app/test/autotest_data.py
+++ b/app/test/autotest_data.py
@@ -177,6 +177,12 @@ 
         "Report":  None,
     },
     {
+        "Name":    "MCSlock autotest",
+        "Command": "mcslock_autotest",
+        "Func":    mcslock_autotest,
+        "Report":  None,
+    },
+    {
         "Name":    "Byte order autotest",
         "Command": "byteorder_autotest",
         "Func":    default_autotest,
diff --git a/app/test/autotest_test_funcs.py b/app/test/autotest_test_funcs.py
index 31cc0f5..26688b7 100644
--- a/app/test/autotest_test_funcs.py
+++ b/app/test/autotest_test_funcs.py
@@ -164,6 +164,38 @@  def ticketlock_autotest(child, test_name):
 
     return 0, "Success"
 
+def mcslock_autotest(child, test_name):
+    i = 0
+    ir = 0
+    child.sendline(test_name)
+    while True:
+        index = child.expect(["Test OK",
+                              "Test Failed",
+                              "lcore ([0-9]*) state: ([0-1])"
+                              "MCS lock taken on core ([0-9]*)",
+                              "MCS lock released on core ([0-9]*)",
+                              pexpect.TIMEOUT], timeout=5)
+        # ok
+        if index == 0:
+            break
+
+        # message, check ordering
+        elif index == 2:
+            if int(child.match.groups()[0]) < i:
+                return -1, "Fail [Bad order]"
+            i = int(child.match.groups()[0])
+        elif index == 3:
+            if int(child.match.groups()[0]) < ir:
+                return -1, "Fail [Bad order]"
+            ir = int(child.match.groups()[0])
+
+        # fail
+        elif index == 4:
+            return -1, "Fail [Timeout]"
+        elif index == 1:
+            return -1, "Fail"
+
+    return 0, "Success"
 
 def logs_autotest(child, test_name):
     child.sendline(test_name)
diff --git a/app/test/meson.build b/app/test/meson.build
index 83391ce..3f5f17a 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -75,6 +75,7 @@  test_sources = files('commands.c',
 	'test_memzone.c',
 	'test_meter.c',
 	'test_metrics.c',
+	'test_mcslock.c',
 	'test_mp_secondary.c',
 	'test_pdump.c',
 	'test_per_lcore.c',
@@ -167,6 +168,7 @@  fast_parallel_test_names = [
         'lpm6_autotest',
         'malloc_autotest',
         'mbuf_autotest',
+        'mcslock_autotest',
         'memcpy_autotest',
         'memory_autotest',
         'mempool_autotest',
diff --git a/app/test/test_mcslock.c b/app/test/test_mcslock.c
new file mode 100644
index 0000000..a2274e5
--- /dev/null
+++ b/app/test/test_mcslock.c
@@ -0,0 +1,248 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Arm Limited
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/queue.h>
+
+#include <rte_common.h>
+#include <rte_memory.h>
+#include <rte_per_lcore.h>
+#include <rte_launch.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_cycles.h>
+#include <rte_mcslock.h>
+#include <rte_atomic.h>
+
+#include "test.h"
+
+/*
+ * RTE MCS lock test
+ * =================
+ *
+ * These tests are derived from spin lock test cases.
+ *
+ * - The functional test takes all of these locks and launches the
+ *   ''test_mcslock_per_core()'' function on each core (except the master).
+ *
+ *   - The function takes the global lock, display something, then releases
+ *     the global lock on each core.
+ *
+ * - A load test is carried out, with all cores attempting to lock a single
+ *   lock multiple times.
+ */
+#include <rte_per_lcore.h>
+
+RTE_DEFINE_PER_LCORE(rte_mcslock_t, _ml_me);
+RTE_DEFINE_PER_LCORE(rte_mcslock_t, _ml_try_me);
+RTE_DEFINE_PER_LCORE(rte_mcslock_t, _ml_perf_me);
+
+rte_mcslock_t *p_ml;
+rte_mcslock_t *p_ml_try;
+rte_mcslock_t *p_ml_perf;
+
+static unsigned int count;
+
+static rte_atomic32_t synchro;
+
+static int
+test_mcslock_per_core(__attribute__((unused)) void *arg)
+{
+	/* Per core me node. */
+	rte_mcslock_t ml_me = RTE_PER_LCORE(_ml_me);
+
+	rte_mcslock_lock(&p_ml, &ml_me);
+	printf("MCS lock taken on core %u\n", rte_lcore_id());
+	rte_mcslock_unlock(&p_ml, &ml_me);
+	printf("MCS lock released on core %u\n", rte_lcore_id());
+
+	return 0;
+}
+
+static uint64_t time_count[RTE_MAX_LCORE] = {0};
+
+#define MAX_LOOP 10000
+
+static int
+load_loop_fn(void *func_param)
+{
+	uint64_t time_diff = 0, begin;
+	uint64_t hz = rte_get_timer_hz();
+	volatile uint64_t lcount = 0;
+	const int use_lock = *(int *)func_param;
+	const unsigned int lcore = rte_lcore_id();
+
+	/**< Per core me node. */
+	rte_mcslock_t ml_perf_me = RTE_PER_LCORE(_ml_perf_me);
+
+	/* wait synchro */
+	while (rte_atomic32_read(&synchro) == 0)
+		;
+
+	begin = rte_get_timer_cycles();
+	while (lcount < MAX_LOOP) {
+		if (use_lock)
+			rte_mcslock_lock(&p_ml_perf, &ml_perf_me);
+
+		lcount++;
+		if (use_lock)
+			rte_mcslock_unlock(&p_ml_perf, &ml_perf_me);
+	}
+	time_diff = rte_get_timer_cycles() - begin;
+	time_count[lcore] = time_diff * 1000000 / hz;
+	return 0;
+}
+
+static int
+test_mcslock_perf(void)
+{
+	unsigned int i;
+	uint64_t total = 0;
+	int lock = 0;
+	const unsigned int lcore = rte_lcore_id();
+
+	printf("\nTest with no lock on single core...\n");
+	rte_atomic32_set(&synchro, 1);
+	load_loop_fn(&lock);
+	printf("Core [%u] Cost Time = %"PRIu64" us\n",
+			lcore, time_count[lcore]);
+	memset(time_count, 0, sizeof(time_count));
+
+	printf("\nTest with lock on single core...\n");
+	lock = 1;
+	rte_atomic32_set(&synchro, 1);
+	load_loop_fn(&lock);
+	printf("Core [%u] Cost Time = %"PRIu64" us\n",
+			lcore, time_count[lcore]);
+	memset(time_count, 0, sizeof(time_count));
+
+	printf("\nTest with lock on %u cores...\n", (rte_lcore_count()-1));
+
+	rte_atomic32_set(&synchro, 0);
+	rte_eal_mp_remote_launch(load_loop_fn, &lock, SKIP_MASTER);
+	rte_atomic32_set(&synchro, 1);
+
+	rte_eal_mp_wait_lcore();
+
+	RTE_LCORE_FOREACH_SLAVE(i) {
+		printf("Core [%u] Cost Time = %"PRIu64" us\n",
+				i, time_count[i]);
+		total += time_count[i];
+	}
+
+	printf("Total Cost Time = %"PRIu64" us\n", total);
+
+	return 0;
+}
+
+/*
+ * Use rte_mcslock_trylock() to trylock a mcs lock object,
+ * If it could not lock the object successfully, it would
+ * return immediately.
+ */
+static int
+test_mcslock_try(__attribute__((unused)) void *arg)
+{
+	/**< Per core me node. */
+	rte_mcslock_t ml_me     = RTE_PER_LCORE(_ml_me);
+	rte_mcslock_t ml_try_me = RTE_PER_LCORE(_ml_try_me);
+
+	/* Locked ml_try in the master lcore, so it should fail
+	 * when trying to lock it in the slave lcore.
+	 */
+	if (rte_mcslock_trylock(&p_ml_try, &ml_try_me) == 0) {
+		rte_mcslock_lock(&p_ml, &ml_me);
+		count++;
+		rte_mcslock_unlock(&p_ml, &ml_me);
+	}
+
+	return 0;
+}
+
+
+/*
+ * Test rte_eal_get_lcore_state() in addition to mcs locks
+ * as we have "waiting" then "running" lcores.
+ */
+static int
+test_mcslock(void)
+{
+	int ret = 0;
+	int i;
+
+	/* Define per core me node. */
+	rte_mcslock_t ml_me     = RTE_PER_LCORE(_ml_me);
+	rte_mcslock_t ml_try_me = RTE_PER_LCORE(_ml_try_me);
+
+	/*
+	 * Test mcs lock & unlock on each core
+	 */
+
+	/* slave cores should be waiting: print it */
+	RTE_LCORE_FOREACH_SLAVE(i) {
+		printf("lcore %d state: %d\n", i,
+				(int) rte_eal_get_lcore_state(i));
+	}
+
+	rte_mcslock_lock(&p_ml, &ml_me);
+
+	RTE_LCORE_FOREACH_SLAVE(i) {
+		rte_eal_remote_launch(test_mcslock_per_core, NULL, i);
+	}
+
+	/* slave cores should be busy: print it */
+	RTE_LCORE_FOREACH_SLAVE(i) {
+		printf("lcore %d state: %d\n", i,
+				(int) rte_eal_get_lcore_state(i));
+	}
+
+	rte_mcslock_unlock(&p_ml, &ml_me);
+
+	rte_eal_mp_wait_lcore();
+
+	/*
+	 * Test if it could return immediately from try-locking a locked object.
+	 * Here it will lock the mcs lock object first, then launch all the
+	 * slave lcores to trylock the same mcs lock object.
+	 * All the slave lcores should give up try-locking a locked object and
+	 * return immediately, and then increase the "count" initialized with
+	 * zero by one per times.
+	 * We can check if the "count" is finally equal to the number of all
+	 * slave lcores to see if the behavior of try-locking a locked
+	 * mcslock object is correct.
+	 */
+	if (rte_mcslock_trylock(&p_ml_try, &ml_try_me) == 0)
+		return -1;
+
+	count = 0;
+	RTE_LCORE_FOREACH_SLAVE(i) {
+		rte_eal_remote_launch(test_mcslock_try, NULL, i);
+	}
+	rte_mcslock_unlock(&p_ml_try, &ml_try_me);
+	rte_eal_mp_wait_lcore();
+
+	/* Test is_locked API */
+	if (rte_mcslock_is_locked(p_ml)) {
+		printf("mcslock is locked but it should not be\n");
+		return -1;
+	}
+
+	/* Counting the locked times in each core */
+	rte_mcslock_lock(&p_ml, &ml_me);
+	if (count != (rte_lcore_count() - 1))
+		ret = -1;
+	rte_mcslock_unlock(&p_ml, &ml_me);
+
+	/* mcs lock perf test */
+	if (test_mcslock_perf() < 0)
+		return -1;
+
+	return ret;
+}
+
+REGISTER_TEST_COMMAND(mcslock_autotest, test_mcslock);