[1/5] app/test-mp: add multiprocess test

Message ID 20231212042517.164353-2-artemyko@nvidia.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series addressing races in concurrent process startup |

Checks

Context Check Description
ci/checkpatch warning coding style issues

Commit Message

Artemy Kovalyov Dec. 12, 2023, 4:25 a.m. UTC
  This commit adds a test scenario that initiates multiple processes
concurrently. These processes attach to the same shared heap, with an
automatic detection mechanism to identify the primary process.

Signed-off-by: Artemy Kovalyov <artemyko@nvidia.com>
---
 app/meson.build         |  1 +
 app/test-mp/main.c      | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 app/test-mp/meson.build |  8 ++++++++
 app/test-mp/run.sh      | 39 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 97 insertions(+)
 create mode 100644 app/test-mp/main.c
 create mode 100644 app/test-mp/meson.build
 create mode 100755 app/test-mp/run.sh
  

Comments

Stephen Hemminger Dec. 12, 2023, 5:09 p.m. UTC | #1
On Tue, 12 Dec 2023 06:25:12 +0200
Artemy Kovalyov <artemyko@nvidia.com> wrote:

> +rte_atomic32_t g_count;
> +
> +static int
> +done(const struct rte_mp_msg *msg __rte_unused, const void *arg __rte_unused)
> +{
> +	rte_atomic32_dec(&g_count);
> +	return 0;
> +}

g_count is only used within this file, so it should be static.

Also, assert may not be the ideal way to report test failures.

The preferred way would be to use RTE_TEST_ASSERT() and RTE_TEST_ASSERT_EQUAL()
  
Artemy Kovalyov March 7, 2024, 6:59 a.m. UTC | #2
In the process of initiating multiple processes concurrently, specifically with
automatic detection of the primary process, certain race conditions have been
identified. This patch series introduces a straightforward test that showcases
the issue and subsequently addresses the problems surfaced by the test. These
fixes aim to ensure the robust and secure utilization of DPDK within intricate
solutions that involve starting processes with job orchestrators such as Slurm
or Hadoop YARN.

Artemy Kovalyov (5):
  app/test-mp: add multiprocess test
  eal: fix multiprocess hotplug race
  ipc: fix mp channel closure to prevent message loss
  eal: fix first time primary autodetect
  eal: fix memzone fbarray cleanup

 app/meson.build                     |  1 +
 app/test-mp/main.c                  | 52 +++++++++++++++++++++++++++++++++++++
 app/test-mp/meson.build             |  8 ++++++
 app/test-mp/run.sh                  | 40 ++++++++++++++++++++++++++++
 lib/eal/common/eal_common_memzone.c | 12 +++++++++
 lib/eal/common/eal_common_proc.c    |  4 +--
 lib/eal/common/eal_private.h        |  5 ++++
 lib/eal/common/hotplug_mp.c         |  3 +++
 lib/eal/linux/eal.c                 |  3 ++-
 9 files changed, 125 insertions(+), 3 deletions(-)
 create mode 100644 app/test-mp/main.c
 create mode 100644 app/test-mp/meson.build
 create mode 100755 app/test-mp/run.sh
  
Artemy Kovalyov March 7, 2024, 7:01 a.m. UTC | #3
In the process of initiating multiple processes concurrently, specifically with
automatic detection of the primary process, certain race conditions have been
identified. This patch series introduces a straightforward test that showcases
the issue and subsequently addresses the problems surfaced by the test. These
fixes aim to ensure the robust and secure utilization of DPDK within intricate
solutions that involve starting processes with job orchestrators such as Slurm
or Hadoop YARN.

Artemy Kovalyov (5):
  app/test-mp: add multiprocess test
  eal: fix multiprocess hotplug race
  ipc: fix mp channel closure to prevent message loss
  eal: fix first time primary autodetect
  eal: fix memzone fbarray cleanup

 app/meson.build                     |  1 +
 app/test-mp/main.c                  | 52 +++++++++++++++++++++++++++++++++++++
 app/test-mp/meson.build             |  8 ++++++
 app/test-mp/run.sh                  | 40 ++++++++++++++++++++++++++++
 lib/eal/common/eal_common_memzone.c | 12 +++++++++
 lib/eal/common/eal_common_proc.c    |  4 +--
 lib/eal/common/eal_private.h        |  5 ++++
 lib/eal/common/hotplug_mp.c         |  3 +++
 lib/eal/linux/eal.c                 |  3 ++-
 9 files changed, 125 insertions(+), 3 deletions(-)
 create mode 100644 app/test-mp/main.c
 create mode 100644 app/test-mp/meson.build
 create mode 100755 app/test-mp/run.sh
  

Patch

diff --git a/app/meson.build b/app/meson.build
index 8aaed59..1b80091 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -30,6 +30,7 @@  apps = [
         'test-flow-perf',
         'test-gpudev',
         'test-mldev',
+        'test-mp',
         'test-pipeline',
         'test-pmd',
         'test-regex',
diff --git a/app/test-mp/main.c b/app/test-mp/main.c
new file mode 100644
index 0000000..0a0fbbf
--- /dev/null
+++ b/app/test-mp/main.c
@@ -0,0 +1,85 @@ 
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <rte_atomic.h>
+#include <rte_common.h>
+#include <rte_cycles.h>
+#include <rte_debug.h>
+#include <rte_eal.h>
+#include <rte_launch.h>
+#include <rte_malloc.h>
+
+/* Number of "done" messages the primary process is still waiting for. */
+static rte_atomic32_t g_count;
+
+/* "done" IPC action, registered in the primary only: count one message. */
+static int
+done(const struct rte_mp_msg *msg __rte_unused, const void *arg __rte_unused)
+{
+	rte_atomic32_dec(&g_count);
+	return 0;
+}
+
+/*
+ * Usage: dpdk-test-mp [EAL options] <count>
+ *
+ * <count> is the number of "done" messages the primary waits for. Failures
+ * abort through rte_panic() instead of assert(), so the checks (and the
+ * timeout that ends the wait loop) are not compiled out by NDEBUG and a
+ * failing process still aborts, leaving the core file run.sh looks for.
+ */
+int
+main(int argc, char **argv)
+{
+	char *end;
+	long count;
+	void *p;
+	int ret;
+
+	ret = rte_eal_init(argc, argv);
+	if (ret < 0)
+		rte_panic("Cannot init EAL\n");
+
+	/* rte_eal_init() returns the number of arguments it consumed. */
+	if (argc - ret < 2)
+		rte_panic("Missing <count> argument\n");
+
+	errno = 0;
+	count = strtol(argv[ret + 1], &end, 10);
+	if (errno != 0 || end == argv[ret + 1] || *end != '\0' ||
+	    count < 0 || count > INT32_MAX)
+		rte_panic("Invalid <count> '%s'\n", argv[ret + 1]);
+	rte_atomic32_set(&g_count, (int32_t)count);
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+		ret = rte_mp_action_register("done", done);
+		if (ret != 0)
+			rte_panic("Cannot register \"done\" action\n");
+	}
+
+	/* 16 MiB, 4 KiB aligned, taken from the DPDK heap. */
+	p = rte_malloc(NULL, 0x1000000, 0x1000);
+	if (p == NULL)
+		rte_panic("rte_malloc failed\n");
+
+	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+		uint64_t timeout = rte_rdtsc() + 5 * rte_get_tsc_hz();
+
+		/* Busy-wait for at most 5 seconds of TSC time. */
+		while (rte_atomic32_read(&g_count) > 0) {
+			if (rte_rdtsc() >= timeout)
+				rte_panic("Timed out, %d \"done\" messages missing\n",
+					  rte_atomic32_read(&g_count));
+		}
+	} else {
+		struct rte_mp_msg msg = { .name = "done" };
+
+		if (rte_mp_sendmsg(&msg) < 0)
+			rte_panic("Cannot send \"done\" message\n");
+	}
+
+	rte_eal_cleanup();
+	return 0;
+}
diff --git a/app/test-mp/meson.build b/app/test-mp/meson.build
new file mode 100644
index 0000000..feb9e20
--- /dev/null
+++ b/app/test-mp/meson.build
@@ -0,0 +1,8 @@ 
+if is_windows
+    build = false
+    reason = 'not supported on Windows'
+    subdir_done()
+endif
+
+sources = files('main.c')
+deps = ['eal'] # EAL is the only DPDK library dependency declared for this app
diff --git a/app/test-mp/run.sh b/app/test-mp/run.sh
new file mode 100755
index 0000000..8de07e2
--- /dev/null
+++ b/app/test-mp/run.sh
@@ -0,0 +1,70 @@ 
+#!/bin/bash
+#
+# Start one test process per core at the same time, all with the same
+# --file-prefix and --proc-type=auto, and repeat that a number of times.
+#
+# usage: run.sh [-p lastcore] [-r repeat] [-L logdir] [-l] [-d] <test binary>
+#   -p  highest core number to use (default: nproc - 1)
+#   -r  number of iterations (default: 1)
+#   -L  base directory for log files (default: /tmp/dpdk_test_mp)
+#   -l  do not write log files
+#   -d  pass --log-level=lib.eal:8 to every process
+
+logdir=/tmp/dpdk_test_mp
+repeat=1
+lastcore=$(($(nproc) - 1))
+log=1
+debug=
+
+usage() {
+    echo "usage: $0 [-p lastcore] [-r repeat] [-L logdir] [-l] [-d] <test binary>" >&2
+    exit 1
+}
+
+while getopts p:r:lL:d op; do case $op in
+    p) lastcore=$OPTARG ;;
+    r) repeat=$OPTARG ;;
+    L) logdir=$OPTARG ;;
+    l) log= ;; # must be empty: [ "$log" ] is true for any non-empty value
+    d) debug=1 ;;
+    *) usage ;;
+esac done
+shift $((OPTIND-1))
+
+test=$1
+[ -n "$test" ] || usage
+logpath=$logdir/$(date +%y%m%d-%H%M%S)
+
+rm -f core.*
+pkill dpdk-test-mp
+
+status=0
+for j in $(seq "$repeat") ; do
+    [ "$log" ] && mkdir -p "$logpath/$j"
+    pids=()
+    for i in $(seq 0 "$lastcore") ; do
+        args=(-l "$i" --file-prefix=dpdk1 --proc-type=auto)
+        if [ "$debug" ] ; then
+            args+=(--log-level=lib.eal:8)
+        fi
+        # The trailing argument is the number of "done" messages the primary
+        # waits for: $lastcore + 1 processes are started, one is the primary.
+        if [ "$log" ] ; then
+            "$test" "${args[@]}" "$lastcore" >"$logpath/$j/$i.log" 2>&1 &
+        else
+            "$test" "${args[@]}" "$lastcore" &
+        fi
+        pids+=($!)
+    done
+    # A bare "wait" always returns 0; wait per pid to see each exit status.
+    failed=0
+    for pid in "${pids[@]}" ; do
+        wait "$pid" || failed=1
+    done
+    if [ $failed -ne 0 ] || ls core.* >/dev/null 2>&1 ; then
+        echo "iteration $j failed" >&2
+        status=1
+        break
+    fi
+    echo iteration $j passed
+done
+exit $status