service: split tests to perf and autotest to avoid spurious CI failures

Message ID 20230224173637.243266-1-harry.van.haaren@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series service: split tests to perf and autotest to avoid spurious CI failures |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/Intel-compilation success Compilation OK
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/intel-Testing success Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-x86_64-compile-testing success Testing PASS
ci/github-robot: build success github build: passed
ci/iol-testing success Testing PASS
ci/iol-x86_64-unit-testing success Testing PASS

Commit Message

Van Haaren, Harry Feb. 24, 2023, 5:36 p.m. UTC
  On some CI runs, some service-cores tests spuriously fail as the
service lcore thread is not actually scheduled by the OS in the
given amount of time.

Increasing timeouts has not resolved the issue in the CI, so the
solution in this patch is to move them to a separate perf test
suite.

Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>

---

See DPDK ML discussion in this thread:
http://mails.dpdk.org/archives/dev/2023-February/263523.html
---
 app/test/meson.build          |  1 +
 app/test/test_service_cores.c | 32 +++++++++++++++++++++++++++-----
 2 files changed, 28 insertions(+), 5 deletions(-)
  

Comments

David Marchand Feb. 27, 2023, 4:08 p.m. UTC | #1
Hello,

On Fri, Feb 24, 2023 at 7:04 PM Harry van Haaren
<harry.van.haaren@intel.com> wrote:
>
> On some CI runs, some service-cores tests spuriously fail as the
> service lcore thread is not actually scheduled by the OS in the
> given amount of time.
>
> Increasing timeouts has not resolved the issue in the CI, so the
> solution in this patch is to move them to a separate perf test
> suite.
>
> Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
>
> ---
>
> See DPDK ML discussion in this thread:
> http://mails.dpdk.org/archives/dev/2023-February/263523.html
> ---
>  app/test/meson.build          |  1 +
>  app/test/test_service_cores.c | 32 +++++++++++++++++++++++++++-----
>  2 files changed, 28 insertions(+), 5 deletions(-)
>
> diff --git a/app/test/meson.build b/app/test/meson.build
> index f34d19e3c3..2db5ccf4ff 100644
> --- a/app/test/meson.build
> +++ b/app/test/meson.build
> @@ -287,6 +287,7 @@ perf_test_names = [
>          'pie_perf',
>          'distributor_perf_autotest',
>          'pmd_perf_autotest',
> +        'service_perf_autotest',
>          'stack_perf_autotest',
>          'stack_lf_perf_autotest',
>          'rand_perf_autotest',
> diff --git a/app/test/test_service_cores.c b/app/test/test_service_cores.c
> index 637fcd7cf9..06653dfdef 100644
> --- a/app/test/test_service_cores.c
> +++ b/app/test/test_service_cores.c
> @@ -1022,17 +1022,12 @@ static struct unit_test_suite service_tests  = {
>                 TEST_CASE_ST(dummy_register, NULL, service_name),
>                 TEST_CASE_ST(dummy_register, NULL, service_get_by_name),
>                 TEST_CASE_ST(dummy_register, NULL, service_dump),
> -               TEST_CASE_ST(dummy_register, NULL, service_attr_get),
> -               TEST_CASE_ST(dummy_register, NULL, service_lcore_attr_get),
>                 TEST_CASE_ST(dummy_register, NULL, service_probe_capability),
>                 TEST_CASE_ST(dummy_register, NULL, service_start_stop),
>                 TEST_CASE_ST(dummy_register, NULL, service_lcore_add_del),
> -               TEST_CASE_ST(dummy_register, NULL, service_lcore_start_stop),
>                 TEST_CASE_ST(dummy_register, NULL, service_lcore_en_dis_able),
>                 TEST_CASE_ST(dummy_register, NULL, service_mt_unsafe_poll),
>                 TEST_CASE_ST(dummy_register, NULL, service_mt_safe_poll),
> -               TEST_CASE_ST(dummy_register, NULL, service_app_lcore_mt_safe),
> -               TEST_CASE_ST(dummy_register, NULL, service_app_lcore_mt_unsafe),
>                 TEST_CASE_ST(dummy_register, NULL, service_may_be_active),
>                 TEST_CASE_ST(dummy_register, NULL, service_active_two_cores),
>                 TEST_CASES_END() /**< NULL terminate unit test array */
> @@ -1046,3 +1041,30 @@ test_service_common(void)
>  }
>
>  REGISTER_TEST_COMMAND(service_autotest, test_service_common);
> +
> +
> +/* The tests below have been split from the auto-test suite, as the

What is the auto-test suite?
Plus "as the when" reads strange.

In the end, I don't think it helps much to have this comment in the code.
The commitlog is supposed to tell the story, so I would simply remove
this comment.


> + * when they are run in a cloud CI environment they can give false-positive
> + * errors, due to the service-cores not being scheduled by the OS.
> + */
> +static struct unit_test_suite service_perf_tests  = {
> +       .suite_name = "service core test suite",

Maybe add "performance" in the name, so we have a uniquely named
testsuite object.


> +       .setup = testsuite_setup,
> +       .teardown = testsuite_teardown,
> +       .unit_test_cases = {
> +               TEST_CASE_ST(dummy_register, NULL, service_attr_get),
> +               TEST_CASE_ST(dummy_register, NULL, service_lcore_attr_get),
> +               TEST_CASE_ST(dummy_register, NULL, service_lcore_start_stop),

Looking at service_lcore_running_check(), don't you think
service_may_be_active() and service_active_two_cores() are also
subject to race?


> +               TEST_CASE_ST(dummy_register, NULL, service_app_lcore_mt_safe),
> +               TEST_CASE_ST(dummy_register, NULL, service_app_lcore_mt_unsafe),
> +               TEST_CASES_END() /**< NULL terminate unit test array */
> +       }
> +};
> +
> +static int
> +test_service_perf(void)
> +{
> +       return unit_test_suite_runner(&service_perf_tests);
> +}
> +
> +REGISTER_TEST_COMMAND(service_perf_autotest, test_service_perf);
> --
> 2.34.1
>
  
David Marchand March 3, 2023, 8:37 a.m. UTC | #2
On Mon, Feb 27, 2023 at 5:08 PM David Marchand
<david.marchand@redhat.com> wrote:
>
> Hello,
>
> On Fri, Feb 24, 2023 at 7:04 PM Harry van Haaren
> <harry.van.haaren@intel.com> wrote:
> >
> > On some CI runs, some service-cores tests spuriously fail as the
> > service lcore thread is not actually scheduled by the OS in the
> > given amount of time.
> >
> > Increasing timeouts has not resolved the issue in the CI, so the
> > solution in this patch is to move them to a separate perf test
> > suite.
> >
> > Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> >
> > ---
> >
> > See DPDK ML discussion in this thread:
> > http://mails.dpdk.org/archives/dev/2023-February/263523.html
> > ---
> >  app/test/meson.build          |  1 +
> >  app/test/test_service_cores.c | 32 +++++++++++++++++++++++++++-----
> >  2 files changed, 28 insertions(+), 5 deletions(-)
> >
> > diff --git a/app/test/meson.build b/app/test/meson.build
> > index f34d19e3c3..2db5ccf4ff 100644
> > --- a/app/test/meson.build
> > +++ b/app/test/meson.build
> > @@ -287,6 +287,7 @@ perf_test_names = [
> >          'pie_perf',
> >          'distributor_perf_autotest',
> >          'pmd_perf_autotest',
> > +        'service_perf_autotest',
> >          'stack_perf_autotest',
> >          'stack_lf_perf_autotest',
> >          'rand_perf_autotest',
> > diff --git a/app/test/test_service_cores.c b/app/test/test_service_cores.c
> > index 637fcd7cf9..06653dfdef 100644
> > --- a/app/test/test_service_cores.c
> > +++ b/app/test/test_service_cores.c
> > @@ -1022,17 +1022,12 @@ static struct unit_test_suite service_tests  = {
> >                 TEST_CASE_ST(dummy_register, NULL, service_name),
> >                 TEST_CASE_ST(dummy_register, NULL, service_get_by_name),
> >                 TEST_CASE_ST(dummy_register, NULL, service_dump),
> > -               TEST_CASE_ST(dummy_register, NULL, service_attr_get),
> > -               TEST_CASE_ST(dummy_register, NULL, service_lcore_attr_get),
> >                 TEST_CASE_ST(dummy_register, NULL, service_probe_capability),
> >                 TEST_CASE_ST(dummy_register, NULL, service_start_stop),
> >                 TEST_CASE_ST(dummy_register, NULL, service_lcore_add_del),
> > -               TEST_CASE_ST(dummy_register, NULL, service_lcore_start_stop),
> >                 TEST_CASE_ST(dummy_register, NULL, service_lcore_en_dis_able),
> >                 TEST_CASE_ST(dummy_register, NULL, service_mt_unsafe_poll),
> >                 TEST_CASE_ST(dummy_register, NULL, service_mt_safe_poll),
> > -               TEST_CASE_ST(dummy_register, NULL, service_app_lcore_mt_safe),
> > -               TEST_CASE_ST(dummy_register, NULL, service_app_lcore_mt_unsafe),
> >                 TEST_CASE_ST(dummy_register, NULL, service_may_be_active),
> >                 TEST_CASE_ST(dummy_register, NULL, service_active_two_cores),
> >                 TEST_CASES_END() /**< NULL terminate unit test array */
> > @@ -1046,3 +1041,30 @@ test_service_common(void)
> >  }
> >
> >  REGISTER_TEST_COMMAND(service_autotest, test_service_common);
> > +
> > +
> > +/* The tests below have been split from the auto-test suite, as the
>
> What is the auto-test suite?
> Plus "as the when" reads strange.
>
> In the end, I don't think it helps much to have this comment in the code.
> The commitlog is supposed to tell the story, so I would simply remove
> this comment.
>
>
> > + * when they are run in a cloud CI environment they can give false-positive
> > + * errors, due to the service-cores not being scheduled by the OS.
> > + */
> > +static struct unit_test_suite service_perf_tests  = {
> > +       .suite_name = "service core test suite",
>
> Maybe add "performance" in the name, so we have a uniquely named
> testsuite object.
>
>
> > +       .setup = testsuite_setup,
> > +       .teardown = testsuite_teardown,
> > +       .unit_test_cases = {
> > +               TEST_CASE_ST(dummy_register, NULL, service_attr_get),
> > +               TEST_CASE_ST(dummy_register, NULL, service_lcore_attr_get),
> > +               TEST_CASE_ST(dummy_register, NULL, service_lcore_start_stop),
>
> Looking at service_lcore_running_check(), don't you think
> service_may_be_active() and service_active_two_cores() are also
> subject to race?
>
>
> > +               TEST_CASE_ST(dummy_register, NULL, service_app_lcore_mt_safe),
> > +               TEST_CASE_ST(dummy_register, NULL, service_app_lcore_mt_unsafe),
> > +               TEST_CASES_END() /**< NULL terminate unit test array */
> > +       }
> > +};
> > +
> > +static int
> > +test_service_perf(void)
> > +{
> > +       return unit_test_suite_runner(&service_perf_tests);
> > +}
> > +
> > +REGISTER_TEST_COMMAND(service_perf_autotest, test_service_perf);
> > --
> > 2.34.1
> >

ping.
  
Van Haaren, Harry March 3, 2023, 10:59 a.m. UTC | #3
> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: Monday, February 27, 2023 4:09 PM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: dev@dpdk.org; dpdklab@iol.unh.edu; ci@dpdk.org;
> Honnappa.Nagarahalli@arm.com; mattias.ronnblom
> <mattias.ronnblom@ericsson.com>; mb@smartsharesystems.com;
> roretzla@linux.microsoft.com; aconole@redhat.com; Richardson, Bruce
> <bruce.richardson@intel.com>
> Subject: Re: [PATCH] service: split tests to perf and autotest to avoid spurious CI
> failures
> 
> Hello,
> 
> On Fri, Feb 24, 2023 at 7:04 PM Harry van Haaren
> <harry.van.haaren@intel.com> wrote:
> >
> > On some CI runs, some service-cores tests spuriously fail as the
> > service lcore thread is not actually scheduled by the OS in the
> > given amount of time.
> >
> > Increasing timeouts has not resolved the issue in the CI, so the
> > solution in this patch is to move them to a separate perf test
> > suite.
> >
> > Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> >
> > ---
> >
> > See DPDK ML discussion in this thread:
> > http://mails.dpdk.org/archives/dev/2023-February/263523.html
> > ---
> >  app/test/meson.build          |  1 +
> >  app/test/test_service_cores.c | 32 +++++++++++++++++++++++++++-----
> >  2 files changed, 28 insertions(+), 5 deletions(-)
> >
> > diff --git a/app/test/meson.build b/app/test/meson.build
> > index f34d19e3c3..2db5ccf4ff 100644
> > --- a/app/test/meson.build
> > +++ b/app/test/meson.build
> > @@ -287,6 +287,7 @@ perf_test_names = [
> >          'pie_perf',
> >          'distributor_perf_autotest',
> >          'pmd_perf_autotest',
> > +        'service_perf_autotest',
> >          'stack_perf_autotest',
> >          'stack_lf_perf_autotest',
> >          'rand_perf_autotest',
> > diff --git a/app/test/test_service_cores.c b/app/test/test_service_cores.c
> > index 637fcd7cf9..06653dfdef 100644
> > --- a/app/test/test_service_cores.c
> > +++ b/app/test/test_service_cores.c
> > @@ -1022,17 +1022,12 @@ static struct unit_test_suite service_tests  = {
> >                 TEST_CASE_ST(dummy_register, NULL, service_name),
> >                 TEST_CASE_ST(dummy_register, NULL, service_get_by_name),
> >                 TEST_CASE_ST(dummy_register, NULL, service_dump),
> > -               TEST_CASE_ST(dummy_register, NULL, service_attr_get),
> > -               TEST_CASE_ST(dummy_register, NULL, service_lcore_attr_get),
> >                 TEST_CASE_ST(dummy_register, NULL, service_probe_capability),
> >                 TEST_CASE_ST(dummy_register, NULL, service_start_stop),
> >                 TEST_CASE_ST(dummy_register, NULL, service_lcore_add_del),
> > -               TEST_CASE_ST(dummy_register, NULL, service_lcore_start_stop),
> >                 TEST_CASE_ST(dummy_register, NULL, service_lcore_en_dis_able),
> >                 TEST_CASE_ST(dummy_register, NULL, service_mt_unsafe_poll),
> >                 TEST_CASE_ST(dummy_register, NULL, service_mt_safe_poll),
> > -               TEST_CASE_ST(dummy_register, NULL, service_app_lcore_mt_safe),
> > -               TEST_CASE_ST(dummy_register, NULL, service_app_lcore_mt_unsafe),
> >                 TEST_CASE_ST(dummy_register, NULL, service_may_be_active),
> >                 TEST_CASE_ST(dummy_register, NULL, service_active_two_cores),
> >                 TEST_CASES_END() /**< NULL terminate unit test array */
> > @@ -1046,3 +1041,30 @@ test_service_common(void)
> >  }
> >
> >  REGISTER_TEST_COMMAND(service_autotest, test_service_common);
> > +
> > +
> > +/* The tests below have been split from the auto-test suite, as the
> 
> What is the auto-test suite?
> Plus "as the when" reads strange.
> 
> In the end, I don't think it helps much to have this comment in the code.
> The commitlog is supposed to tell the story, so I would simply remove
> this comment.

OK, will remove comment.

> > + * when they are run in a cloud CI environment they can give false-positive
> > + * errors, due to the service-cores not being scheduled by the OS.
> > + */
> > +static struct unit_test_suite service_perf_tests  = {
> > +       .suite_name = "service core test suite",
> 
> Maybe add "performance" in the name, so we have a uniquely named
> testsuite object.

Ack.


> > +       .setup = testsuite_setup,
> > +       .teardown = testsuite_teardown,
> > +       .unit_test_cases = {
> > +               TEST_CASE_ST(dummy_register, NULL, service_attr_get),
> > +               TEST_CASE_ST(dummy_register, NULL, service_lcore_attr_get),
> > +               TEST_CASE_ST(dummy_register, NULL, service_lcore_start_stop),
> 
> Looking at service_lcore_running_check(), don't you think
> service_may_be_active() and service_active_two_cores() are also
> subject to race?

Perhaps, but those haven't *actually* been failing in any of the reports.
I'd prefer leave tests running if they're not causing issues in the CI.

<snip>

V2 patch on the way.
  
David Marchand March 7, 2023, 1:48 p.m. UTC | #4
On Fri, Mar 3, 2023 at 11:59 AM Van Haaren, Harry
<harry.van.haaren@intel.com> wrote:
> > > +       .setup = testsuite_setup,
> > > +       .teardown = testsuite_teardown,
> > > +       .unit_test_cases = {
> > > +               TEST_CASE_ST(dummy_register, NULL, service_attr_get),
> > > +               TEST_CASE_ST(dummy_register, NULL, service_lcore_attr_get),
> > > +               TEST_CASE_ST(dummy_register, NULL, service_lcore_start_stop),
> >
> > Looking at service_lcore_running_check(), don't you think
> > service_may_be_active() and service_active_two_cores() are also
> > subject to race?
>
> Perhaps, but those haven't *actually* been failing in any of the reports.
> I'd prefer leave tests running if they're not causing issues in the CI.

service_may_be_active did fail in the near past (report from October
that triggered the discussion and the timeout extension patch).
So my fear is that we will see some ocurrences.

Time will tell.
  

Patch

diff --git a/app/test/meson.build b/app/test/meson.build
index f34d19e3c3..2db5ccf4ff 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -287,6 +287,7 @@  perf_test_names = [
         'pie_perf',
         'distributor_perf_autotest',
         'pmd_perf_autotest',
+        'service_perf_autotest',
         'stack_perf_autotest',
         'stack_lf_perf_autotest',
         'rand_perf_autotest',
diff --git a/app/test/test_service_cores.c b/app/test/test_service_cores.c
index 637fcd7cf9..06653dfdef 100644
--- a/app/test/test_service_cores.c
+++ b/app/test/test_service_cores.c
@@ -1022,17 +1022,12 @@  static struct unit_test_suite service_tests  = {
 		TEST_CASE_ST(dummy_register, NULL, service_name),
 		TEST_CASE_ST(dummy_register, NULL, service_get_by_name),
 		TEST_CASE_ST(dummy_register, NULL, service_dump),
-		TEST_CASE_ST(dummy_register, NULL, service_attr_get),
-		TEST_CASE_ST(dummy_register, NULL, service_lcore_attr_get),
 		TEST_CASE_ST(dummy_register, NULL, service_probe_capability),
 		TEST_CASE_ST(dummy_register, NULL, service_start_stop),
 		TEST_CASE_ST(dummy_register, NULL, service_lcore_add_del),
-		TEST_CASE_ST(dummy_register, NULL, service_lcore_start_stop),
 		TEST_CASE_ST(dummy_register, NULL, service_lcore_en_dis_able),
 		TEST_CASE_ST(dummy_register, NULL, service_mt_unsafe_poll),
 		TEST_CASE_ST(dummy_register, NULL, service_mt_safe_poll),
-		TEST_CASE_ST(dummy_register, NULL, service_app_lcore_mt_safe),
-		TEST_CASE_ST(dummy_register, NULL, service_app_lcore_mt_unsafe),
 		TEST_CASE_ST(dummy_register, NULL, service_may_be_active),
 		TEST_CASE_ST(dummy_register, NULL, service_active_two_cores),
 		TEST_CASES_END() /**< NULL terminate unit test array */
@@ -1046,3 +1041,30 @@  test_service_common(void)
 }
 
 REGISTER_TEST_COMMAND(service_autotest, test_service_common);
+
+
+/* The tests below have been split from the auto-test suite, as the
+ * when they are run in a cloud CI environment they can give false-positive
+ * errors, due to the service-cores not being scheduled by the OS.
+ */
+static struct unit_test_suite service_perf_tests  = {
+	.suite_name = "service core test suite",
+	.setup = testsuite_setup,
+	.teardown = testsuite_teardown,
+	.unit_test_cases = {
+		TEST_CASE_ST(dummy_register, NULL, service_attr_get),
+		TEST_CASE_ST(dummy_register, NULL, service_lcore_attr_get),
+		TEST_CASE_ST(dummy_register, NULL, service_lcore_start_stop),
+		TEST_CASE_ST(dummy_register, NULL, service_app_lcore_mt_safe),
+		TEST_CASE_ST(dummy_register, NULL, service_app_lcore_mt_unsafe),
+		TEST_CASES_END() /**< NULL terminate unit test array */
+	}
+};
+
+static int
+test_service_perf(void)
+{
+	return unit_test_suite_runner(&service_perf_tests);
+}
+
+REGISTER_TEST_COMMAND(service_perf_autotest, test_service_perf);