[v3,1/2] test/service: add perf measurements for with stats mode

Message ID 20220711131825.3373195-1-harry.van.haaren@intel.com (mailing list archive)
State Accepted, archived
Delegated to: David Marchand
Headers
Series [v3,1/2] test/service: add perf measurements for with stats mode |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Van Haaren, Harry July 11, 2022, 1:18 p.m. UTC
  This commit improves the performance reporting of the service
cores polling loop to show results both with and without
statistics collection. Collecting cycle statistics is costly, due
to calls to rte_rdtsc() per service iteration.

Reported-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
Suggested-by: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
Suggested-by: Morten Brørup <mb@smartsharesystems.com>
Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>

---

This is split out as a separate patch from the fix to allow
measuring the before/after of the service stats atomic fixup.
---
 app/test/test_service_cores.c | 36 ++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)
  

Comments

Mattias Rönnblom Sept. 2, 2022, 5:17 p.m. UTC | #1
On 2022-07-11 15:18, Harry van Haaren wrote:
> This commit improves the performance reporting of the service
> cores polling loop to show both with and without statistics
> collection modes. Collecting cycle statistics is costly, due
> to calls to rte_rdtsc() per service iteration.

That is true for a service deployed on only a single core. For 
multi-core services, non-rdtsc-related overhead dominates. For example, 
if the service is deployed on 11 cores, the extra statistics-related 
overhead is ~1000 cc/service call on x86_64. 2x rdtsc shouldn't be more 
than ~50 cc.

> 
> Reported-by: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> Suggested-by: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Suggested-by: Morten Brørup <mb@smartsharesystems.com>
> Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> 
> ---
> 
> This is split out as a separate patch from the fix to allow
> measuring the before/after of the service stats atomic fixup.
> ---
>   app/test/test_service_cores.c | 36 ++++++++++++++++++++++++-----------
>   1 file changed, 25 insertions(+), 11 deletions(-)
> 
> diff --git a/app/test/test_service_cores.c b/app/test/test_service_cores.c
> index ced6ed0081..7415b6b686 100644
> --- a/app/test/test_service_cores.c
> +++ b/app/test/test_service_cores.c
> @@ -777,6 +777,22 @@ service_run_on_app_core_func(void *arg)
>   	return rte_service_run_iter_on_app_lcore(*delay_service_id, 1);
>   }
>   
> +static float
> +service_app_lcore_perf_measure(uint32_t id)
> +{
> +	/* Performance test: call in a loop, and measure tsc() */
> +	const uint32_t perf_iters = (1 << 12);
> +	uint64_t start = rte_rdtsc();
> +	uint32_t i;
> +	for (i = 0; i < perf_iters; i++) {
> +		int err = service_run_on_app_core_func(&id);

In a real-world scenario, the latency of this function isn't 
representative of the overall service core overhead.

For example, consider a scenario where an lcore has a single service 
mapped to it. rte_service.c will call service_run() 64 times, but only 
one of those calls will be a "hit" that actually runs the service. One 
iteration in the service loop costs ~600 cc, on a machine where this 
performance benchmark reports 128 cc. (Both with statistics disabled.)

For low-latency services, this is a significant overhead.

> +		TEST_ASSERT_EQUAL(0, err, "perf test: returned run failure");
> +	}
> +	uint64_t end = rte_rdtsc();
> +
> +	return (end - start)/(float)perf_iters;
> +}
> +
>   static int
>   service_app_lcore_poll_impl(const int mt_safe)
>   {
> @@ -828,17 +844,15 @@ service_app_lcore_poll_impl(const int mt_safe)
>   				"MT Unsafe: App core1 didn't return -EBUSY");
>   	}
>   
> -	/* Performance test: call in a loop, and measure tsc() */
> -	const uint32_t perf_iters = (1 << 12);
> -	uint64_t start = rte_rdtsc();
> -	uint32_t i;
> -	for (i = 0; i < perf_iters; i++) {
> -		int err = service_run_on_app_core_func(&id);
> -		TEST_ASSERT_EQUAL(0, err, "perf test: returned run failure");
> -	}
> -	uint64_t end = rte_rdtsc();
> -	printf("perf test for %s: %0.1f cycles per call\n", mt_safe ?
> -		"MT Safe" : "MT Unsafe", (end - start)/(float)perf_iters);
> +	/* Measure performance of no-stats and with-stats. */
> +	float cyc_no_stats = service_app_lcore_perf_measure(id);
> +
> +	TEST_ASSERT_EQUAL(0, rte_service_set_stats_enable(id, 1),
> +				"failed to enable stats for service.");
> +	float cyc_with_stats = service_app_lcore_perf_measure(id);
> +
> +	printf("perf test for %s, no stats: %0.1f, with stats %0.1f cycles/call\n",
> +		mt_safe ? "MT Safe" : "MT Unsafe", cyc_no_stats, cyc_with_stats);
>   
>   	unregister_all();
>   	return TEST_SUCCESS;
  

Patch

diff --git a/app/test/test_service_cores.c b/app/test/test_service_cores.c
index ced6ed0081..7415b6b686 100644
--- a/app/test/test_service_cores.c
+++ b/app/test/test_service_cores.c
@@ -777,6 +777,22 @@  service_run_on_app_core_func(void *arg)
 	return rte_service_run_iter_on_app_lcore(*delay_service_id, 1);
 }
 
+static float
+service_app_lcore_perf_measure(uint32_t id)
+{
+	/* Performance test: call in a loop, and measure tsc() */
+	const uint32_t perf_iters = (1 << 12);
+	uint64_t start = rte_rdtsc();
+	uint32_t i;
+	for (i = 0; i < perf_iters; i++) {
+		int err = service_run_on_app_core_func(&id);
+		TEST_ASSERT_EQUAL(0, err, "perf test: returned run failure");
+	}
+	uint64_t end = rte_rdtsc();
+
+	return (end - start)/(float)perf_iters;
+}
+
 static int
 service_app_lcore_poll_impl(const int mt_safe)
 {
@@ -828,17 +844,15 @@  service_app_lcore_poll_impl(const int mt_safe)
 				"MT Unsafe: App core1 didn't return -EBUSY");
 	}
 
-	/* Performance test: call in a loop, and measure tsc() */
-	const uint32_t perf_iters = (1 << 12);
-	uint64_t start = rte_rdtsc();
-	uint32_t i;
-	for (i = 0; i < perf_iters; i++) {
-		int err = service_run_on_app_core_func(&id);
-		TEST_ASSERT_EQUAL(0, err, "perf test: returned run failure");
-	}
-	uint64_t end = rte_rdtsc();
-	printf("perf test for %s: %0.1f cycles per call\n", mt_safe ?
-		"MT Safe" : "MT Unsafe", (end - start)/(float)perf_iters);
+	/* Measure performance of no-stats and with-stats. */
+	float cyc_no_stats = service_app_lcore_perf_measure(id);
+
+	TEST_ASSERT_EQUAL(0, rte_service_set_stats_enable(id, 1),
+				"failed to enable stats for service.");
+	float cyc_with_stats = service_app_lcore_perf_measure(id);
+
+	printf("perf test for %s, no stats: %0.1f, with stats %0.1f cycles/call\n",
+		mt_safe ? "MT Safe" : "MT Unsafe", cyc_no_stats, cyc_with_stats);
 
 	unregister_all();
 	return TEST_SUCCESS;