From patchwork Wed Mar 6 14:45:55 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Eads, Gage" X-Patchwork-Id: 50855 X-Patchwork-Delegate: thomas@monjalon.net Return-Path: X-Original-To: patchwork@dpdk.org Delivered-To: patchwork@dpdk.org Received: from [92.243.14.124] (localhost [127.0.0.1]) by dpdk.org (Postfix) with ESMTP id 6FBDB5F28; Wed, 6 Mar 2019 15:46:13 +0100 (CET) Received: from mga04.intel.com (mga04.intel.com [192.55.52.120]) by dpdk.org (Postfix) with ESMTP id CFC235920 for ; Wed, 6 Mar 2019 15:46:06 +0100 (CET) X-Amp-Result: SKIPPED(no attachment in message) X-Amp-File-Uploaded: False Received: from fmsmga007.fm.intel.com ([10.253.24.52]) by fmsmga104.fm.intel.com with ESMTP/TLS/DHE-RSA-AES256-GCM-SHA384; 06 Mar 2019 06:46:06 -0800 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.58,448,1544515200"; d="scan'208";a="132058518" Received: from txasoft-yocto.an.intel.com ([10.123.72.192]) by fmsmga007.fm.intel.com with ESMTP; 06 Mar 2019 06:46:05 -0800 From: Gage Eads To: dev@dpdk.org Cc: olivier.matz@6wind.com, arybchenko@solarflare.com, bruce.richardson@intel.com, konstantin.ananyev@intel.com, gavin.hu@arm.com, Honnappa.Nagarahalli@arm.com, nd@arm.com, thomas@monjalon.net Date: Wed, 6 Mar 2019 08:45:55 -0600 Message-Id: <20190306144559.391-5-gage.eads@intel.com> X-Mailer: git-send-email 2.13.6 In-Reply-To: <20190306144559.391-1-gage.eads@intel.com> References: <20190305164256.2367-1-gage.eads@intel.com> <20190306144559.391-1-gage.eads@intel.com> Subject: [dpdk-dev] [PATCH v3 4/8] test/stack: add stack perf test X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" stack_perf_autotest tests the following with one lcore: - Cycles to attempt to pop an empty stack - Cycles to push then pop a single object - Cycles to push 
then pop a burst of 32 objects It also tests the cycles to push then pop a burst of 8 and 32 objects with the following lcore combinations (if possible): - Two hyperthreads - Two physical cores - Two physical cores on separate NUMA nodes - All available lcores Signed-off-by: Gage Eads Reviewed-by: Olivier Matz --- app/test/Makefile | 1 + app/test/meson.build | 2 + app/test/test_stack_perf.c | 343 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 346 insertions(+) create mode 100644 app/test/test_stack_perf.c diff --git a/app/test/Makefile b/app/test/Makefile index 47cf98a3a..f9536fb31 100644 --- a/app/test/Makefile +++ b/app/test/Makefile @@ -90,6 +90,7 @@ endif SRCS-y += test_rwlock.c SRCS-$(CONFIG_RTE_LIBRTE_STACK) += test_stack.c +SRCS-$(CONFIG_RTE_LIBRTE_STACK) += test_stack_perf.c SRCS-$(CONFIG_RTE_LIBRTE_TIMER) += test_timer.c SRCS-$(CONFIG_RTE_LIBRTE_TIMER) += test_timer_perf.c diff --git a/app/test/meson.build b/app/test/meson.build index b00e1201a..ba3cb6261 100644 --- a/app/test/meson.build +++ b/app/test/meson.build @@ -96,6 +96,7 @@ test_sources = files('commands.c', 'test_service_cores.c', 'test_spinlock.c', 'test_stack.c', + 'test_stack_perf.c', 'test_string_fns.c', 'test_table.c', 'test_table_acl.c', @@ -240,6 +241,7 @@ perf_test_names = [ 'distributor_perf_autotest', 'ring_pmd_perf_autotest', 'pmd_perf_autotest', + 'stack_perf_autotest', ] # All test cases in driver_test_names list are non-parallel diff --git a/app/test/test_stack_perf.c b/app/test/test_stack_perf.c new file mode 100644 index 000000000..484370d30 --- /dev/null +++ b/app/test/test_stack_perf.c @@ -0,0 +1,343 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + + +#include +#include +#include +#include +#include +#include + +#include "test.h" + +#define STACK_NAME "STACK_PERF" +#define MAX_BURST 32 +#define STACK_SIZE (RTE_MAX_LCORE * MAX_BURST) + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +/* + * Push/pop bulk sizes, 
marked volatile so they aren't treated as compile-time + * constants. + */ +static volatile unsigned int bulk_sizes[] = {8, MAX_BURST}; + +static rte_atomic32_t lcore_barrier; + +struct lcore_pair { /* two lcore ids chosen by the get_two_*() helpers */ + unsigned int c1; + unsigned int c2; +}; + +static int +get_two_hyperthreads(struct lcore_pair *lcp) /* returns 0 with *lcp filled in, or 1 if no pair qualifies */ +{ + unsigned int socket[2]; + unsigned int core[2]; + unsigned int id[2]; + + RTE_LCORE_FOREACH(id[0]) { + RTE_LCORE_FOREACH(id[1]) { + if (id[0] == id[1]) + continue; + core[0] = lcore_config[id[0]].core_id; + core[1] = lcore_config[id[1]].core_id; + socket[0] = lcore_config[id[0]].socket_id; + socket[1] = lcore_config[id[1]].socket_id; + if ((core[0] == core[1]) && (socket[0] == socket[1])) { /* distinct lcores with the same core_id and socket_id */ + lcp->c1 = id[0]; + lcp->c2 = id[1]; + return 0; + } + } + } + + return 1; +} + +static int +get_two_cores(struct lcore_pair *lcp) /* returns 0 with *lcp filled in, or 1 if no pair qualifies */ +{ + unsigned int socket[2]; + unsigned int core[2]; + unsigned int id[2]; + + RTE_LCORE_FOREACH(id[0]) { + RTE_LCORE_FOREACH(id[1]) { + if (id[0] == id[1]) + continue; + core[0] = lcore_config[id[0]].core_id; + core[1] = lcore_config[id[1]].core_id; + socket[0] = lcore_config[id[0]].socket_id; + socket[1] = lcore_config[id[1]].socket_id; + if ((core[0] != core[1]) && (socket[0] == socket[1])) { /* distinct lcores with different core_id but the same socket_id */ + lcp->c1 = id[0]; + lcp->c2 = id[1]; + return 0; + } + } + } + + return 1; +} + +static int +get_two_sockets(struct lcore_pair *lcp) /* returns 0 with *lcp filled in, or 1 if no pair qualifies */ +{ + unsigned int socket[2]; + unsigned int id[2]; + + RTE_LCORE_FOREACH(id[0]) { + RTE_LCORE_FOREACH(id[1]) { + if (id[0] == id[1]) + continue; + socket[0] = lcore_config[id[0]].socket_id; + socket[1] = lcore_config[id[1]].socket_id; + if (socket[0] != socket[1]) { /* distinct lcores with different socket_id */ + lcp->c1 = id[0]; + lcp->c2 = id[1]; + return 0; + } + } + } + + return 1; +} + +/* Measure the cycle cost of popping an empty stack. 
*/ +static void +test_empty_pop(struct rte_stack *s) +{ + unsigned int iterations = 100000000; + void *objs[MAX_BURST]; + unsigned int i; + + uint64_t start = rte_rdtsc(); + + for (i = 0; i < iterations; i++) + rte_stack_pop(s, objs, bulk_sizes[0]); + + uint64_t end = rte_rdtsc(); + + printf("Stack empty pop: %.2F\n", + (double)(end - start) / iterations); /* rte_rdtsc() delta averaged over the pop calls */ +} + +struct thread_args { + struct rte_stack *s; /* in: stack to push to and pop from */ + unsigned int sz; /* in: number of objects per push/pop call */ + double avg; /* out: written by bulk_push_pop() */ +}; + +/* Measure the average per-pointer cycle cost of stack push and pop */ +static int +bulk_push_pop(void *p) +{ + unsigned int iterations = 1000000; + struct thread_args *args = p; + void *objs[MAX_BURST] = {0}; + unsigned int size, i; + struct rte_stack *s; + + s = args->s; + size = args->sz; + + rte_atomic32_sub(&lcore_barrier, 1); /* check in, then spin until the barrier count reaches 0 */ + while (rte_atomic32_read(&lcore_barrier) != 0) + rte_pause(); + + uint64_t start = rte_rdtsc(); + + for (i = 0; i < iterations; i++) { + rte_stack_push(s, objs, size); + rte_stack_pop(s, objs, size); + } + + uint64_t end = rte_rdtsc(); + + args->avg = ((double)(end - start))/(iterations * size); /* rte_rdtsc() delta per object over all iterations */ + + return 0; +} + +/* + * Run bulk_push_pop() simultaneously on pairs of cores, to measure stack + * perf between hyperthread siblings, cores on the same socket, and cores + * on different sockets. 
+ */ +static void +run_on_core_pair(struct lcore_pair *cores, struct rte_stack *s, + lcore_function_t fn) +{ + struct thread_args args[2]; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) { + rte_atomic32_set(&lcore_barrier, 2); /* both lcores of the pair check in before timing starts */ + + args[0].sz = args[1].sz = bulk_sizes[i]; + args[0].s = args[1].s = s; + + if (cores->c1 == rte_get_master_lcore()) { + rte_eal_remote_launch(fn, &args[1], cores->c2); + fn(&args[0]); + rte_eal_wait_lcore(cores->c2); + } else { /* NOTE(review): only c1 is compared with the master lcore; presumably c2 is never the master here - confirm */ + rte_eal_remote_launch(fn, &args[0], cores->c1); + rte_eal_remote_launch(fn, &args[1], cores->c2); + rte_eal_wait_lcore(cores->c1); + rte_eal_wait_lcore(cores->c2); + } + + printf("Average cycles per object push/pop (bulk size: %u): %.2F\n", + bulk_sizes[i], (args[0].avg + args[1].avg) / 2); + } +} + +/* Run fn simultaneously on 1+ cores: up to n - 1 slave lcores plus the calling lcore, then print the mean of their averages. */ +static void +run_on_n_cores(struct rte_stack *s, lcore_function_t fn, int n) +{ + struct thread_args args[RTE_MAX_LCORE]; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(bulk_sizes); i++) { + unsigned int lcore_id; + int cnt = 0; + double avg; + + rte_atomic32_set(&lcore_barrier, n); + + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + if (++cnt >= n) + break; + + args[lcore_id].s = s; + args[lcore_id].sz = bulk_sizes[i]; + + if (rte_eal_remote_launch(fn, &args[lcore_id], + lcore_id)) + rte_panic("Failed to launch lcore %u\n", + lcore_id); + } + + lcore_id = rte_lcore_id(); + + args[lcore_id].s = s; + args[lcore_id].sz = bulk_sizes[i]; + + fn(&args[lcore_id]); + + rte_eal_mp_wait_lcore(); + + avg = args[rte_lcore_id()].avg; + + cnt = 0; + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + if (++cnt >= n) + break; + avg += args[lcore_id].avg; + } + + printf("Average cycles per object push/pop (bulk size: %u): %.2F\n", + bulk_sizes[i], avg / n); + } +} + +/* + * Measure the cycle cost of pushing and popping a single pointer on a single + * lcore. 
+ */ +static void +test_single_push_pop(struct rte_stack *s) +{ + unsigned int iterations = 16000000; + void *obj = NULL; + unsigned int i; + + uint64_t start = rte_rdtsc(); + + for (i = 0; i < iterations; i++) { + rte_stack_push(s, &obj, 1); + rte_stack_pop(s, &obj, 1); + } + + uint64_t end = rte_rdtsc(); + + printf("Average cycles per single object push/pop: %.2F\n", + ((double)(end - start)) / iterations); +} + +/* Measure the cycle cost of bulk pushing and popping on a single lcore. */ +static void +test_bulk_push_pop(struct rte_stack *s) +{ + unsigned int iterations = 8000000; + void *objs[MAX_BURST] = {0}; /* zero-filled, as in bulk_push_pop(), so no indeterminate pointers are pushed */ + unsigned int sz, i; + + for (sz = 0; sz < ARRAY_SIZE(bulk_sizes); sz++) { + uint64_t start = rte_rdtsc(); + + for (i = 0; i < iterations; i++) { + rte_stack_push(s, objs, bulk_sizes[sz]); + rte_stack_pop(s, objs, bulk_sizes[sz]); + } + + uint64_t end = rte_rdtsc(); + + double avg = ((double)(end - start) / + (iterations * bulk_sizes[sz])); + + printf("Average cycles per object push/pop (bulk size: %u): %.2F\n", + bulk_sizes[sz], avg); + } +} + +static int +test_stack_perf(void) /* returns 0 once all measurements have run, -1 if the stack cannot be created */ +{ + struct lcore_pair cores; + struct rte_stack *s; + + rte_atomic32_init(&lcore_barrier); + + s = rte_stack_create(STACK_NAME, STACK_SIZE, rte_socket_id(), 0); + if (s == NULL) { + printf("[%s():%u] failed to create a stack\n", + __func__, __LINE__); + return -1; + } + + printf("### Testing single element push/pop ###\n"); + test_single_push_pop(s); + + printf("\n### Testing empty pop ###\n"); + test_empty_pop(s); + + printf("\n### Testing using a single lcore ###\n"); + test_bulk_push_pop(s); + + if (get_two_hyperthreads(&cores) == 0) { + printf("\n### Testing using two hyperthreads ###\n"); + run_on_core_pair(&cores, s, bulk_push_pop); + } + if (get_two_cores(&cores) == 0) { + printf("\n### Testing using two physical cores ###\n"); + run_on_core_pair(&cores, s, bulk_push_pop); + } + if (get_two_sockets(&cores) == 0) { + printf("\n### Testing using two NUMA nodes ###\n"); + 
run_on_core_pair(&cores, s, bulk_push_pop); + } + + printf("\n### Testing on all %u lcores ###\n", rte_lcore_count()); + run_on_n_cores(s, bulk_push_pop, rte_lcore_count()); + + rte_stack_free(s); + return 0; +} + +REGISTER_TEST_COMMAND(stack_perf_autotest, test_stack_perf);