> -----Original Message-----
> From: Tomasz Duszynski <tduszynski@marvell.com>
> Sent: Monday, February 13, 2023 7:32 PM
> To: dev@dpdk.org; thomas@monjalon.net; Tomasz Duszynski <tduszynski@marvell.com>
> Cc: roretzla@linux.microsoft.com; Ruifeng Wang <Ruifeng.Wang@arm.com>;
> bruce.richardson@intel.com; jerinj@marvell.com; mattias.ronnblom@ericsson.com;
> mb@smartsharesystems.com; zhoumin@loongson.cn; david.marchand@redhat.com
> Subject: [PATCH v10 1/4] lib: add generic support for reading PMU events
>
> Add support for programming PMU counters and reading their values at runtime, bypassing
> the kernel completely.
>
> This is especially useful in cases where CPU cores are isolated, i.e. run dedicated tasks.
> In such cases one cannot use the standard perf utility without sacrificing latency and
> performance.
>
> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> ---
> MAINTAINERS | 5 +
> app/test/meson.build | 2 +
> app/test/test_pmu.c | 62 ++++
> doc/api/doxy-api-index.md | 3 +-
> doc/api/doxy-api.conf.in | 1 +
> doc/guides/prog_guide/profile_app.rst | 12 +
> doc/guides/rel_notes/release_23_03.rst | 7 +
> lib/meson.build | 1 +
> lib/pmu/meson.build | 13 +
> lib/pmu/pmu_private.h | 32 ++
> lib/pmu/rte_pmu.c | 460 +++++++++++++++++++++++++
> lib/pmu/rte_pmu.h | 212 ++++++++++++
> lib/pmu/version.map | 15 +
> 13 files changed, 824 insertions(+), 1 deletion(-)
> create mode 100644 app/test/test_pmu.c
> create mode 100644 lib/pmu/meson.build
> create mode 100644 lib/pmu/pmu_private.h
> create mode 100644 lib/pmu/rte_pmu.c
> create mode 100644 lib/pmu/rte_pmu.h
> create mode 100644 lib/pmu/version.map
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 3495946d0f..d37f242120 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1697,6 +1697,11 @@ M: Nithin Dabilpuram <ndabilpuram@marvell.com>
> M: Pavan Nikhilesh <pbhagavatula@marvell.com>
> F: lib/node/
>
> +PMU - EXPERIMENTAL
> +M: Tomasz Duszynski <tduszynski@marvell.com>
> +F: lib/pmu/
> +F: app/test/test_pmu*
> +
>
> Test Applications
> -----------------
> diff --git a/app/test/meson.build b/app/test/meson.build
> index f34d19e3c3..6b61b7fc32 100644
> --- a/app/test/meson.build
> +++ b/app/test/meson.build
> @@ -111,6 +111,7 @@ test_sources = files(
> 'test_reciprocal_division_perf.c',
> 'test_red.c',
> 'test_pie.c',
> + 'test_pmu.c',
> 'test_reorder.c',
> 'test_rib.c',
> 'test_rib6.c',
> @@ -239,6 +240,7 @@ fast_tests = [
> ['kni_autotest', false, true],
> ['kvargs_autotest', true, true],
> ['member_autotest', true, true],
> + ['pmu_autotest', true, true],
> ['power_cpufreq_autotest', false, true],
> ['power_autotest', true, true],
> ['power_kvm_vm_autotest', false, true],
> diff --git a/app/test/test_pmu.c b/app/test/test_pmu.c
> new file mode 100644
> index 0000000000..a64564b5f5
> --- /dev/null
> +++ b/app/test/test_pmu.c
> @@ -0,0 +1,62 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2023 Marvell International Ltd.
> + */
> +
> +#include "test.h"
> +
> +#ifndef RTE_EXEC_ENV_LINUX
> +
> +static int
> +test_pmu(void)
> +{
> + printf("pmu_autotest only supported on Linux, skipping test\n");
> + return TEST_SKIPPED;
> +}
> +
> +#else
> +
> +#include <rte_pmu.h>
> +
> +static int
> +test_pmu_read(void)
> +{
> + const char *name = NULL;
> + int tries = 10, event;
> + uint64_t val = 0;
> +
> + if (name == NULL) {
> + printf("PMU not supported on this arch\n");
> + return TEST_SKIPPED;
> + }
> +
> + if (rte_pmu_init() < 0)
> + return TEST_FAILED;
Can we return TEST_SKIPPED here?
On aarch64, this feature requires kernel version >= 5.17. CI setups that don't meet this
requirement will start to report failures when running fast_tests.
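
A sketch of what I have in mind (just a sketch; until the library can distinguish
lack of support from real failures, the conservative option is to skip on any
init error):

	if (rte_pmu_init() < 0) {
		printf("pmu_autotest: PMU not available, skipping test\n");
		return TEST_SKIPPED;
	}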
> +
> + event = rte_pmu_add_event(name);
> + while (tries--)
> + val += rte_pmu_read(event);
> +
> + rte_pmu_fini();
> +
> + return val ? TEST_SUCCESS : TEST_FAILED;
> +}
> +
> +static struct unit_test_suite pmu_tests = {
> + .suite_name = "pmu autotest",
> + .setup = NULL,
> + .teardown = NULL,
> + .unit_test_cases = {
> + TEST_CASE(test_pmu_read),
> + TEST_CASES_END()
> + }
> +};
> +
<snip>
+static int
+test_pmu(void)
+{
+ return unit_test_suite_runner(&pmu_tests);
+}
+
+#endif /* RTE_EXEC_ENV_LINUX */
+
+REGISTER_TEST_COMMAND(pmu_autotest, test_pmu);
diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
@@ -223,7 +223,8 @@ The public API headers are grouped by topics:
[log](@ref rte_log.h),
[errno](@ref rte_errno.h),
[trace](@ref rte_trace.h),
- [trace_point](@ref rte_trace_point.h)
+ [trace_point](@ref rte_trace_point.h),
+ [pmu](@ref rte_pmu.h)
- **misc**:
[EAL config](@ref rte_eal.h),
diff --git a/doc/api/doxy-api.conf.in b/doc/api/doxy-api.conf.in
@@ -63,6 +63,7 @@ INPUT = @TOPDIR@/doc/api/doxy-api-index.md \
@TOPDIR@/lib/pci \
@TOPDIR@/lib/pdump \
@TOPDIR@/lib/pipeline \
+ @TOPDIR@/lib/pmu \
@TOPDIR@/lib/port \
@TOPDIR@/lib/power \
@TOPDIR@/lib/rawdev \
diff --git a/doc/guides/prog_guide/profile_app.rst b/doc/guides/prog_guide/profile_app.rst
@@ -7,6 +7,18 @@ Profile Your Application
The following sections describe methods of profiling DPDK applications on
different architectures.
+Performance counter based profiling
+-----------------------------------
+
+The majority of architectures provide some form of performance monitoring
+unit (PMU). Such a unit offers programmable counters that monitor specific
+events.
+
+Different tools gather that information, for example perf. However, in some
+scenarios, when CPU cores are isolated and run dedicated tasks, interrupting
+those tasks with perf may be undesirable.
+
+In such cases, an application can use the PMU library to read such events via ``rte_pmu_read()``.
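+
+A minimal usage sketch follows. The event name is PMU specific, e.g.
+``cpu_cycles`` on an ARM core PMU; supported names are listed under
+``/sys/bus/event_source/devices/<pmu>/events``. Error handling is omitted
+for brevity:
+
+.. code-block:: c
+
+   #include <rte_pmu.h>
+
+   int event;
+
+   rte_pmu_init();
+   event = rte_pmu_add_event("cpu_cycles");
+
+   /* in the fast path, on the monitored lcore */
+   uint64_t val = rte_pmu_read(event);
+
+   rte_pmu_fini();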
+
Profiling on x86
----------------
diff --git a/doc/guides/rel_notes/release_23_03.rst b/doc/guides/rel_notes/release_23_03.rst
@@ -147,6 +147,13 @@ New Features
* Added support to capture packets at each graph node with packet metadata and
node name.
+* **Added PMU library.**
+
+ Added a new performance monitoring unit (PMU) library which allows applications
+ to perform self-monitoring activities without depending on external utilities like perf.
+ After integration with :doc:`../prog_guide/trace_lib`, data gathered from hardware counters
+ can be stored in CTF format for further analysis.
+
Removed Items
-------------
diff --git a/lib/meson.build b/lib/meson.build
@@ -11,6 +11,7 @@
libraries = [
'kvargs', # eal depends on kvargs
'telemetry', # basic info querying
+ 'pmu',
'eal', # everything depends on eal
'ring',
'rcu', # rcu depends on ring
diff --git a/lib/pmu/meson.build b/lib/pmu/meson.build
new file mode 100644
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(C) 2023 Marvell International Ltd.
+
+if not is_linux
+ build = false
+ reason = 'only supported on Linux'
+ subdir_done()
+endif
+
+includes = [global_inc]
+
+sources = files('rte_pmu.c')
+headers = files('rte_pmu.h')
diff --git a/lib/pmu/pmu_private.h b/lib/pmu/pmu_private.h
new file mode 100644
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Marvell
+ */
+
+#ifndef _PMU_PRIVATE_H_
+#define _PMU_PRIVATE_H_
+
+/**
+ * Architecture specific PMU init callback.
+ *
+ * @return
+ * 0 in case of success, negative value otherwise.
+ */
+int
+pmu_arch_init(void);
+
+/**
+ * Architecture specific PMU cleanup callback.
+ */
+void
+pmu_arch_fini(void);
+
+/**
+ * Apply architecture specific settings to config before passing it to syscall.
+ *
+ * @param config
+ * Architecture specific event configuration. Consult kernel sources for available options.
+ */
+void
+pmu_arch_fixup_config(uint64_t config[3]);
+
+#endif /* _PMU_PRIVATE_H_ */
diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c
new file mode 100644
@@ -0,0 +1,460 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell International Ltd.
+ */
+
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <regex.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <rte_atomic.h>
+#include <rte_per_lcore.h>
+#include <rte_pmu.h>
+#include <rte_spinlock.h>
+#include <rte_tailq.h>
+
+#include "pmu_private.h"
+
+#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
+
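+/*
+ * GENMASK_ULL(h, l) yields a contiguous bit mask covering bits l..h inclusive,
+ * e.g. GENMASK_ULL(7, 0) == 0xff. FIELD_PREP(m, v) shifts value v onto the
+ * field described by mask m.
+ */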
+#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> ((64 - 1 - (h)))))
+#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) & (m))
+
+RTE_DEFINE_PER_LCORE(struct rte_pmu_event_group, _event_group);
+struct rte_pmu rte_pmu;
+
+/*
+ * Following __rte_weak functions provide default no-op. Architectures should override them if
+ * necessary.
+ */
+
+int
+__rte_weak pmu_arch_init(void)
+{
+ return 0;
+}
+
+void
+__rte_weak pmu_arch_fini(void)
+{
+}
+
+void
+__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3])
+{
+}
+
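+/*
+ * Parse a format descriptor such as "config:0-7" or "config1:1", read from
+ * /sys/bus/event_source/devices/<pmu>/format/<name>, into a config word
+ * number (0-2) and the bit mask the term occupies within that word.
+ */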
+static int
+get_term_format(const char *name, int *num, uint64_t *mask)
+{
+ char path[PATH_MAX];
+ char *config = NULL;
+ int high, low, ret;
+ FILE *fp;
+
+ *num = *mask = 0;
+ snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name, name);
+ fp = fopen(path, "r");
+ if (fp == NULL)
+ return -errno;
+
+ errno = 0;
+ ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
+ if (ret < 2) {
+ ret = -ENODATA;
+ goto out;
+ }
+ if (errno) {
+ ret = -errno;
+ goto out;
+ }
+
+ if (ret == 2)
+ high = low;
+
+ *mask = GENMASK_ULL(high, low);
+ /* Last digit should be [012]. If the last digit is missing, 0 is implied. */
+ *num = config[strlen(config) - 1];
+ *num = isdigit(*num) ? *num - '0' : 0;
+
+ ret = 0;
+out:
+ free(config);
+ fclose(fp);
+
+ return ret;
+}
+
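+/*
+ * Parse an event description read from sysfs, e.g. "event=0x3,umask=0x1".
+ * Each term is looked up via get_term_format() to find out which config word
+ * and bit field it occupies; a term given without a value implies 1.
+ */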
+static int
+parse_event(char *buf, uint64_t config[3])
+{
+ char *token, *term;
+ int num, ret, val;
+ uint64_t mask;
+
+ config[0] = config[1] = config[2] = 0;
+
+ token = strtok(buf, ",");
+ while (token) {
+ errno = 0;
+ /* <term>=<value> */
+ ret = sscanf(token, "%m[^=]=%i", &term, &val);
+ if (ret < 1)
+ return -ENODATA;
+ if (errno)
+ return -errno;
+ if (ret == 1)
+ val = 1;
+
+ ret = get_term_format(term, &num, &mask);
+ free(term);
+ if (ret)
+ return ret;
+
+ config[num] |= FIELD_PREP(mask, val);
+ token = strtok(NULL, ",");
+ }
+
+ return 0;
+}
+
+static int
+get_event_config(const char *name, uint64_t config[3])
+{
+ char path[PATH_MAX], buf[BUFSIZ];
+ FILE *fp;
+ int ret;
+
+ snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
+ fp = fopen(path, "r");
+ if (fp == NULL)
+ return -errno;
+
+ ret = fread(buf, 1, sizeof(buf), fp);
+ if (ret == 0) {
+ fclose(fp);
+
+ return -EINVAL;
+ }
+ fclose(fp);
+ buf[ret] = '\0';
+
+ return parse_event(buf, config);
+}
+
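+/*
+ * Wrapper around perf_event_open(2). pid = 0 and cpu = -1 restrict counting
+ * to the calling thread regardless of the CPU it runs on. The first event of
+ * a group is opened with group_fd = -1 and becomes the group leader.
+ */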
+static int
+do_perf_event_open(uint64_t config[3], int group_fd)
+{
+ struct perf_event_attr attr = {
+ .size = sizeof(struct perf_event_attr),
+ .type = PERF_TYPE_RAW,
+ .exclude_kernel = 1,
+ .exclude_hv = 1,
+ .disabled = 1,
+ };
+
+ pmu_arch_fixup_config(config);
+
+ attr.config = config[0];
+ attr.config1 = config[1];
+ attr.config2 = config[2];
+
+ return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0);
+}
+
+static int
+open_events(struct rte_pmu_event_group *group)
+{
+ struct rte_pmu_event *event;
+ uint64_t config[3];
+ int num = 0, ret;
+
+ /* group leader gets created first, with fd = -1 */
+ group->fds[0] = -1;
+
+ TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
+ ret = get_event_config(event->name, config);
+ if (ret)
+ continue;
+
+ ret = do_perf_event_open(config, group->fds[0]);
+ if (ret == -1) {
+ ret = -errno;
+ goto out;
+ }
+
+ group->fds[event->index] = ret;
+ num++;
+ }
+
+ return 0;
+out:
+ for (--num; num >= 0; num--) {
+ close(group->fds[num]);
+ group->fds[num] = -1;
+ }
+
+ return ret;
+}
+
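+/*
+ * Map one page per event. The kernel exposes counter state, including the
+ * hardware counter index, offset and the rdpmc (userspace read) capability,
+ * via struct perf_event_mmap_page at the start of that page.
+ */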
+static int
+mmap_events(struct rte_pmu_event_group *group)
+{
+ long page_size = sysconf(_SC_PAGE_SIZE);
+ unsigned int i;
+ void *addr;
+ int ret;
+
+ for (i = 0; i < rte_pmu.num_group_events; i++) {
+ addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 0);
+ if (addr == MAP_FAILED) {
+ ret = -errno;
+ goto out;
+ }
+
+ group->mmap_pages[i] = addr;
+ if (!group->mmap_pages[i]->cap_user_rdpmc) {
+ ret = -EPERM;
+ goto out;
+ }
+ }
+
+ return 0;
+out:
+ for (; i; i--) {
+ munmap(group->mmap_pages[i - 1], page_size);
+ group->mmap_pages[i - 1] = NULL;
+ }
+
+ return ret;
+}
+
+static void
+cleanup_events(struct rte_pmu_event_group *group)
+{
+ unsigned int i;
+
+ if (group->fds[0] != -1)
+ ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
+
+ for (i = 0; i < rte_pmu.num_group_events; i++) {
+ if (group->mmap_pages[i]) {
+ munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE));
+ group->mmap_pages[i] = NULL;
+ }
+
+ if (group->fds[i] != -1) {
+ close(group->fds[i]);
+ group->fds[i] = -1;
+ }
+ }
+
+ group->enabled = false;
+}
+
+int
+__rte_pmu_enable_group(void)
+{
+ struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
+ int ret;
+
+ if (rte_pmu.num_group_events == 0)
+ return -ENODEV;
+
+ ret = open_events(group);
+ if (ret)
+ goto out;
+
+ ret = mmap_events(group);
+ if (ret)
+ goto out;
+
+ if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
+ ret = -errno;
+ goto out;
+ }
+
+ if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
+ ret = -errno;
+ goto out;
+ }
+
+ rte_spinlock_lock(&rte_pmu.lock);
+ TAILQ_INSERT_TAIL(&rte_pmu.event_group_list, group, next);
+ rte_spinlock_unlock(&rte_pmu.lock);
+ group->enabled = true;
+
+ return 0;
+
+out:
+ cleanup_events(group);
+
+ return ret;
+}
+
+static int
+scan_pmus(void)
+{
+ char path[PATH_MAX];
+ struct dirent *dent;
+ const char *name;
+ DIR *dirp;
+
+ dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
+ if (dirp == NULL)
+ return -errno;
+
+ while ((dent = readdir(dirp))) {
+ name = dent->d_name;
+ if (name[0] == '.')
+ continue;
+
+ /* the core PMU entry is either named "cpu" or contains a "cpus" file */
+ if (!strcmp(name, "cpu"))
+ break;
+
+ snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
+ if (access(path, F_OK) == 0)
+ break;
+ }
+
+ if (dent) {
+ rte_pmu.name = strdup(name);
+ if (rte_pmu.name == NULL) {
+ closedir(dirp);
+
+ return -ENOMEM;
+ }
+ }
+
+ closedir(dirp);
+
+ return rte_pmu.name ? 0 : -ENODEV;
+}
+
+static struct rte_pmu_event *
+new_event(const char *name)
+{
+ struct rte_pmu_event *event;
+
+ event = calloc(1, sizeof(*event));
+ if (event == NULL)
+ goto out;
+
+ event->name = strdup(name);
+ if (event->name == NULL) {
+ free(event);
+ event = NULL;
+ }
+
+out:
+ return event;
+}
+
+static void
+free_event(struct rte_pmu_event *event)
+{
+ free(event->name);
+ free(event);
+}
+
+int
+rte_pmu_add_event(const char *name)
+{
+ struct rte_pmu_event *event;
+ char path[PATH_MAX];
+
+ if (rte_pmu.name == NULL)
+ return -ENODEV;
+
+ if (rte_pmu.num_group_events + 1 >= MAX_NUM_GROUP_EVENTS)
+ return -ENOSPC;
+
+ snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
+ if (access(path, R_OK))
+ return -ENODEV;
+
+ TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
+ if (!strcmp(event->name, name))
+ return event->index;
+ }
+
+ event = new_event(name);
+ if (event == NULL)
+ return -ENOMEM;
+
+ event->index = rte_pmu.num_group_events++;
+ TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
+
+ return event->index;
+}
+
+int
+rte_pmu_init(void)
+{
+ int ret;
+
+ /* Allow calling init from multiple contexts within a single thread. This simplifies
+  * resource management a bit, e.g. in case a fast-path tracepoint has already been
+  * enabled via the command line but the application performs init/fini again anyway.
+  */
+ if (rte_pmu.initialized != 0) {
+ rte_pmu.initialized++;
+ return 0;
+ }
+
+ ret = scan_pmus();
+ if (ret)
+ goto out;
+
+ ret = pmu_arch_init();
+ if (ret)
+ goto out;
+
+ TAILQ_INIT(&rte_pmu.event_list);
+ TAILQ_INIT(&rte_pmu.event_group_list);
+ rte_spinlock_init(&rte_pmu.lock);
+ rte_pmu.initialized = 1;
+
+ return 0;
+out:
+ free(rte_pmu.name);
+ rte_pmu.name = NULL;
+
+ return ret;
+}
+
+void
+rte_pmu_fini(void)
+{
+ struct rte_pmu_event_group *group, *tmp_group;
+ struct rte_pmu_event *event, *tmp_event;
+
+ /* cleanup once init count drops to zero */
+ if (rte_pmu.initialized == 0 || --rte_pmu.initialized != 0)
+ return;
+
+ RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
+ TAILQ_REMOVE(&rte_pmu.event_list, event, next);
+ free_event(event);
+ }
+
+ RTE_TAILQ_FOREACH_SAFE(group, &rte_pmu.event_group_list, next, tmp_group) {
+ TAILQ_REMOVE(&rte_pmu.event_group_list, group, next);
+ cleanup_events(group);
+ }
+
+ pmu_arch_fini();
+ free(rte_pmu.name);
+ rte_pmu.name = NULL;
+ rte_pmu.num_group_events = 0;
+}
diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h
new file mode 100644
@@ -0,0 +1,212 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Marvell
+ */
+
+#ifndef _RTE_PMU_H_
+#define _RTE_PMU_H_
+
+/**
+ * @file
+ *
+ * PMU event tracing operations
+ *
+ * This file defines the generic API and types necessary to set up the PMU and
+ * read selected counters at runtime.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <linux/perf_event.h>
+
+#include <rte_atomic.h>
+#include <rte_branch_prediction.h>
+#include <rte_common.h>
+#include <rte_compat.h>
+#include <rte_spinlock.h>
+
+/** Maximum number of events in a group */
+#define MAX_NUM_GROUP_EVENTS 8
+
+/**
+ * A structure describing a group of events.
+ */
+struct rte_pmu_event_group {
+ struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
+ int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
+ bool enabled; /**< true if group was enabled on particular lcore */
+ TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */
+} __rte_cache_aligned;
+
+/**
+ * A structure describing an event.
+ */
+struct rte_pmu_event {
+ char *name; /**< name of an event */
+ unsigned int index; /**< event index into fds/mmap_pages */
+ TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */
+};
+
+/**
+ * A PMU state container.
+ */
+struct rte_pmu {
+ char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
+ rte_spinlock_t lock; /**< serialize access to event group list */
+ TAILQ_HEAD(, rte_pmu_event_group) event_group_list; /**< list of event groups */
+ unsigned int num_group_events; /**< number of events in a group */
+ TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
+ unsigned int initialized; /**< initialization counter */
+};
+
+/** lcore event group */
+RTE_DECLARE_PER_LCORE(struct rte_pmu_event_group, _event_group);
+
+/** PMU state container */
+extern struct rte_pmu rte_pmu;
+
+/** Each architecture supporting PMU needs to provide its own version */
+#ifndef rte_pmu_pmc_read
+#define rte_pmu_pmc_read(index) ({ 0; })
+#endif
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Read PMU counter.
+ *
+ * @warning This should not be called directly.
+ *
+ * @param pc
+ * Pointer to the mmapped user page.
+ * @return
+ * Counter value read from hardware.
+ */
+static __rte_always_inline uint64_t
+__rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
+{
+ uint64_t width, offset;
+ uint32_t seq, index;
+ int64_t pmc;
+
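+ /* Seqlock-style read: the kernel bumps pc->lock whenever it updates the
+  * page, so re-read until a consistent index/offset pair is observed. The
+  * shift pair sign-extends the raw counter from pmc_width bits to 64 bits.
+  */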
+ for (;;) {
+ seq = pc->lock;
+ rte_compiler_barrier();
+ index = pc->index;
+ offset = pc->offset;
+ width = pc->pmc_width;
+
+ /* index set to 0 means that particular counter cannot be used */
+ if (likely(pc->cap_user_rdpmc && index)) {
+ pmc = rte_pmu_pmc_read(index - 1);
+ pmc <<= 64 - width;
+ pmc >>= 64 - width;
+ offset += pmc;
+ }
+
+ rte_compiler_barrier();
+
+ if (likely(pc->lock == seq))
+ return offset;
+ }
+
+ return 0;
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Enable group of events on the calling lcore.
+ *
+ * @warning This should not be called directly.
+ *
+ * @return
+ * 0 in case of success, negative value otherwise.
+ */
+__rte_experimental
+int
+__rte_pmu_enable_group(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Initialize PMU library.
+ *
+ * @warning This should not be called directly.
+ *
+ * @return
+ * 0 in case of success, negative value otherwise.
+ */
+__rte_experimental
+int
+rte_pmu_init(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Finalize PMU library. This should be called after PMU counters are no longer being read.
+ */
+__rte_experimental
+void
+rte_pmu_fini(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Add event to the group of enabled events.
+ *
+ * @param name
+ * Name of an event listed under /sys/bus/event_source/devices/pmu/events.
+ * @return
+ * Event index in case of success, negative value otherwise.
+ */
+__rte_experimental
+int
+rte_pmu_add_event(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Read hardware counter configured to count occurrences of an event.
+ *
+ * @param index
+ * Index of an event to be read.
+ * @return
+ * Event value read from a register. In case of errors or lack of support,
+ * 0 is returned. In other words, a stream of zeros in a trace file
+ * indicates a problem with reading a particular PMU event register.
+ */
+__rte_experimental
+static __rte_always_inline uint64_t
+rte_pmu_read(unsigned int index)
+{
+ struct rte_pmu_event_group *group = &RTE_PER_LCORE(_event_group);
+ int ret;
+
+ if (unlikely(!rte_pmu.initialized))
+ return 0;
+
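+ /* lazily open, mmap and enable the event group on first read on this lcore */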
+ if (unlikely(!group->enabled)) {
+ ret = __rte_pmu_enable_group();
+ if (ret)
+ return 0;
+ }
+
+ if (unlikely(index >= rte_pmu.num_group_events))
+ return 0;
+
+ return __rte_pmu_read_userpage(group->mmap_pages[index]);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PMU_H_ */
diff --git a/lib/pmu/version.map b/lib/pmu/version.map
new file mode 100644
@@ -0,0 +1,15 @@
+DPDK_23 {
+ local: *;
+};
+
+EXPERIMENTAL {
+ global:
+
+ __rte_pmu_enable_group;
+ per_lcore__event_group;
+ rte_pmu;
+ rte_pmu_add_event;
+ rte_pmu_fini;
+ rte_pmu_init;
+ rte_pmu_read;
+};