[v15,1/4] lib: add generic support for reading PMU events
Commit Message
Add support for programming PMU counters and reading their values
at runtime, bypassing the kernel completely.

This is especially useful in cases where CPU cores are isolated,
i.e. run dedicated tasks. In such cases one cannot use the standard
perf utility without sacrificing latency and performance.
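
A minimal usage flow looks roughly as follows (a sketch only; the event
name is architecture specific and used here merely as an example):

  rte_pmu_init();
  int index = rte_pmu_add_event("cpu_cycles"); /* error handling omitted */
  uint64_t value = rte_pmu_read(index);
  rte_pmu_fini();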
Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
---
MAINTAINERS | 5 +
app/test/meson.build | 1 +
app/test/test_pmu.c | 49 +++
doc/api/doxy-api-index.md | 3 +-
doc/api/doxy-api.conf.in | 1 +
doc/guides/prog_guide/profile_app.rst | 29 ++
doc/guides/rel_notes/release_24_11.rst | 7 +
lib/eal/meson.build | 3 +
lib/meson.build | 1 +
lib/pmu/meson.build | 13 +
lib/pmu/pmu_private.h | 32 ++
lib/pmu/rte_pmu.c | 473 +++++++++++++++++++++++++
lib/pmu/rte_pmu.h | 205 +++++++++++
lib/pmu/version.map | 13 +
14 files changed, 834 insertions(+), 1 deletion(-)
create mode 100644 app/test/test_pmu.c
create mode 100644 lib/pmu/meson.build
create mode 100644 lib/pmu/pmu_private.h
create mode 100644 lib/pmu/rte_pmu.c
create mode 100644 lib/pmu/rte_pmu.h
create mode 100644 lib/pmu/version.map
Comments
> Add support for programming PMU counters and reading their values
> at runtime, bypassing the kernel completely.
>
> This is especially useful in cases where CPU cores are isolated,
> i.e. run dedicated tasks. In such cases one cannot use the standard
> perf utility without sacrificing latency and performance.
LGTM in general, just a few questions and nits - the majority about docs/comments.
> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
> ---
> MAINTAINERS | 5 +
> app/test/meson.build | 1 +
> app/test/test_pmu.c | 49 +++
> doc/api/doxy-api-index.md | 3 +-
> doc/api/doxy-api.conf.in | 1 +
> doc/guides/prog_guide/profile_app.rst | 29 ++
> doc/guides/rel_notes/release_24_11.rst | 7 +
> lib/eal/meson.build | 3 +
> lib/meson.build | 1 +
> lib/pmu/meson.build | 13 +
> lib/pmu/pmu_private.h | 32 ++
> lib/pmu/rte_pmu.c | 473 +++++++++++++++++++++++++
> lib/pmu/rte_pmu.h | 205 +++++++++++
> lib/pmu/version.map | 13 +
> 14 files changed, 834 insertions(+), 1 deletion(-)
> create mode 100644 app/test/test_pmu.c
> create mode 100644 lib/pmu/meson.build
> create mode 100644 lib/pmu/pmu_private.h
> create mode 100644 lib/pmu/rte_pmu.c
> create mode 100644 lib/pmu/rte_pmu.h
> create mode 100644 lib/pmu/version.map
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index cd78bc7db1..077efe41cf 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1816,6 +1816,11 @@ M: Nithin Dabilpuram <ndabilpuram@marvell.com>
> M: Pavan Nikhilesh <pbhagavatula@marvell.com>
> F: lib/node/
>
> +PMU - EXPERIMENTAL
> +M: Tomasz Duszynski <tduszynski@marvell.com>
> +F: lib/pmu/
> +F: app/test/test_pmu*
> +
>
> Test Applications
> -----------------
> diff --git a/app/test/meson.build b/app/test/meson.build
> index 0f7e11969a..5f1622ecab 100644
> --- a/app/test/meson.build
> +++ b/app/test/meson.build
> @@ -141,6 +141,7 @@ source_file_deps = {
> 'test_pmd_perf.c': ['ethdev', 'net'] + packet_burst_generator_deps,
> 'test_pmd_ring.c': ['net_ring', 'ethdev', 'bus_vdev'],
> 'test_pmd_ring_perf.c': ['ethdev', 'net_ring', 'bus_vdev'],
> + 'test_pmu.c': ['pmu'],
> 'test_power.c': ['power'],
> 'test_power_cpufreq.c': ['power'],
> 'test_power_intel_uncore.c': ['power'],
> diff --git a/app/test/test_pmu.c b/app/test/test_pmu.c
> new file mode 100644
> index 0000000000..464e5068ec
> --- /dev/null
> +++ b/app/test/test_pmu.c
> @@ -0,0 +1,49 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2024 Marvell International Ltd.
> + */
> +
> +#include <rte_pmu.h>
> +
> +#include "test.h"
> +
> +static int
> +test_pmu_read(void)
> +{
> + const char *name = NULL;
> + int tries = 10, event;
> + uint64_t val = 0;
> +
> + if (name == NULL) {
> + printf("PMU not supported on this arch\n");
> + return TEST_SKIPPED;
> + }
> +
> + if (rte_pmu_init() < 0)
> + return TEST_FAILED;
> +
> + event = rte_pmu_add_event(name);
> + while (tries--)
> + val += rte_pmu_read(event);
> +
> + rte_pmu_fini();
> +
> + return val ? TEST_SUCCESS : TEST_FAILED;
> +}
> +
> +static struct unit_test_suite pmu_tests = {
> + .suite_name = "pmu autotest",
> + .setup = NULL,
> + .teardown = NULL,
> + .unit_test_cases = {
> + TEST_CASE(test_pmu_read),
> + TEST_CASES_END()
> + }
> +};
> +
> +static int
> +test_pmu(void)
> +{
> + return unit_test_suite_runner(&pmu_tests);
> +}
> +
> +REGISTER_FAST_TEST(pmu_autotest, true, true, test_pmu);
> diff --git a/doc/api/doxy-api-index.md b/doc/api/doxy-api-index.md
> index 266c8b90dc..3e6020f367 100644
> --- a/doc/api/doxy-api-index.md
> +++ b/doc/api/doxy-api-index.md
> @@ -240,7 +240,8 @@ The public API headers are grouped by topics:
> [log](@ref rte_log.h),
> [errno](@ref rte_errno.h),
> [trace](@ref rte_trace.h),
> - [trace_point](@ref rte_trace_point.h)
> + [trace_point](@ref rte_trace_point.h),
> + [pmu](@ref rte_pmu.h)
>
> - **misc**:
> [EAL config](@ref rte_eal.h),
> diff --git a/doc/api/doxy-api.conf.in b/doc/api/doxy-api.conf.in
> index c94f02d411..f4b382e073 100644
> --- a/doc/api/doxy-api.conf.in
> +++ b/doc/api/doxy-api.conf.in
> @@ -69,6 +69,7 @@ INPUT = @TOPDIR@/doc/api/doxy-api-index.md \
> @TOPDIR@/lib/pdcp \
> @TOPDIR@/lib/pdump \
> @TOPDIR@/lib/pipeline \
> + @TOPDIR@/lib/pmu \
> @TOPDIR@/lib/port \
> @TOPDIR@/lib/power \
> @TOPDIR@/lib/ptr_compress \
> diff --git a/doc/guides/prog_guide/profile_app.rst b/doc/guides/prog_guide/profile_app.rst
> index a6b5fb4d5e..854c515a61 100644
> --- a/doc/guides/prog_guide/profile_app.rst
> +++ b/doc/guides/prog_guide/profile_app.rst
> @@ -7,6 +7,35 @@ Profile Your Application
> The following sections describe methods of profiling DPDK applications on
> different architectures.
>
> +Performance counter based profiling
> +-----------------------------------
> +
> +The majority of architectures support some form of performance monitoring unit (PMU).
> +Such a unit provides programmable counters that monitor specific events.
> +
> +Different tools gather that information, for example perf.
> +However, in some scenarios, when CPU cores are isolated and run
> +dedicated tasks, interrupting those tasks with perf may be undesirable.
> +
> +In such cases, an application can use the PMU library to read such events via ``rte_pmu_read()``.
> +
> +By default, userspace applications are not allowed to access PMU internals. That can be changed
> +by setting ``/proc/sys/kernel/perf_event_paranoid`` to 2 (that should be the default value anyway)
> +and adding the ``CAP_PERFMON`` capability to the DPDK application. Please refer to
> +``Documentation/admin-guide/perf-security.rst`` under the Linux sources for more information.
> +A fairly recent kernel, i.e. >= 5.9, is advised too.
> +
> +As of now the implementation imposes certain limitations:
> +
> +* Management APIs that normally return a non-negative value will return an error (``-ENOTSUP``)
> + while ``rte_pmu_read()`` will return ``UINT64_MAX`` if running under an unsupported operating system.
> +
> +* Only EAL lcores are supported.
> +
> +* EAL lcores must not share a CPU.
> +
> +* Each EAL lcore measures the same group of events.
> +
>
> Profiling on x86
> ----------------
> diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
> index fa4822d928..d34ecb55e0 100644
> --- a/doc/guides/rel_notes/release_24_11.rst
> +++ b/doc/guides/rel_notes/release_24_11.rst
> @@ -247,6 +247,13 @@ New Features
> Added ability for node to advertise and update multiple xstat counters,
> that can be retrieved using ``rte_graph_cluster_stats_get``.
>
> +* **Added PMU library.**
> +
> + Added a new performance monitoring unit (PMU) library which allows applications
> + to perform self-monitoring activities without depending on external utilities like perf.
> + After integration with :doc:`../prog_guide/trace_lib`, data gathered from hardware counters
> + can be stored in CTF format for further analysis.
> +
>
> Removed Items
> -------------
> diff --git a/lib/eal/meson.build b/lib/eal/meson.build
> index e1d6c4cf17..1349624653 100644
> --- a/lib/eal/meson.build
> +++ b/lib/eal/meson.build
> @@ -18,6 +18,9 @@ deps += ['log', 'kvargs']
> if not is_windows
> deps += ['telemetry']
> endif
> +if dpdk_conf.has('RTE_LIB_PMU')
> + deps += ['pmu']
> +endif
> if dpdk_conf.has('RTE_USE_LIBBSD')
> ext_deps += libbsd
> endif
> diff --git a/lib/meson.build b/lib/meson.build
> index ce92cb5537..968ad29e8d 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -13,6 +13,7 @@ libraries = [
> 'kvargs', # eal depends on kvargs
> 'argparse',
> 'telemetry', # basic info querying
> + 'pmu',
> 'eal', # everything depends on eal
> 'ptr_compress',
> 'ring',
> diff --git a/lib/pmu/meson.build b/lib/pmu/meson.build
> new file mode 100644
> index 0000000000..f173b6f55c
> --- /dev/null
> +++ b/lib/pmu/meson.build
> @@ -0,0 +1,13 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(C) 2024 Marvell International Ltd.
> +
> +if not is_linux
> + build = false
> + reason = 'only supported on Linux'
> + subdir_done()
> +endif
> +
> +headers = files('rte_pmu.h')
> +sources = files('rte_pmu.c')
> +
> +deps += ['log']
> diff --git a/lib/pmu/pmu_private.h b/lib/pmu/pmu_private.h
> new file mode 100644
> index 0000000000..d2b15615bf
> --- /dev/null
> +++ b/lib/pmu/pmu_private.h
> @@ -0,0 +1,32 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2024 Marvell
> + */
> +
> +#ifndef _PMU_PRIVATE_H_
> +#define _PMU_PRIVATE_H_
> +
> +/**
> + * Architecture specific PMU init callback.
> + *
> + * @return
> + * 0 in case of success, negative value otherwise.
> + */
> +int
> +pmu_arch_init(void);
> +
> +/**
> + * Architecture specific PMU cleanup callback.
> + */
> +void
> +pmu_arch_fini(void);
> +
> +/**
> + * Apply architecture specific settings to config before passing it to syscall.
> + *
> + * @param config
> + * Architecture specific event configuration. Consult kernel sources for available options.
> + */
> +void
> +pmu_arch_fixup_config(uint64_t config[3]);
> +
> +#endif /* _PMU_PRIVATE_H_ */
> diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c
> new file mode 100644
> index 0000000000..dd57961627
> --- /dev/null
> +++ b/lib/pmu/rte_pmu.c
> @@ -0,0 +1,473 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2024 Marvell International Ltd.
> + */
> +
> +#include <ctype.h>
> +#include <dirent.h>
> +#include <errno.h>
> +#include <regex.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <sys/queue.h>
> +#include <sys/syscall.h>
> +#include <unistd.h>
> +
> +#include <rte_atomic.h>
> +#include <rte_bitops.h>
> +#include <rte_lcore.h>
> +#include <rte_log.h>
> +#include <rte_per_lcore.h>
> +#include <rte_pmu.h>
> +#include <rte_tailq.h>
> +
> +#include "pmu_private.h"
> +
> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
> +
> +#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) & (m))
> +
> +RTE_LOG_REGISTER_DEFAULT(rte_pmu_logtype, INFO)
> +#define RTE_LOGTYPE_PMU rte_pmu_logtype
> +
> +#define PMU_LOG(level, ...) \
> + RTE_LOG_LINE(level, PMU, ## __VA_ARGS__)
> +
> +/* A structure describing an event */
> +struct rte_pmu_event {
> + char *name;
> + unsigned int index;
> + TAILQ_ENTRY(rte_pmu_event) next;
> +};
> +
> +struct rte_pmu rte_pmu;
> +
> +/*
> + * The following __rte_weak functions provide default no-ops. Architectures should override them if
> + * necessary.
> + */
> +
> +int
> +__rte_weak pmu_arch_init(void)
> +{
> + return 0;
> +}
> +
> +void
> +__rte_weak pmu_arch_fini(void)
> +{
> +}
> +
> +void
> +__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3])
> +{
> +}
> +
> +static int
> +get_term_format(const char *name, int *num, uint64_t *mask)
> +{
> + char path[PATH_MAX];
> + char *config = NULL;
> + int high, low, ret;
> + FILE *fp;
> +
> + *num = *mask = 0;
> + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name, name);
> + fp = fopen(path, "r");
> + if (fp == NULL)
> + return -errno;
> +
> + errno = 0;
> + ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
> + if (ret < 2) {
> + ret = -ENODATA;
> + goto out;
> + }
> + if (errno) {
> + ret = -errno;
> + goto out;
> + }
> +
> + if (ret == 2)
> + high = low;
> +
> + *mask = RTE_GENMASK64(high, low);
> + /* Last digit should be [012]. If the last digit is missing, 0 is implied. */
> + *num = config[strlen(config) - 1];
> + *num = isdigit(*num) ? *num - '0' : 0;
> +
> + ret = 0;
> +out:
> + free(config);
> + fclose(fp);
> +
> + return ret;
> +}
> +
> +static int
> +parse_event(char *buf, uint64_t config[3])
> +{
> + char *token, *term;
> + int num, ret, val;
> + uint64_t mask;
> +
> + config[0] = config[1] = config[2] = 0;
> +
> + token = strtok(buf, ",");
> + while (token) {
> + errno = 0;
> + /* <term>=<value> */
> + ret = sscanf(token, "%m[^=]=%i", &term, &val);
> + if (ret < 1)
> + return -ENODATA;
> + if (errno)
> + return -errno;
> + if (ret == 1)
> + val = 1;
> +
> + ret = get_term_format(term, &num, &mask);
> + free(term);
> + if (ret)
> + return ret;
> +
> + config[num] |= FIELD_PREP(mask, val);
> + token = strtok(NULL, ",");
> + }
> +
> + return 0;
> +}
> +
> +static int
> +get_event_config(const char *name, uint64_t config[3])
> +{
> + char path[PATH_MAX], buf[BUFSIZ];
> + FILE *fp;
> + int ret;
> +
> + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
> + fp = fopen(path, "r");
> + if (fp == NULL)
> + return -errno;
> +
> + ret = fread(buf, 1, sizeof(buf), fp);
> + if (ret == 0) {
> + fclose(fp);
> +
> + return -EINVAL;
> + }
> + fclose(fp);
> + buf[ret] = '\0';
> +
> + return parse_event(buf, config);
> +}
> +
> +static int
> +do_perf_event_open(uint64_t config[3], int group_fd)
> +{
> + struct perf_event_attr attr = {
> + .size = sizeof(struct perf_event_attr),
> + .type = PERF_TYPE_RAW,
> + .exclude_kernel = 1,
> + .exclude_hv = 1,
> + .disabled = 1,
> + .pinned = group_fd == -1,
> + };
> +
> + pmu_arch_fixup_config(config);
> +
> + attr.config = config[0];
> + attr.config1 = config[1];
> + attr.config2 = config[2];
> +
> + return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0);
> +}
> +
> +static int
> +open_events(struct rte_pmu_event_group *group)
> +{
> + struct rte_pmu_event *event;
> + uint64_t config[3];
> + int num = 0, ret;
> +
> + /* group leader gets created first, with fd = -1 */
> + group->fds[0] = -1;
> +
> + TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
> + ret = get_event_config(event->name, config);
> + if (ret)
> + continue;
> +
> + ret = do_perf_event_open(config, group->fds[0]);
> + if (ret == -1) {
> + ret = -errno;
> + goto out;
> + }
> +
> + group->fds[event->index] = ret;
> + num++;
> + }
> +
> + return 0;
> +out:
> + for (--num; num >= 0; num--) {
> + close(group->fds[num]);
> + group->fds[num] = -1;
> + }
> +
> + return ret;
> +}
> +
> +static int
> +mmap_events(struct rte_pmu_event_group *group)
> +{
> + long page_size = sysconf(_SC_PAGE_SIZE);
> + unsigned int i;
> + void *addr;
> + int ret;
> +
> + for (i = 0; i < rte_pmu.num_group_events; i++) {
> + addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 0);
> + if (addr == MAP_FAILED) {
> + ret = -errno;
> + goto out;
> + }
> +
> + group->mmap_pages[i] = addr;
> + }
> +
> + return 0;
> +out:
> + for (; i; i--) {
> + munmap(group->mmap_pages[i - 1], page_size);
> + group->mmap_pages[i - 1] = NULL;
> + }
> +
> + return ret;
> +}
> +
> +static void
> +cleanup_events(struct rte_pmu_event_group *group)
> +{
> + unsigned int i;
> +
> + if (group->fds[0] != -1)
> + ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
> +
> + for (i = 0; i < rte_pmu.num_group_events; i++) {
> + if (group->mmap_pages[i]) {
> + munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE));
> + group->mmap_pages[i] = NULL;
> + }
> +
> + if (group->fds[i] != -1) {
> + close(group->fds[i]);
> + group->fds[i] = -1;
> + }
> + }
> +
> + group->enabled = false;
> +}
> +
> +int
> +__rte_pmu_enable_group(struct rte_pmu_event_group *group)
> +{
> + int ret;
> +
> + if (rte_pmu.num_group_events == 0)
> + return -ENODEV;
> +
> + ret = open_events(group);
> + if (ret)
> + goto out;
> +
> + ret = mmap_events(group);
> + if (ret)
> + goto out;
> +
> + if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
> + ret = -errno;
> + goto out;
> + }
> +
> + if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
> + ret = -errno;
> + goto out;
> + }
> +
> + group->enabled = true;
> +
> + return 0;
> +out:
> + cleanup_events(group);
> +
> + return ret;
> +}
> +
> +static int
> +scan_pmus(void)
> +{
> + char path[PATH_MAX];
> + struct dirent *dent;
> + const char *name;
> + DIR *dirp;
> +
> + dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
> + if (dirp == NULL)
> + return -errno;
> +
> + while ((dent = readdir(dirp))) {
> + name = dent->d_name;
> + if (name[0] == '.')
> + continue;
> +
> + /* sysfs entry should either contain cpus or be a cpu */
> + if (!strcmp(name, "cpu"))
> + break;
> +
> + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
> + if (access(path, F_OK) == 0)
> + break;
> + }
> +
> + if (dent) {
> + rte_pmu.name = strdup(name);
> + if (rte_pmu.name == NULL) {
> + closedir(dirp);
> +
> + return -ENOMEM;
> + }
> + }
> +
> + closedir(dirp);
> +
> + return rte_pmu.name ? 0 : -ENODEV;
> +}
> +
> +static struct rte_pmu_event *
> +new_event(const char *name)
> +{
> + struct rte_pmu_event *event;
> +
> + event = calloc(1, sizeof(*event));
> + if (event == NULL)
> + goto out;
> +
> + event->name = strdup(name);
> + if (event->name == NULL) {
> + free(event);
> + event = NULL;
> + }
> +
> +out:
> + return event;
> +}
> +
> +static void
> +free_event(struct rte_pmu_event *event)
> +{
> + free(event->name);
> + free(event);
> +}
> +
> +int
> +rte_pmu_add_event(const char *name)
> +{
> + struct rte_pmu_event *event;
> + char path[PATH_MAX];
> +
> + if (!rte_pmu.initialized) {
> + PMU_LOG(ERR, "PMU is not initialized");
> + return -ENODEV;
> + }
> +
> + if (rte_pmu.num_group_events + 1 >= RTE_MAX_NUM_GROUP_EVENTS) {
> + PMU_LOG(ERR, "Excessive number of events in a group (%d > %d)",
> + rte_pmu.num_group_events, RTE_MAX_NUM_GROUP_EVENTS);
> + return -ENOSPC;
> + }
> +
> + snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
> + if (access(path, R_OK)) {
> + PMU_LOG(ERR, "Cannot access %s", path);
> + return -ENODEV;
> + }
> +
> + TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
> + if (strcmp(event->name, name))
> + continue;
> +
> + return event->index;
> + }
> +
> + event = new_event(name);
> + if (event == NULL) {
> + PMU_LOG(ERR, "Failed to create event %s", name);
> + return -ENOMEM;
> + }
> +
> + event->index = rte_pmu.num_group_events++;
> + TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
> +
> + return event->index;
> +}
> +
> +int
> +rte_pmu_init(void)
> +{
> + int ret;
> +
> + if (rte_pmu.initialized)
> + return 0;
> +
> + ret = scan_pmus();
> + if (ret) {
> + PMU_LOG(ERR, "Failed to scan for event sources");
> + goto out;
> + }
> +
> + ret = pmu_arch_init();
> + if (ret) {
> + PMU_LOG(ERR, "Failed to setup arch internals");
> + goto out;
> + }
> +
> + TAILQ_INIT(&rte_pmu.event_list);
> + rte_pmu.initialized = 1;
> +out:
> + if (ret) {
> + free(rte_pmu.name);
> + rte_pmu.name = NULL;
> + }
> +
> + return ret;
> +}
> +
> +void
> +rte_pmu_fini(void)
> +{
> + struct rte_pmu_event *event, *tmp_event;
> + struct rte_pmu_event_group *group;
> + unsigned int i;
> +
> + if (!rte_pmu.initialized)
> + return;
> +
> + RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
> + TAILQ_REMOVE(&rte_pmu.event_list, event, next);
> + free_event(event);
> + }
> +
> + for (i = 0; i < RTE_DIM(rte_pmu.event_groups); i++) {
> + group = &rte_pmu.event_groups[i];
> + if (!group->enabled)
> + continue;
> +
> + cleanup_events(group);
> + }
> +
> + pmu_arch_fini();
> + free(rte_pmu.name);
> + rte_pmu.name = NULL;
> + rte_pmu.num_group_events = 0;
> +}
> diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h
> new file mode 100644
> index 0000000000..7d10a0dc56
> --- /dev/null
> +++ b/lib/pmu/rte_pmu.h
> @@ -0,0 +1,205 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2024 Marvell
> + */
> +
> +#ifndef _RTE_PMU_H_
> +#define _RTE_PMU_H_
> +
> +/**
> + * @file
> + *
> + * PMU event tracing operations
> + *
> + * This file defines generic API and types necessary to setup PMU and
> + * read selected counters at runtime. Exported APIs are generally not MT-safe.
> + * One exception is rte_pmu_read() which can be called concurrently once
> + * everything has been setup.
> + */
> +
From reading the code I can see that the PMU API is not MT-safe.
The only function that can run in parallel is rte_pmu_read(), correct?
All other combinations, let's say pmu_read() and add_event(), are not possible,
right?
If so, then it is probably worth articulating that explicitly in the public API
comments; after all, an extra comment doesn't hurt.
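
To illustrate with a sketch (not from the patch; the event name is just an example),
the only safe pattern would then be single-threaded setup followed by concurrent reads:

	/* main lcore, before workers start reading: not MT-safe */
	rte_pmu_init();
	int cycles = rte_pmu_add_event("cpu_cycles");

	/* any EAL worker lcore afterwards: the only MT-safe call */
	uint64_t value = rte_pmu_read(cycles);

	/* main lcore, once all readers are done: not MT-safe */
	rte_pmu_fini();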
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <errno.h>
> +#include <linux/perf_event.h>
> +
> +#include <rte_atomic.h>
> +#include <rte_branch_prediction.h>
> +#include <rte_common.h>
> +#include <rte_compat.h>
> +#include <rte_lcore.h>
> +
> +/** Maximum number of events in a group */
> +#define RTE_MAX_NUM_GROUP_EVENTS 8
> +
> +/**
> + * A structure describing a group of events.
> + */
> +struct __rte_cache_aligned rte_pmu_event_group {
> + /** array of user pages */
> + struct perf_event_mmap_page *mmap_pages[RTE_MAX_NUM_GROUP_EVENTS];
> + int fds[RTE_MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
> + TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */
> + bool enabled; /**< true if group was enabled on particular lcore */
> +};
> +
> +/**
> + * A PMU state container.
> + */
> +struct rte_pmu {
> + struct rte_pmu_event_group event_groups[RTE_MAX_LCORE]; /**< event groups */
> + unsigned int num_group_events; /**< number of events in a group */
> + unsigned int initialized; /**< initialization counter */
> + char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
> + TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
> +};
> +
> +/** PMU state container */
> +extern struct rte_pmu rte_pmu;
> +
> +/** Each architecture supporting PMU needs to provide its own version */
> +#ifndef rte_pmu_pmc_read
> +#define rte_pmu_pmc_read(index) ({ (void)(index); 0; })
> +#endif
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Read PMU counter.
> + *
> + * @warning This should not be called directly.
> + *
> + * @param pc
> + * Pointer to the mmapped user page.
> + * @return
> + * Counter value read from hardware.
> + */
> +__rte_experimental
> +static __rte_always_inline uint64_t
> +__rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
> +{
> +#define __RTE_PMU_READ_ONCE(x) (*(const volatile typeof(x) *)&(x))
> + uint64_t width, offset;
> + uint32_t seq, index;
> + int64_t pmc;
> +
> + for (;;) {
> + seq = __RTE_PMU_READ_ONCE(pc->lock);
> + rte_compiler_barrier();
> + index = __RTE_PMU_READ_ONCE(pc->index);
> + offset = __RTE_PMU_READ_ONCE(pc->offset);
> + width = __RTE_PMU_READ_ONCE(pc->pmc_width);
> +
> + /* index set to 0 means that particular counter cannot be used */
> + if (likely(pc->cap_user_rdpmc && index)) {
> + pmc = rte_pmu_pmc_read(index - 1);
> + pmc <<= 64 - width;
> + pmc >>= 64 - width;
> + offset += pmc;
> + }
> +
> + rte_compiler_barrier();
> +
> + if (likely(__RTE_PMU_READ_ONCE(pc->lock) == seq))
> + return offset;
> + }
> +
> + return 0;
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Enable group of events on the calling lcore.
> + *
> + * @warning This should not be called directly.
> + *
> + * @param group
> + * Pointer to the group which will be enabled.
> + * @return
> + * 0 in case of success, negative value otherwise.
> + */
> +__rte_experimental
> +int
> +__rte_pmu_enable_group(struct rte_pmu_event_group *group);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Initialize PMU library.
> + *
> + * @return
> + * 0 in case of success, negative value otherwise.
> + */
> +__rte_experimental
> +int
> +rte_pmu_init(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Finalize PMU library.
> + */
> +__rte_experimental
> +void
> +rte_pmu_fini(void);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Add event to the group of enabled events.
> + *
> + * @param name
> + * Name of an event listed under /sys/bus/event_source/devices/<pmu>/events.
> + * @return
> + * Event index in case of success, negative value otherwise.
> + */
> +__rte_experimental
> +int
> +rte_pmu_add_event(const char *name);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Read hardware counter configured to count occurrences of an event.
> + *
> + * @param index
> + * Index of an event to be read.
> + * @return
> + * Event value read from the register. In case of errors or lack of support
> + * 0 is returned. In other words, a stream of zeros in a trace file
> + * indicates a problem with reading a particular PMU event register.
> + */
As I remember, that function is meant to be called from a thread
bound to one particular CPU.
If that is still the case, please state it explicitly in the comments above.
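
For illustration, a sketch (not from the patch): since each group lives in a per-lcore
slot, a read is expected to run on the lcore that owns it, e.g. launched via
rte_eal_remote_launch():

	static int
	read_on_lcore(void *arg)
	{
		unsigned int index = *(unsigned int *)arg;
		uint64_t value;

		/* runs on the launched worker lcore, which EAL pins to one cpu */
		value = rte_pmu_read(index);
		RTE_SET_USED(value);

		return 0;
	}

	/* on the main lcore */
	rte_eal_remote_launch(read_on_lcore, &index, worker_id);
	rte_eal_wait_lcore(worker_id);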
> +__rte_experimental
> +static __rte_always_inline uint64_t
> +rte_pmu_read(unsigned int index)
> +{
> + struct rte_pmu_event_group *group = &rte_pmu.event_groups[rte_lcore_id()];
I think we'd better check the value returned by rte_lcore_id() here before using it as an index.
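
Something along these lines perhaps (a sketch); rte_lcore_id() returns LCORE_ID_ANY
for non-EAL threads, which would otherwise index out of bounds:

	unsigned int lcore_id = rte_lcore_id();

	/* reject non-EAL threads; LCORE_ID_ANY is UINT32_MAX */
	if (unlikely(lcore_id >= RTE_MAX_LCORE))
		return 0;

	struct rte_pmu_event_group *group = &rte_pmu.event_groups[lcore_id];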
> + int ret;
> +
> + if (unlikely(!rte_pmu.initialized))
> + return 0;
> +
> + if (unlikely(!group->enabled)) {
> + ret = __rte_pmu_enable_group(group);
> + if (ret)
> + return 0;
> + }
> +
> + if (unlikely(index >= rte_pmu.num_group_events))
> + return 0;
> +
> + return __rte_pmu_read_userpage(group->mmap_pages[index]);
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_PMU_H_ */
> diff --git a/lib/pmu/version.map b/lib/pmu/version.map
> new file mode 100644
> index 0000000000..d0f907d13d
> --- /dev/null
> +++ b/lib/pmu/version.map
> @@ -0,0 +1,13 @@
> +EXPERIMENTAL {
> + global:
> +
> + # added in 24.11
> + __rte_pmu_enable_group;
> + rte_pmu;
> + rte_pmu_add_event;
> + rte_pmu_fini;
> + rte_pmu_init;
> + rte_pmu_read;
> +
> + local: *;
> +};
> --
> 2.34.1
>-----Original Message-----
>From: Konstantin Ananyev <konstantin.ananyev@huawei.com>
>Sent: Tuesday, November 5, 2024 11:58 AM
>To: Tomasz Duszynski <tduszynski@marvell.com>; Thomas Monjalon <thomas@monjalon.net>
>Cc: Ruifeng.Wang@arm.com; bruce.richardson@intel.com; david.marchand@redhat.com; dev@dpdk.org;
>Jerin Jacob <jerinj@marvell.com>; konstantin.v.ananyev@yandex.ru; mattias.ronnblom@ericsson.com;
>mb@smartsharesystems.com; roretzla@linux.microsoft.com; stephen@networkplumber.org;
>zhoumin@loongson.cn
>Subject: [EXTERNAL] RE: [PATCH v15 1/4] lib: add generic support for reading PMU events
>
>> Add support for programming PMU counters and reading their values at
>> runtime, bypassing the kernel completely.
>>
>> This is especially useful in cases where CPU cores are isolated, i.e.
>> run dedicated tasks. In such cases one cannot use the standard perf
>> utility without sacrificing latency and performance.
>
>LGTM in general, just a few questions and nits - the majority about docs/comments.
>
>> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
>>
>[...]
>
>> +/**
>> + * @file
>> + *
>> + * PMU event tracing operations
>> + *
>> + * This file defines generic API and types necessary to setup PMU and
>> + * read selected counters at runtime. Exported APIs are generally not MT-safe.
>> + * One exception is rte_pmu_read() which can be called concurrently once
>> + * everything has been setup.
>> + */
>> +
>
>From reading the code I can see that the PMU API is not MT-safe.
>The only function that can run in parallel is rte_pmu_read(), correct?
>All other combinations, let's say pmu_read() and add_event(), are not possible, right?
>If so, then it is probably worth articulating that explicitly in the public API comments;
>after all, an extra comment doesn't hurt.
>
Yes, in this version rte_pmu_read() is the only function that can be called concurrently
on different lcores. Okay, I'll try to rewrite the existing comment to make that more explicit.
>
>[...]
>
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice
>> + *
>> + * Read hardware counter configured to count occurrences of an event.
>> + *
>> + * @param index
>> + * Index of an event to be read.
>> + * @return
>> + * Event value read from the register. In case of errors or lack of support
>> + * 0 is returned. In other words, a stream of zeros in a trace file
>> + * indicates a problem with reading a particular PMU event register.
>> + */
>
>As I remember, that function is meant to be called from a thread bound to one particular CPU.
>If that is still the case, please state it explicitly in the comments above.
>
Yes, that's still the case. Will mention that.
>> +__rte_experimental
>> +static __rte_always_inline uint64_t
>> +rte_pmu_read(unsigned int index)
>> +{
>> + struct rte_pmu_event_group *group = &rte_pmu.event_groups[rte_lcore_id()];
>
>I think we'd better check the value returned by rte_lcore_id() here before using it as an index.
>
I'll add this check. Hopefully it will disappear once this gets switched to per-lcore
variables (provided performance doesn't deteriorate).
>
>[...]
>
>> --
>> 2.34.1
@@ -1816,6 +1816,11 @@ M: Nithin Dabilpuram <ndabilpuram@marvell.com>
M: Pavan Nikhilesh <pbhagavatula@marvell.com>
F: lib/node/
+PMU - EXPERIMENTAL
+M: Tomasz Duszynski <tduszynski@marvell.com>
+F: lib/pmu/
+F: app/test/test_pmu*
+
Test Applications
-----------------
@@ -141,6 +141,7 @@ source_file_deps = {
'test_pmd_perf.c': ['ethdev', 'net'] + packet_burst_generator_deps,
'test_pmd_ring.c': ['net_ring', 'ethdev', 'bus_vdev'],
'test_pmd_ring_perf.c': ['ethdev', 'net_ring', 'bus_vdev'],
+ 'test_pmu.c': ['pmu'],
'test_power.c': ['power'],
'test_power_cpufreq.c': ['power'],
'test_power_intel_uncore.c': ['power'],
new file mode 100644
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2024 Marvell International Ltd.
+ */
+
+#include <rte_pmu.h>
+
+#include "test.h"
+
+static int
+test_pmu_read(void)
+{
+ const char *name = NULL;
+ int tries = 10, event;
+ uint64_t val = 0;
+
+ if (name == NULL) {
+ printf("PMU not supported on this arch\n");
+ return TEST_SKIPPED;
+ }
+
+ if (rte_pmu_init() < 0)
+ return TEST_FAILED;
+
+ event = rte_pmu_add_event(name);
+ while (tries--)
+ val += rte_pmu_read(event);
+
+ rte_pmu_fini();
+
+ return val ? TEST_SUCCESS : TEST_FAILED;
+}
+
+static struct unit_test_suite pmu_tests = {
+ .suite_name = "pmu autotest",
+ .setup = NULL,
+ .teardown = NULL,
+ .unit_test_cases = {
+ TEST_CASE(test_pmu_read),
+ TEST_CASES_END()
+ }
+};
+
+static int
+test_pmu(void)
+{
+ return unit_test_suite_runner(&pmu_tests);
+}
+
+REGISTER_FAST_TEST(pmu_autotest, true, true, test_pmu);
@@ -240,7 +240,8 @@ The public API headers are grouped by topics:
[log](@ref rte_log.h),
[errno](@ref rte_errno.h),
[trace](@ref rte_trace.h),
- [trace_point](@ref rte_trace_point.h)
+ [trace_point](@ref rte_trace_point.h),
+ [pmu](@ref rte_pmu.h)
- **misc**:
[EAL config](@ref rte_eal.h),
@@ -69,6 +69,7 @@ INPUT = @TOPDIR@/doc/api/doxy-api-index.md \
@TOPDIR@/lib/pdcp \
@TOPDIR@/lib/pdump \
@TOPDIR@/lib/pipeline \
+ @TOPDIR@/lib/pmu \
@TOPDIR@/lib/port \
@TOPDIR@/lib/power \
@TOPDIR@/lib/ptr_compress \
@@ -7,6 +7,35 @@ Profile Your Application
The following sections describe methods of profiling DPDK applications on
different architectures.
+Performance counter based profiling
+-----------------------------------
+
+Majority of architectures support some performance monitoring unit (PMU).
+Such unit provides programmable counters that monitor specific events.
+
+Different tools gather that information, like for example perf.
+However, in some scenarios when CPU cores are isolated and run
+dedicated tasks interrupting those tasks with perf may be undesirable.
+
+In such cases, an application can use the PMU library to read such events via ``rte_pmu_read()``.
+
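+A minimal usage sketch is shown below. Error handling is omitted for brevity,
+and the event name (``cpu_cycles`` here) is architecture specific.
+
+.. code-block:: c
+
+   #include <rte_pmu.h>
+
+   int event;
+   uint64_t value;
+
+   rte_pmu_init();
+   event = rte_pmu_add_event("cpu_cycles");
+   value = rte_pmu_read(event);
+   rte_pmu_fini();
+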
+By default, userspace applications are not allowed to access PMU internals. That can be changed
+by setting ``/proc/sys/kernel/perf_event_paranoid`` to 2 (which should be the default value anyway)
+and adding the ``CAP_PERFMON`` capability to the DPDK application. Please refer to
+``Documentation/admin-guide/perf-security.rst`` in the Linux sources for more information. A fairly
+recent kernel, i.e. >= 5.9, is advised too.
+
+As of now, the implementation imposes certain limitations:
+
+* Management APIs that normally return a non-negative value will return an error (``-ENOTSUP``)
+  while ``rte_pmu_read()`` will return ``UINT64_MAX`` when running on an unsupported operating system.
+
+* Only EAL lcores are supported.
+
+* EAL lcores must not share a CPU.
+
+* Each EAL lcore measures the same group of events.
+
Profiling on x86
----------------
diff --git a/doc/guides/rel_notes/release_24_11.rst b/doc/guides/rel_notes/release_24_11.rst
@@ -247,6 +247,13 @@ New Features
Added ability for node to advertise and update multiple xstat counters,
that can be retrieved using ``rte_graph_cluster_stats_get``.
+* **Added PMU library.**
+
+ Added a new performance monitoring unit (PMU) library, which allows applications
+ to perform self-monitoring activities without depending on external utilities like perf.
+ After integration with the :doc:`../prog_guide/trace_lib`, data gathered from hardware counters
+ can be stored in CTF format for further analysis.
+
Removed Items
-------------
diff --git a/lib/eal/meson.build b/lib/eal/meson.build
@@ -18,6 +18,9 @@ deps += ['log', 'kvargs']
if not is_windows
deps += ['telemetry']
endif
+if dpdk_conf.has('RTE_LIB_PMU')
+ deps += ['pmu']
+endif
if dpdk_conf.has('RTE_USE_LIBBSD')
ext_deps += libbsd
endif
diff --git a/lib/meson.build b/lib/meson.build
@@ -13,6 +13,7 @@ libraries = [
'kvargs', # eal depends on kvargs
'argparse',
'telemetry', # basic info querying
+ 'pmu',
'eal', # everything depends on eal
'ptr_compress',
'ring',
diff --git a/lib/pmu/meson.build b/lib/pmu/meson.build
new file mode 100644
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(C) 2024 Marvell International Ltd.
+
+if not is_linux
+ build = false
+ reason = 'only supported on Linux'
+ subdir_done()
+endif
+
+headers = files('rte_pmu.h')
+sources = files('rte_pmu.c')
+
+deps += ['log']
diff --git a/lib/pmu/pmu_private.h b/lib/pmu/pmu_private.h
new file mode 100644
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Marvell
+ */
+
+#ifndef _PMU_PRIVATE_H_
+#define _PMU_PRIVATE_H_
+
+/**
+ * Architecture specific PMU init callback.
+ *
+ * @return
+ * 0 in case of success, negative value otherwise.
+ */
+int
+pmu_arch_init(void);
+
+/**
+ * Architecture specific PMU cleanup callback.
+ */
+void
+pmu_arch_fini(void);
+
+/**
+ * Apply architecture-specific settings to config before passing it to the syscall.
+ *
+ * @param config
+ * Architecture specific event configuration. Consult kernel sources for available options.
+ */
+void
+pmu_arch_fixup_config(uint64_t config[3]);
+
+#endif /* _PMU_PRIVATE_H_ */
diff --git a/lib/pmu/rte_pmu.c b/lib/pmu/rte_pmu.c
new file mode 100644
@@ -0,0 +1,473 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2024 Marvell International Ltd.
+ */
+
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <regex.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <rte_atomic.h>
+#include <rte_bitops.h>
+#include <rte_lcore.h>
+#include <rte_log.h>
+#include <rte_per_lcore.h>
+#include <rte_pmu.h>
+#include <rte_tailq.h>
+
+#include "pmu_private.h"
+
+#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
+
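+/* Insert value v into the bit field described by mask m, akin to the kernel's FIELD_PREP(). */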
+#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) & (m))
+
+RTE_LOG_REGISTER_DEFAULT(rte_pmu_logtype, INFO)
+#define RTE_LOGTYPE_PMU rte_pmu_logtype
+
+#define PMU_LOG(level, ...) \
+ RTE_LOG_LINE(level, PMU, ## __VA_ARGS__)
+
+/* A structure describing an event */
+struct rte_pmu_event {
+ char *name;
+ unsigned int index;
+ TAILQ_ENTRY(rte_pmu_event) next;
+};
+
+struct rte_pmu rte_pmu;
+
+/*
+ * The following __rte_weak functions provide default no-ops. Architectures should
+ * override them if necessary.
+ */
+
+int
+__rte_weak pmu_arch_init(void)
+{
+ return 0;
+}
+
+void
+__rte_weak pmu_arch_fini(void)
+{
+}
+
+void
+__rte_weak pmu_arch_fixup_config(uint64_t __rte_unused config[3])
+{
+}
+
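+/*
+ * Parse a format file, e.g. /sys/bus/event_source/devices/<pmu>/format/event containing
+ * "config:0-7", into the config word number (0-2, implied by the trailing digit of the
+ * config token) and a bitmask covering the given bit range.
+ */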
+static int
+get_term_format(const char *name, int *num, uint64_t *mask)
+{
+ char path[PATH_MAX];
+ char *config = NULL;
+ int high, low, ret;
+ FILE *fp;
+
+ *num = *mask = 0;
+ snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name, name);
+ fp = fopen(path, "r");
+ if (fp == NULL)
+ return -errno;
+
+ errno = 0;
+ ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
+ if (ret < 2) {
+ ret = -ENODATA;
+ goto out;
+ }
+ if (errno) {
+ ret = -errno;
+ goto out;
+ }
+
+ if (ret == 2)
+ high = low;
+
+ *mask = RTE_GENMASK64(high, low);
+ /* The last digit should be [012]. If the last digit is missing, 0 is implied. */
+ *num = config[strlen(config) - 1];
+ *num = isdigit(*num) ? *num - '0' : 0;
+
+ ret = 0;
+out:
+ free(config);
+ fclose(fp);
+
+ return ret;
+}
+
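+/*
+ * Parse an event description, e.g. "event=0x11,umask=0x2", resolving each term against its
+ * format file and OR-ing the result into the config[3] array passed to perf_event_open().
+ */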
+static int
+parse_event(char *buf, uint64_t config[3])
+{
+ char *token, *term;
+ int num, ret, val;
+ uint64_t mask;
+
+ config[0] = config[1] = config[2] = 0;
+
+ token = strtok(buf, ",");
+ while (token) {
+ errno = 0;
+ /* <term>=<value> */
+ ret = sscanf(token, "%m[^=]=%i", &term, &val);
+ if (ret < 1)
+ return -ENODATA;
+ if (errno)
+ return -errno;
+ if (ret == 1)
+ val = 1;
+
+ ret = get_term_format(term, &num, &mask);
+ free(term);
+ if (ret)
+ return ret;
+
+ config[num] |= FIELD_PREP(mask, val);
+ token = strtok(NULL, ",");
+ }
+
+ return 0;
+}
+
+static int
+get_event_config(const char *name, uint64_t config[3])
+{
+ char path[PATH_MAX], buf[BUFSIZ];
+ FILE *fp;
+ int ret;
+
+ snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
+ fp = fopen(path, "r");
+ if (fp == NULL)
+ return -errno;
+
+ ret = fread(buf, 1, sizeof(buf) - 1, fp);
+ if (ret == 0) {
+ fclose(fp);
+
+ return -EINVAL;
+ }
+ fclose(fp);
+ buf[ret] = '\0';
+
+ return parse_event(buf, config);
+}
+
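+/*
+ * Open a single counter for the calling thread via perf_event_open(). The first event in a
+ * group is opened with group_fd == -1 and becomes the group leader; remaining events attach
+ * to the leader's fd. Counting is limited to user space and stays disabled until the whole
+ * group is enabled.
+ */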
+static int
+do_perf_event_open(uint64_t config[3], int group_fd)
+{
+ struct perf_event_attr attr = {
+ .size = sizeof(struct perf_event_attr),
+ .type = PERF_TYPE_RAW,
+ .exclude_kernel = 1,
+ .exclude_hv = 1,
+ .disabled = 1,
+ .pinned = group_fd == -1,
+ };
+
+ pmu_arch_fixup_config(config);
+
+ attr.config = config[0];
+ attr.config1 = config[1];
+ attr.config2 = config[2];
+
+ return syscall(SYS_perf_event_open, &attr, 0, -1, group_fd, 0);
+}
+
+static int
+open_events(struct rte_pmu_event_group *group)
+{
+ struct rte_pmu_event *event;
+ uint64_t config[3];
+ int num = 0, ret;
+
+ /* group leader gets created first, with fd = -1 */
+ group->fds[0] = -1;
+
+ TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
+ ret = get_event_config(event->name, config);
+ if (ret)
+ continue;
+
+ ret = do_perf_event_open(config, group->fds[0]);
+ if (ret == -1) {
+ ret = -errno;
+ goto out;
+ }
+
+ group->fds[event->index] = ret;
+ num++;
+ }
+
+ return 0;
+out:
+ for (--num; num >= 0; num--) {
+ close(group->fds[num]);
+ group->fds[num] = -1;
+ }
+
+ return ret;
+}
+
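+/*
+ * Map the first page of every event fd. The kernel keeps counter metadata there
+ * (rdpmc index, offset, counter width), which __rte_pmu_read_userpage() uses to read
+ * counters directly, without any syscalls.
+ */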
+static int
+mmap_events(struct rte_pmu_event_group *group)
+{
+ long page_size = sysconf(_SC_PAGE_SIZE);
+ unsigned int i;
+ void *addr;
+ int ret;
+
+ for (i = 0; i < rte_pmu.num_group_events; i++) {
+ addr = mmap(0, page_size, PROT_READ, MAP_SHARED, group->fds[i], 0);
+ if (addr == MAP_FAILED) {
+ ret = -errno;
+ goto out;
+ }
+
+ group->mmap_pages[i] = addr;
+ }
+
+ return 0;
+out:
+ for (; i; i--) {
+ munmap(group->mmap_pages[i - 1], page_size);
+ group->mmap_pages[i - 1] = NULL;
+ }
+
+ return ret;
+}
+
+static void
+cleanup_events(struct rte_pmu_event_group *group)
+{
+ unsigned int i;
+
+ if (group->fds[0] != -1)
+ ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
+
+ for (i = 0; i < rte_pmu.num_group_events; i++) {
+ if (group->mmap_pages[i]) {
+ munmap(group->mmap_pages[i], sysconf(_SC_PAGE_SIZE));
+ group->mmap_pages[i] = NULL;
+ }
+
+ if (group->fds[i] != -1) {
+ close(group->fds[i]);
+ group->fds[i] = -1;
+ }
+ }
+
+ group->enabled = false;
+}
+
+int
+__rte_pmu_enable_group(struct rte_pmu_event_group *group)
+{
+ int ret;
+
+ if (rte_pmu.num_group_events == 0)
+ return -ENODEV;
+
+ ret = open_events(group);
+ if (ret)
+ goto out;
+
+ ret = mmap_events(group);
+ if (ret)
+ goto out;
+
+ if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
+ ret = -errno;
+ goto out;
+ }
+
+ if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
+ ret = -errno;
+ goto out;
+ }
+
+ group->enabled = true;
+
+ return 0;
+out:
+ cleanup_events(group);
+
+ return ret;
+}
+
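+/*
+ * Scan /sys/bus/event_source/devices for the core PMU, i.e. the first entry which either is
+ * named "cpu" or exposes a cpus file.
+ */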
+static int
+scan_pmus(void)
+{
+ char path[PATH_MAX];
+ struct dirent *dent;
+ const char *name;
+ DIR *dirp;
+
+ dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
+ if (dirp == NULL)
+ return -errno;
+
+ while ((dent = readdir(dirp))) {
+ name = dent->d_name;
+ if (name[0] == '.')
+ continue;
+
+ /* a core PMU sysfs entry is either named "cpu" or contains a cpus file */
+ if (!strcmp(name, "cpu"))
+ break;
+
+ snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
+ if (access(path, F_OK) == 0)
+ break;
+ }
+
+ if (dent) {
+ rte_pmu.name = strdup(name);
+ if (rte_pmu.name == NULL) {
+ closedir(dirp);
+
+ return -ENOMEM;
+ }
+ }
+
+ closedir(dirp);
+
+ return rte_pmu.name ? 0 : -ENODEV;
+}
+
+static struct rte_pmu_event *
+new_event(const char *name)
+{
+ struct rte_pmu_event *event;
+
+ event = calloc(1, sizeof(*event));
+ if (event == NULL)
+ goto out;
+
+ event->name = strdup(name);
+ if (event->name == NULL) {
+ free(event);
+ event = NULL;
+ }
+
+out:
+ return event;
+}
+
+static void
+free_event(struct rte_pmu_event *event)
+{
+ free(event->name);
+ free(event);
+}
+
+int
+rte_pmu_add_event(const char *name)
+{
+ struct rte_pmu_event *event;
+ char path[PATH_MAX];
+
+ if (!rte_pmu.initialized) {
+ PMU_LOG(ERR, "PMU is not initialized");
+ return -ENODEV;
+ }
+
+ if (rte_pmu.num_group_events >= RTE_MAX_NUM_GROUP_EVENTS) {
+ PMU_LOG(ERR, "Excessive number of events in a group (%u >= %d)",
+ rte_pmu.num_group_events, RTE_MAX_NUM_GROUP_EVENTS);
+ return -ENOSPC;
+ }
+
+ snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
+ if (access(path, R_OK)) {
+ PMU_LOG(ERR, "Cannot access %s", path);
+ return -ENODEV;
+ }
+
+ TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
+ if (strcmp(event->name, name))
+ continue;
+
+ return event->index;
+ }
+
+ event = new_event(name);
+ if (event == NULL) {
+ PMU_LOG(ERR, "Failed to create event %s", name);
+ return -ENOMEM;
+ }
+
+ event->index = rte_pmu.num_group_events++;
+ TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
+
+ return event->index;
+}
+
+int
+rte_pmu_init(void)
+{
+ int ret;
+
+ if (rte_pmu.initialized)
+ return 0;
+
+ ret = scan_pmus();
+ if (ret) {
+ PMU_LOG(ERR, "Failed to scan for event sources");
+ goto out;
+ }
+
+ ret = pmu_arch_init();
+ if (ret) {
+ PMU_LOG(ERR, "Failed to setup arch internals");
+ goto out;
+ }
+
+ TAILQ_INIT(&rte_pmu.event_list);
+ rte_pmu.initialized = 1;
+out:
+ if (ret) {
+ free(rte_pmu.name);
+ rte_pmu.name = NULL;
+ }
+
+ return ret;
+}
+
+void
+rte_pmu_fini(void)
+{
+ struct rte_pmu_event *event, *tmp_event;
+ struct rte_pmu_event_group *group;
+ unsigned int i;
+
+ if (!rte_pmu.initialized)
+ return;
+
+ RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp_event) {
+ TAILQ_REMOVE(&rte_pmu.event_list, event, next);
+ free_event(event);
+ }
+
+ for (i = 0; i < RTE_DIM(rte_pmu.event_groups); i++) {
+ group = &rte_pmu.event_groups[i];
+ if (!group->enabled)
+ continue;
+
+ cleanup_events(group);
+ }
+
+ pmu_arch_fini();
+ free(rte_pmu.name);
+ rte_pmu.name = NULL;
+ rte_pmu.num_group_events = 0;
+}
diff --git a/lib/pmu/rte_pmu.h b/lib/pmu/rte_pmu.h
new file mode 100644
@@ -0,0 +1,205 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024 Marvell
+ */
+
+#ifndef _RTE_PMU_H_
+#define _RTE_PMU_H_
+
+/**
+ * @file
+ *
+ * PMU event tracing operations
+ *
+ * This file defines the generic API and types necessary to set up the PMU and
+ * read selected counters at runtime. Exported APIs are generally not MT-safe.
+ * One exception is rte_pmu_read(), which can be called concurrently once
+ * everything has been set up.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <errno.h>
+#include <linux/perf_event.h>
+
+#include <rte_atomic.h>
+#include <rte_branch_prediction.h>
+#include <rte_common.h>
+#include <rte_compat.h>
+#include <rte_lcore.h>
+
+/** Maximum number of events in a group */
+#define RTE_MAX_NUM_GROUP_EVENTS 8
+
+/**
+ * A structure describing a group of events.
+ */
+struct __rte_cache_aligned rte_pmu_event_group {
+ /** array of user pages */
+ struct perf_event_mmap_page *mmap_pages[RTE_MAX_NUM_GROUP_EVENTS];
+ int fds[RTE_MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
+ TAILQ_ENTRY(rte_pmu_event_group) next; /**< list entry */
+ bool enabled; /**< true if the group was enabled on a particular lcore */
+};
+
+/**
+ * A PMU state container.
+ */
+struct rte_pmu {
+ struct rte_pmu_event_group event_groups[RTE_MAX_LCORE]; /**< event groups */
+ unsigned int num_group_events; /**< number of events in a group */
+ unsigned int initialized; /**< non-zero if the library has been initialized */
+ char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
+ TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
+};
+
+/** PMU state container */
+extern struct rte_pmu rte_pmu;
+
+/** Each architecture supporting PMU needs to provide its own version */
+#ifndef rte_pmu_pmc_read
+#define rte_pmu_pmc_read(index) ({ (void)(index); 0; })
+#endif
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Read PMU counter.
+ *
+ * @warning This should not be called directly.
+ *
+ * @param pc
+ * Pointer to the mmapped user page.
+ * @return
+ * Counter value read from hardware.
+ */
+__rte_experimental
+static __rte_always_inline uint64_t
+__rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
+{
+#define __RTE_PMU_READ_ONCE(x) (*(const volatile typeof(x) *)&(x))
+ uint64_t width, offset;
+ uint32_t seq, index;
+ int64_t pmc;
+
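+	/*
+	 * pc->lock is a seqcount; the kernel increments it before and after updating the page,
+	 * so loop until two consecutive reads return the same value, i.e. a consistent snapshot.
+	 */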
+ for (;;) {
+ seq = __RTE_PMU_READ_ONCE(pc->lock);
+ rte_compiler_barrier();
+ index = __RTE_PMU_READ_ONCE(pc->index);
+ offset = __RTE_PMU_READ_ONCE(pc->offset);
+ width = __RTE_PMU_READ_ONCE(pc->pmc_width);
+
+ /* an index of 0 means that this particular counter cannot be used */
+ if (likely(pc->cap_user_rdpmc && index)) {
+ pmc = rte_pmu_pmc_read(index - 1);
+ pmc <<= 64 - width;
+ pmc >>= 64 - width;
+ offset += pmc;
+ }
+
+ rte_compiler_barrier();
+
+ if (likely(__RTE_PMU_READ_ONCE(pc->lock) == seq))
+ return offset;
+ }
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Enable group of events on the calling lcore.
+ *
+ * @warning This should not be called directly.
+ *
+ * @param group
+ * Pointer to the group which will be enabled.
+ * @return
+ * 0 in case of success, negative value otherwise.
+ */
+__rte_experimental
+int
+__rte_pmu_enable_group(struct rte_pmu_event_group *group);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Initialize PMU library.
+ *
+ * @return
+ * 0 in case of success, negative value otherwise.
+ */
+__rte_experimental
+int
+rte_pmu_init(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Finalize PMU library.
+ */
+__rte_experimental
+void
+rte_pmu_fini(void);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Add event to the group of enabled events.
+ *
+ * @param name
+ * Name of an event listed under /sys/bus/event_source/devices/<pmu>/events.
+ * @return
+ * Event index in case of success, negative value otherwise.
+ */
+__rte_experimental
+int
+rte_pmu_add_event(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Read hardware counter configured to count occurrences of an event.
+ *
+ * @param index
+ * Index of an event to be read.
+ * @return
+ * Event value read from the register. In case of errors or lack of support,
+ * 0 is returned. In other words, a stream of zeros in a trace file
+ * indicates a problem with reading a particular PMU event register.
+ */
+__rte_experimental
+static __rte_always_inline uint64_t
+rte_pmu_read(unsigned int index)
+{
+ struct rte_pmu_event_group *group = &rte_pmu.event_groups[rte_lcore_id()];
+ int ret;
+
+ if (unlikely(!rte_pmu.initialized))
+ return 0;
+
+ if (unlikely(!group->enabled)) {
+ ret = __rte_pmu_enable_group(group);
+ if (ret)
+ return 0;
+ }
+
+ if (unlikely(index >= rte_pmu.num_group_events))
+ return 0;
+
+ return __rte_pmu_read_userpage(group->mmap_pages[index]);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PMU_H_ */
diff --git a/lib/pmu/version.map b/lib/pmu/version.map
new file mode 100644
@@ -0,0 +1,13 @@
+EXPERIMENTAL {
+ global:
+
+ # added in 24.11
+ __rte_pmu_enable_group;
+ rte_pmu;
+ rte_pmu_add_event;
+ rte_pmu_fini;
+ rte_pmu_init;
+ rte_pmu_read;
+
+ local: *;
+};