[1/4] eal: add generic support for reading PMU events

Message ID 20221111094338.2736065-2-tduszynski@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series add support for self monitoring |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Tomasz Duszynski Nov. 11, 2022, 9:43 a.m. UTC
  Add support for programming PMU counters and reading their values
at runtime, bypassing the kernel completely.

This is especially useful in cases where CPU cores are isolated
(nohz_full), i.e. run dedicated tasks. In such cases one cannot use
the standard perf utility without sacrificing latency and performance.

Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
---
 app/test/meson.build                  |   1 +
 app/test/test_pmu.c                   |  41 +++
 doc/guides/prog_guide/profile_app.rst |   8 +
 lib/eal/common/meson.build            |   3 +
 lib/eal/common/pmu_private.h          |  41 +++
 lib/eal/common/rte_pmu.c              | 455 ++++++++++++++++++++++++++
 lib/eal/include/meson.build           |   1 +
 lib/eal/include/rte_pmu.h             | 204 ++++++++++++
 lib/eal/linux/eal.c                   |   4 +
 lib/eal/version.map                   |   3 +
 10 files changed, 761 insertions(+)
 create mode 100644 app/test/test_pmu.c
 create mode 100644 lib/eal/common/pmu_private.h
 create mode 100644 lib/eal/common/rte_pmu.c
 create mode 100644 lib/eal/include/rte_pmu.h
  

Comments

Mattias Rönnblom Dec. 15, 2022, 8:33 a.m. UTC | #1
On 2022-11-11 10:43, Tomasz Duszynski wrote:
> Add support for programming PMU counters and reading their values
> in runtime bypassing kernel completely.
> 
> This is especially useful in cases where CPU cores are isolated
> (nohz_full) i.e run dedicated tasks. In such cases one cannot use
> standard perf utility without sacrificing latency and performance.
> 
> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
> ---
>   app/test/meson.build                  |   1 +
>   app/test/test_pmu.c                   |  41 +++
>   doc/guides/prog_guide/profile_app.rst |   8 +
>   lib/eal/common/meson.build            |   3 +
>   lib/eal/common/pmu_private.h          |  41 +++
>   lib/eal/common/rte_pmu.c              | 455 ++++++++++++++++++++++++++
>   lib/eal/include/meson.build           |   1 +
>   lib/eal/include/rte_pmu.h             | 204 ++++++++++++
>   lib/eal/linux/eal.c                   |   4 +
>   lib/eal/version.map                   |   3 +
>   10 files changed, 761 insertions(+)
>   create mode 100644 app/test/test_pmu.c
>   create mode 100644 lib/eal/common/pmu_private.h
>   create mode 100644 lib/eal/common/rte_pmu.c
>   create mode 100644 lib/eal/include/rte_pmu.h
> 
> diff --git a/app/test/meson.build b/app/test/meson.build
> index f34d19e3c3..93b3300309 100644
> --- a/app/test/meson.build
> +++ b/app/test/meson.build
> @@ -143,6 +143,7 @@ test_sources = files(
>           'test_timer_racecond.c',
>           'test_timer_secondary.c',
>           'test_ticketlock.c',
> +        'test_pmu.c',
>           'test_trace.c',
>           'test_trace_register.c',
>           'test_trace_perf.c',
> diff --git a/app/test/test_pmu.c b/app/test/test_pmu.c
> new file mode 100644
> index 0000000000..fd331af9ee
> --- /dev/null
> +++ b/app/test/test_pmu.c
> @@ -0,0 +1,41 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2022 Marvell International Ltd.
> + */
> +
> +#include <rte_pmu.h>
> +
> +#include "test.h"
> +
> +static int
> +test_pmu_read(void)
> +{
> +	uint64_t val = 0;
> +	int tries = 10;
> +	int event = -1;
> +
> +	while (tries--)
> +		val += rte_pmu_read(event);
> +
> +	if (val == 0)
> +		return TEST_FAILED;
> +
> +	return TEST_SUCCESS;
> +}
> +
> +static struct unit_test_suite pmu_tests = {
> +	.suite_name = "pmu autotest",
> +	.setup = NULL,
> +	.teardown = NULL,
> +	.unit_test_cases = {
> +		TEST_CASE(test_pmu_read),
> +		TEST_CASES_END()
> +	}
> +};
> +
> +static int
> +test_pmu(void)
> +{
> +	return unit_test_suite_runner(&pmu_tests);
> +}
> +
> +REGISTER_TEST_COMMAND(pmu_autotest, test_pmu);
> diff --git a/doc/guides/prog_guide/profile_app.rst b/doc/guides/prog_guide/profile_app.rst
> index bd6700ef85..8fc1b20cab 100644
> --- a/doc/guides/prog_guide/profile_app.rst
> +++ b/doc/guides/prog_guide/profile_app.rst
> @@ -7,6 +7,14 @@ Profile Your Application
>   The following sections describe methods of profiling DPDK applications on
>   different architectures.
>   
> +Performance counter based profiling
> +-----------------------------------
> +
> +Majority of architectures support some sort hardware measurement unit which provides a set of
> +programmable counters that monitor specific events. There are different tools which can gather
> +that information, perf being an example here. Though in some scenarios, eg. when CPU cores are
> +isolated (nohz_full) and run dedicated tasks, using perf is less than ideal. In such cases one can
> +read specific events directly from application via ``rte_pmu_read()``.
>   
>   Profiling on x86
>   ----------------
> diff --git a/lib/eal/common/meson.build b/lib/eal/common/meson.build
> index 917758cc65..d6d05b56f3 100644
> --- a/lib/eal/common/meson.build
> +++ b/lib/eal/common/meson.build
> @@ -38,6 +38,9 @@ sources += files(
>           'rte_service.c',
>           'rte_version.c',
>   )
> +if is_linux
> +    sources += files('rte_pmu.c')
> +endif
>   if is_linux or is_windows
>       sources += files('eal_common_dynmem.c')
>   endif
> diff --git a/lib/eal/common/pmu_private.h b/lib/eal/common/pmu_private.h
> new file mode 100644
> index 0000000000..cade4245e6
> --- /dev/null
> +++ b/lib/eal/common/pmu_private.h
> @@ -0,0 +1,41 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Marvell
> + */
> +
> +#ifndef _PMU_PRIVATE_H_
> +#define _PMU_PRIVATE_H_
> +
> +/**
> + * Architecture specific PMU init callback.
> + *
> + * @return
> + *   0 in case of success, negative value otherwise.
> + */
> +int
> +pmu_arch_init(void);
> +
> +/**
> + * Architecture specific PMU cleanup callback.
> + */
> +void
> +pmu_arch_fini(void);
> +
> +/**
> + * Apply architecture specific settings to config before passing it to syscall.
> + */
> +void
> +pmu_arch_fixup_config(uint64_t config[3]);
> +
> +/**
> + * Initialize PMU tracing internals.
> + */
> +void
> +eal_pmu_init(void);
> +
> +/**
> + * Cleanup PMU internals.
> + */
> +void
> +eal_pmu_fini(void);
> +
> +#endif /* _PMU_PRIVATE_H_ */
> diff --git a/lib/eal/common/rte_pmu.c b/lib/eal/common/rte_pmu.c
> new file mode 100644
> index 0000000000..7d3bd57d1d
> --- /dev/null
> +++ b/lib/eal/common/rte_pmu.c
> @@ -0,0 +1,455 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2022 Marvell International Ltd.
> + */
> +
> +#include <ctype.h>
> +#include <dirent.h>
> +#include <errno.h>
> +#include <regex.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <sys/queue.h>
> +#include <sys/syscall.h>
> +#include <unistd.h>
> +
> +#include <rte_eal_paging.h>
> +#include <rte_malloc.h>
> +#include <rte_pmu.h>
> +#include <rte_tailq.h>
> +
> +#include "pmu_private.h"
> +
> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
> +
> +#ifndef GENMASK_ULL
> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> ((64 - 1 - (h)))))
> +#endif
> +
> +#ifndef FIELD_PREP
> +#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) & (m))
> +#endif
> +
> +struct rte_pmu *pmu;
> +
> +/*
> + * Following __rte_weak functions provide default no-op. Architectures should override them if
> + * necessary.
> + */
> +
> +int
> +__rte_weak pmu_arch_init(void)
> +{
> +	return 0;
> +}
> +
> +void
> +__rte_weak pmu_arch_fini(void)
> +{
> +}
> +
> +void
> +__rte_weak pmu_arch_fixup_config(uint64_t config[3])
> +{
> +	RTE_SET_USED(config);
> +}
> +
> +static int
> +get_term_format(const char *name, int *num, uint64_t *mask)
> +{
> +	char *config = NULL;
> +	char path[PATH_MAX];
> +	int high, low, ret;
> +	FILE *fp;
> +
> +	/* quiesce -Wmaybe-uninitialized warning */
> +	*num = 0;
> +	*mask = 0;
> +
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", pmu->name, name);
> +	fp = fopen(path, "r");
> +	if (!fp)
> +		return -errno;
> +
> +	errno = 0;
> +	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
> +	if (ret < 2) {
> +		ret = -ENODATA;
> +		goto out;
> +	}
> +	if (errno) {
> +		ret = -errno;
> +		goto out;
> +	}
> +
> +	if (ret == 2)
> +		high = low;
> +
> +	*mask = GENMASK_ULL(high, low);
> +	/* Last digit should be [012]. If last digit is missing 0 is implied. */
> +	*num = config[strlen(config) - 1];
> +	*num = isdigit(*num) ? *num - '0' : 0;
> +
> +	ret = 0;
> +out:
> +	free(config);
> +	fclose(fp);
> +
> +	return ret;
> +}
> +
> +static int
> +parse_event(char *buf, uint64_t config[3])
> +{
> +	char *token, *term;
> +	int num, ret, val;
> +	uint64_t mask;
> +
> +	config[0] = config[1] = config[2] = 0;
> +
> +	token = strtok(buf, ",");
> +	while (token) {
> +		errno = 0;
> +		/* <term>=<value> */
> +		ret = sscanf(token, "%m[^=]=%i", &term, &val);
> +		if (ret < 1)
> +			return -ENODATA;
> +		if (errno)
> +			return -errno;
> +		if (ret == 1)
> +			val = 1;
> +
> +		ret = get_term_format(term, &num, &mask);
> +		free(term);
> +		if (ret)
> +			return ret;
> +
> +		config[num] |= FIELD_PREP(mask, val);
> +		token = strtok(NULL, ",");
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +get_event_config(const char *name, uint64_t config[3])
> +{
> +	char path[PATH_MAX], buf[BUFSIZ];
> +	FILE *fp;
> +	int ret;
> +
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", pmu->name, name);
> +	fp = fopen(path, "r");
> +	if (!fp)
> +		return -errno;
> +
> +	ret = fread(buf, 1, sizeof(buf), fp);
> +	if (ret == 0) {
> +		fclose(fp);
> +
> +		return -EINVAL;
> +	}
> +	fclose(fp);
> +	buf[ret] = '\0';
> +
> +	return parse_event(buf, config);
> +}
> +
> +static int
> +do_perf_event_open(uint64_t config[3], int lcore_id, int group_fd)
> +{
> +	struct perf_event_attr attr = {
> +		.size = sizeof(struct perf_event_attr),
> +		.type = PERF_TYPE_RAW,
> +		.exclude_kernel = 1,
> +		.exclude_hv = 1,
> +		.disabled = 1,
> +	};
> +
> +	pmu_arch_fixup_config(config);
> +
> +	attr.config = config[0];
> +	attr.config1 = config[1];
> +	attr.config2 = config[2];
> +
> +	return syscall(SYS_perf_event_open, &attr, rte_gettid(), rte_lcore_to_cpu_id(lcore_id),
> +		       group_fd, 0);
> +}
> +
> +static int
> +open_events(int lcore_id)
> +{
> +	struct rte_pmu_event_group *group = &pmu->group[lcore_id];
> +	struct rte_pmu_event *event;
> +	uint64_t config[3];
> +	int num = 0, ret;
> +
> +	/* group leader gets created first, with fd = -1 */
> +	group->fds[0] = -1;
> +
> +	TAILQ_FOREACH(event, &pmu->event_list, next) {
> +		ret = get_event_config(event->name, config);
> +		if (ret) {
> +			RTE_LOG(ERR, EAL, "failed to get %s event config\n", event->name);
> +			continue;
> +		}
> +
> +		ret = do_perf_event_open(config, lcore_id, group->fds[0]);
> +		if (ret == -1) {
> +			if (errno == EOPNOTSUPP)
> +				RTE_LOG(ERR, EAL, "64 bit counters not supported\n");
> +
> +			ret = -errno;
> +			goto out;
> +		}
> +
> +		group->fds[event->index] = ret;
> +		num++;
> +	}
> +
> +	return 0;
> +out:
> +	for (--num; num >= 0; num--) {
> +		close(group->fds[num]);
> +		group->fds[num] = -1;
> +	}
> +
> +
> +	return ret;
> +}
> +
> +static int
> +mmap_events(int lcore_id)
> +{
> +	struct rte_pmu_event_group *group = &pmu->group[lcore_id];
> +	void *addr;
> +	int ret, i;
> +
> +	for (i = 0; i < pmu->num_group_events; i++) {
> +		addr = mmap(0, rte_mem_page_size(), PROT_READ, MAP_SHARED, group->fds[i], 0);
> +		if (addr == MAP_FAILED) {
> +			ret = -errno;
> +			goto out;
> +		}
> +
> +		group->mmap_pages[i] = addr;
> +	}
> +
> +	return 0;
> +out:
> +	for (; i; i--) {
> +		munmap(group->mmap_pages[i - 1], rte_mem_page_size());
> +		group->mmap_pages[i - 1] = NULL;
> +	}
> +
> +	return ret;
> +}
> +
> +static void
> +cleanup_events(int lcore_id)
> +{
> +	struct rte_pmu_event_group *group = &pmu->group[lcore_id];
> +	int i;
> +
> +	if (!group->fds)
> +		return;
> +
> +	if (group->fds[0] != -1)
> +		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
> +
> +	for (i = 0; i < pmu->num_group_events; i++) {
> +		if (group->mmap_pages[i]) {
> +			munmap(group->mmap_pages[i], rte_mem_page_size());
> +			group->mmap_pages[i] = NULL;
> +		}
> +
> +		if (group->fds[i] != -1) {
> +			close(group->fds[i]);
> +			group->fds[i] = -1;
> +		}
> +	}
> +
> +	rte_free(group->mmap_pages);
> +	rte_free(group->fds);
> +
> +	group->mmap_pages = NULL;
> +	group->fds = NULL;
> +	group->enabled = false;
> +}
> +
> +int __rte_noinline
> +rte_pmu_enable_group(int lcore_id)
> +{
> +	struct rte_pmu_event_group *group = &pmu->group[lcore_id];
> +	int ret;
> +
> +	if (pmu->num_group_events == 0) {
> +		RTE_LOG(DEBUG, EAL, "no matching PMU events\n");
> +
> +		return 0;
> +	}
> +
> +	group->fds = rte_zmalloc(NULL, pmu->num_group_events, sizeof(*group->fds));
> +	if (!group->fds) {
> +		RTE_LOG(ERR, EAL, "failed to alloc descriptor memory\n");
> +
> +		return -ENOMEM;
> +	}
> +
> +	group->mmap_pages = rte_zmalloc(NULL, pmu->num_group_events, sizeof(*group->mmap_pages));
> +	if (!group->mmap_pages) {
> +		RTE_LOG(ERR, EAL, "failed to alloc userpage memory\n");
> +
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	ret = open_events(lcore_id);
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "failed to open events on lcore-worker-%d\n", lcore_id);
> +		goto out;
> +	}
> +
> +	ret = mmap_events(lcore_id);
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "failed to map events on lcore-worker-%d\n", lcore_id);
> +		goto out;
> +	}
> +
> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
> +		RTE_LOG(ERR, EAL, "failed to enable events on lcore-worker-%d\n", lcore_id);
> +
> +		ret = -errno;
> +		goto out;
> +	}
> +
> +	return 0;
> +
> +out:
> +	cleanup_events(lcore_id);
> +
> +	return ret;
> +}
> +
> +static int
> +scan_pmus(void)
> +{
> +	char path[PATH_MAX];
> +	struct dirent *dent;
> +	const char *name;
> +	DIR *dirp;
> +
> +	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
> +	if (!dirp)
> +		return -errno;
> +
> +	while ((dent = readdir(dirp))) {
> +		name = dent->d_name;
> +		if (name[0] == '.')
> +			continue;
> +
> +		/* sysfs entry should either contain cpus or be a cpu */
> +		if (!strcmp(name, "cpu"))
> +			break;
> +
> +		snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
> +		if (access(path, F_OK) == 0)
> +			break;
> +	}
> +
> +	closedir(dirp);
> +
> +	if (dent) {
> +		pmu->name = strdup(name);
> +		if (!pmu->name)
> +			return -ENOMEM;
> +	}
> +
> +	return pmu->name ? 0 : -ENODEV;
> +}
> +
> +int
> +rte_pmu_add_event(const char *name)
> +{
> +	struct rte_pmu_event *event;
> +	char path[PATH_MAX];
> +
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", pmu->name, name);
> +	if (access(path, R_OK))
> +		return -ENODEV;
> +
> +	TAILQ_FOREACH(event, &pmu->event_list, next) {
> +		if (!strcmp(event->name, name))
> +			return event->index;
> +		continue;
> +	}
> +
> +	event = rte_zmalloc(NULL, 1, sizeof(*event));
> +	if (!event)
> +		return -ENOMEM;
> +
> +	event->name = strdup(name);
> +	if (!event->name) {
> +		rte_free(event);
> +
> +		return -ENOMEM;
> +	}
> +
> +	event->index = pmu->num_group_events++;
> +	TAILQ_INSERT_TAIL(&pmu->event_list, event, next);
> +
> +	RTE_LOG(DEBUG, EAL, "%s even added at index %d\n", name, event->index);
> +
> +	return event->index;
> +}
> +
> +void
> +eal_pmu_init(void)
> +{
> +	int ret;
> +
> +	pmu = rte_calloc(NULL, 1, sizeof(*pmu), RTE_CACHE_LINE_SIZE);
> +	if (!pmu) {
> +		RTE_LOG(ERR, EAL, "failed to alloc PMU\n");
> +
> +		return;
> +	}
> +
> +	TAILQ_INIT(&pmu->event_list);
> +
> +	ret = scan_pmus();
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "failed to find core pmu\n");
> +		goto out;
> +	}
> +
> +	ret = pmu_arch_init();
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "failed to setup arch for PMU\n");
> +		goto out;
> +	}
> +
> +	return;
> +out:
> +	free(pmu->name);
> +	rte_free(pmu);
> +}
> +
> +void
> +eal_pmu_fini(void)
> +{
> +	struct rte_pmu_event *event, *tmp;
> +	int lcore_id;
> +
> +	RTE_TAILQ_FOREACH_SAFE(event, &pmu->event_list, next, tmp) {
> +		TAILQ_REMOVE(&pmu->event_list, event, next);
> +		free(event->name);
> +		rte_free(event);
> +	}
> +
> +	RTE_LCORE_FOREACH_WORKER(lcore_id)
> +		cleanup_events(lcore_id);
> +
> +	pmu_arch_fini();
> +	free(pmu->name);
> +	rte_free(pmu);
> +}
> diff --git a/lib/eal/include/meson.build b/lib/eal/include/meson.build
> index cfcd40aaed..3bf830adee 100644
> --- a/lib/eal/include/meson.build
> +++ b/lib/eal/include/meson.build
> @@ -36,6 +36,7 @@ headers += files(
>           'rte_pci_dev_features.h',
>           'rte_per_lcore.h',
>           'rte_pflock.h',
> +        'rte_pmu.h',
>           'rte_random.h',
>           'rte_reciprocal.h',
>           'rte_seqcount.h',
> diff --git a/lib/eal/include/rte_pmu.h b/lib/eal/include/rte_pmu.h
> new file mode 100644
> index 0000000000..5955c22779
> --- /dev/null
> +++ b/lib/eal/include/rte_pmu.h
> @@ -0,0 +1,204 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Marvell
> + */
> +
> +#ifndef _RTE_PMU_H_
> +#define _RTE_PMU_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <rte_common.h>
> +#include <rte_compat.h>
> +
> +#ifdef RTE_EXEC_ENV_LINUX
> +
> +#include <linux/perf_event.h>
> +
> +#include <rte_atomic.h>
> +#include <rte_branch_prediction.h>
> +#include <rte_lcore.h>
> +#include <rte_log.h>
> +
> +/**
> + * @file
> + *
> + * PMU event tracing operations
> + *
> + * This file defines generic API and types necessary to setup PMU and
> + * read selected counters in runtime.
> + */
> +
> +/**
> + * A structure describing a group of events.
> + */
> +struct rte_pmu_event_group {
> +	int *fds; /**< array of event descriptors */
> +	void **mmap_pages; /**< array of pointers to mmapped perf_event_attr structures */
> +	bool enabled; /**< true if group was enabled on particular lcore */
> +};
> +
> +/**
> + * A structure describing an event.
> + */
> +struct rte_pmu_event {
> +	char *name; /** name of an event */
> +	int index; /** event index into fds/mmap_pages */
> +	TAILQ_ENTRY(rte_pmu_event) next; /** list entry */
> +};
> +
> +/**
> + * A PMU state container.
> + */
> +struct rte_pmu {
> +	char *name; /** name of core PMU listed under /sys/bus/event_source/devices */
> +	struct rte_pmu_event_group group[RTE_MAX_LCORE]; /**< per lcore event group data */
> +	int num_group_events; /**< number of events in a group */
> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
> +};
> +
> +/** Pointer to the PMU state container */
> +extern struct rte_pmu *pmu;
> +
> +/** Each architecture supporting PMU needs to provide its own version */
> +#ifndef rte_pmu_pmc_read
> +#define rte_pmu_pmc_read(index) ({ 0; })
> +#endif
> +
> +/**
> + * @internal
> + *
> + * Read PMU counter.
> + *
> + * @param pc
> + *   Pointer to the mmapped user page.
> + * @return
> + *   Counter value read from hardware.
> + */
> +__rte_internal
> +static __rte_always_inline uint64_t
> +rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
> +{
> +	uint64_t offset, width, pmc = 0;
> +	uint32_t seq, index;
> +	int tries = 100;
> +
> +	for (;;) {
> +		seq = pc->lock;
> +		rte_compiler_barrier();

I'm guessing this should be a load-acquire instead. Less heavy-handed 
than a compiler barrier on TSO CPUs, and works on weakly ordered systems 
as well (unlike the compiler barrier).

This looks like an open-coded sequence lock, so take a look in 
rte_seqcount.h for inspiration.

> +		index = pc->index;
> +		offset = pc->offset;
> +		width = pc->pmc_width;
> +
> +		if (likely(pc->cap_user_rdpmc && index)) {
> +			pmc = rte_pmu_pmc_read(index - 1);
> +			pmc <<= 64 - width;
> +			pmc >>= 64 - width;
> +		}
> +
> +		rte_compiler_barrier();
> +
> +		if (likely(pc->lock == seq))
> +			return pmc + offset;
> +
> +		if (--tries == 0) {
> +			RTE_LOG(DEBUG, EAL, "failed to get perf_event_mmap_page lock\n");
> +			break;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * @internal
> + *
> + * Enable group of events for a given lcore.
> + *
> + * @param lcore_id
> + *   The identifier of the lcore.
> + * @return
> + *   0 in case of success, negative value otherwise.
> + */
> +__rte_internal
> +int
> +rte_pmu_enable_group(int lcore_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Add event to the group of enabled events.
> + *
> + * @param name
> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
> + * @return
> + *   Event index in case of success, negative value otherwise.
> + */
> +__rte_experimental
> +int
> +rte_pmu_add_event(const char *name);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Read hardware counter configured to count occurrences of an event.
> + *
> + * @param index
> + *   Index of an event to be read.
> + * @return
> + *   Event value read from register. In case of errors or lack of support
> + *   0 is returned. In other words, stream of zeros in a trace file
> + *   indicates problem with reading particular PMU event register.
> + */
> +__rte_experimental
> +static __rte_always_inline uint64_t
> +rte_pmu_read(int index)
> +{
> +	int lcore_id = rte_lcore_id();
> +	struct rte_pmu_event_group *group;
> +	int ret;
> +
> +	if (!pmu)
> +		return 0;
> +
> +	group = &pmu->group[lcore_id];
> +	if (!group->enabled) {
> +		ret = rte_pmu_enable_group(lcore_id);
> +		if (ret)
> +			return 0;
> +
> +		group->enabled = true;
> +	}
> +
> +	if (index < 0 || index >= pmu->num_group_events)
> +		return 0;
> +
> +	return rte_pmu_read_userpage(group->mmap_pages[index]);
> +}
> +
> +#else /* !RTE_EXEC_ENV_LINUX */
> +
> +__rte_experimental
> +static int __rte_unused
> +rte_pmu_add_event(__rte_unused const char *name)
> +{
> +	return -1;
> +}
> +
> +__rte_experimental
> +static __rte_always_inline uint64_t
> +rte_pmu_read(__rte_unused int index)
> +{
> +	return 0;
> +}
> +
> +#endif /* RTE_EXEC_ENV_LINUX */
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_PMU_H_ */
> diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c
> index 8c118d0d9f..751a13b597 100644
> --- a/lib/eal/linux/eal.c
> +++ b/lib/eal/linux/eal.c
> @@ -53,6 +53,7 @@
>   #include "eal_options.h"
>   #include "eal_vfio.h"
>   #include "hotplug_mp.h"
> +#include "pmu_private.h"
>   
>   #define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL)
>   
> @@ -1206,6 +1207,8 @@ rte_eal_init(int argc, char **argv)
>   		return -1;
>   	}
>   
> +	eal_pmu_init();
> +
>   	if (rte_eal_tailqs_init() < 0) {
>   		rte_eal_init_alert("Cannot init tail queues for objects");
>   		rte_errno = EFAULT;
> @@ -1372,6 +1375,7 @@ rte_eal_cleanup(void)
>   	eal_bus_cleanup();
>   	rte_trace_save();
>   	eal_trace_fini();
> +	eal_pmu_fini();
>   	/* after this point, any DPDK pointers will become dangling */
>   	rte_eal_memory_detach();
>   	eal_mp_dev_hotplug_cleanup();
> diff --git a/lib/eal/version.map b/lib/eal/version.map
> index 7ad12a7dc9..e870c87493 100644
> --- a/lib/eal/version.map
> +++ b/lib/eal/version.map
> @@ -432,6 +432,8 @@ EXPERIMENTAL {
>   	rte_thread_set_priority;
>   
>   	# added in 22.11
> +	rte_pmu_add_event; # WINDOWS_NO_EXPORT
> +	rte_pmu_read; # WINDOWS_NO_EXPORT
>   	rte_thread_attr_get_affinity;
>   	rte_thread_attr_init;
>   	rte_thread_attr_set_affinity;
> @@ -483,4 +485,5 @@ INTERNAL {
>   	rte_mem_map;
>   	rte_mem_page_size;
>   	rte_mem_unmap;
> +	rte_pmu_enable_group;
>   };
  

Patch

diff --git a/app/test/meson.build b/app/test/meson.build
index f34d19e3c3..93b3300309 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -143,6 +143,7 @@  test_sources = files(
         'test_timer_racecond.c',
         'test_timer_secondary.c',
         'test_ticketlock.c',
+        'test_pmu.c',
         'test_trace.c',
         'test_trace_register.c',
         'test_trace_perf.c',
diff --git a/app/test/test_pmu.c b/app/test/test_pmu.c
new file mode 100644
index 0000000000..fd331af9ee
--- /dev/null
+++ b/app/test/test_pmu.c
@@ -0,0 +1,41 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell International Ltd.
+ */
+
+#include <rte_pmu.h>
+
+#include "test.h"
+
+static int
+test_pmu_read(void)
+{
+	uint64_t val = 0;
+	int tries = 10;
+	int event = -1;
+
+	while (tries--)
+		val += rte_pmu_read(event);
+
+	if (val == 0)
+		return TEST_FAILED;
+
+	return TEST_SUCCESS;
+}
+
+/* Test suite descriptor: a single read test, no suite-wide setup or teardown. */
+static struct unit_test_suite pmu_tests = {
+	.suite_name = "pmu autotest",
+	.setup = NULL,
+	.teardown = NULL,
+	.unit_test_cases = {
+		TEST_CASE(test_pmu_read),
+		TEST_CASES_END()
+	}
+};
+
+/* Entry point of the pmu_autotest command: runs the pmu_tests suite and returns its result. */
+static int
+test_pmu(void)
+{
+	return unit_test_suite_runner(&pmu_tests);
+}
+
+REGISTER_TEST_COMMAND(pmu_autotest, test_pmu);
diff --git a/doc/guides/prog_guide/profile_app.rst b/doc/guides/prog_guide/profile_app.rst
index bd6700ef85..8fc1b20cab 100644
--- a/doc/guides/prog_guide/profile_app.rst
+++ b/doc/guides/prog_guide/profile_app.rst
@@ -7,6 +7,14 @@  Profile Your Application
 The following sections describe methods of profiling DPDK applications on
 different architectures.
 
+Performance counter based profiling
+-----------------------------------
+
+The majority of architectures support some sort of hardware measurement unit which provides a set
+of programmable counters that monitor specific events. There are different tools which can gather
+that information, perf being an example here. However, in some scenarios, e.g. when CPU cores are
+isolated (nohz_full) and run dedicated tasks, using perf is less than ideal. In such cases one can
+read specific events directly from the application via ``rte_pmu_read()``.
 
 Profiling on x86
 ----------------
diff --git a/lib/eal/common/meson.build b/lib/eal/common/meson.build
index 917758cc65..d6d05b56f3 100644
--- a/lib/eal/common/meson.build
+++ b/lib/eal/common/meson.build
@@ -38,6 +38,9 @@  sources += files(
         'rte_service.c',
         'rte_version.c',
 )
+if is_linux
+    sources += files('rte_pmu.c')
+endif
 if is_linux or is_windows
     sources += files('eal_common_dynmem.c')
 endif
diff --git a/lib/eal/common/pmu_private.h b/lib/eal/common/pmu_private.h
new file mode 100644
index 0000000000..cade4245e6
--- /dev/null
+++ b/lib/eal/common/pmu_private.h
@@ -0,0 +1,41 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Marvell
+ */
+
+#ifndef _PMU_PRIVATE_H_
+#define _PMU_PRIVATE_H_
+
+/* uint64_t is used in the prototypes below; keep this header self-contained */
+#include <stdint.h>
+
+/**
+ * Architecture specific PMU init callback.
+ *
+ * @return
+ *   0 in case of success, negative value otherwise.
+ */
+int
+pmu_arch_init(void);
+
+/**
+ * Architecture specific PMU cleanup callback.
+ */
+void
+pmu_arch_fini(void);
+
+/**
+ * Apply architecture specific settings to config before passing it to syscall.
+ *
+ * @param config
+ *   Three config words, later copied to the config, config1 and config2 fields of
+ *   struct perf_event_attr. May be modified in place.
+ */
+void
+pmu_arch_fixup_config(uint64_t config[3]);
+
+/**
+ * Initialize PMU tracing internals.
+ *
+ * Failures are only logged; no status is returned to the caller.
+ */
+void
+eal_pmu_init(void);
+
+/**
+ * Cleanup PMU internals.
+ */
+void
+eal_pmu_fini(void);
+
+#endif /* _PMU_PRIVATE_H_ */
diff --git a/lib/eal/common/rte_pmu.c b/lib/eal/common/rte_pmu.c
new file mode 100644
index 0000000000..7d3bd57d1d
--- /dev/null
+++ b/lib/eal/common/rte_pmu.c
@@ -0,0 +1,455 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell International Ltd.
+ */
+
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <limits.h>
+#include <regex.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <rte_eal_paging.h>
+#include <rte_malloc.h>
+#include <rte_pmu.h>
+#include <rte_tailq.h>
+
+#include "pmu_private.h"
+
+/* sysfs directory that is scanned for PMU devices and their format/ and events/ entries */
+#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
+
+/*
+ * 64-bit mask with bits l..h (inclusive) set. Both h and l are used as shift counts, so they
+ * must lie within 0..63 and l must not exceed h.
+ */
+#ifndef GENMASK_ULL
+#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> ((64 - 1 - (h)))))
+#endif
+
+/*
+ * Move value v to the position of the lowest set bit of mask m and clip it to m.
+ * m must be non-zero, otherwise the shift count becomes -1.
+ */
+#ifndef FIELD_PREP
+#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) & (m))
+#endif
+
+/* Global PMU state; every function in this file dereferences it. */
+struct rte_pmu *pmu;
+
+/*
+ * The following __rte_weak functions provide default no-op implementations. Architectures
+ * should override them if necessary.
+ */
+
+/* Default arch init hook: nothing to set up, always reports success. */
+int
+__rte_weak pmu_arch_init(void)
+{
+	return 0;
+}
+
+/* Default arch cleanup hook: nothing to undo. */
+void
+__rte_weak pmu_arch_fini(void)
+{
+}
+
+/* Default config fixup hook: leaves all three config words untouched. */
+void
+__rte_weak pmu_arch_fixup_config(uint64_t config[3])
+{
+	RTE_SET_USED(config);
+}
+
+/*
+ * Find out how a single event term is encoded by parsing the PMU's sysfs format/<name> file.
+ *
+ * The file is parsed as "<config>:<low>[-<high>]", where the trailing digit of <config>
+ * selects the config word and <low>..<high> is the bit range of the term.
+ *
+ * @param name
+ *   Term name, i.e. name of a file under <pmu>/format/.
+ * @param num
+ *   Set to the index (0, 1 or 2) of the config word holding the term.
+ * @param mask
+ *   Set to the mask covering bits <low>..<high> of that word.
+ * @return
+ *   0 in case of success, negative errno value otherwise.
+ */
+static int
+get_term_format(const char *name, int *num, uint64_t *mask)
+{
+	char *config = NULL;
+	char path[PATH_MAX];
+	int high, low, ret;
+	unsigned char last;
+	int idx;
+	FILE *fp;
+
+	/* quiesce -Wmaybe-uninitialized warning */
+	*num = 0;
+	*mask = 0;
+
+	ret = snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s",
+		       pmu->name, name);
+	if (ret < 0 || (size_t)ret >= sizeof(path))
+		return -ENAMETOOLONG;
+
+	fp = fopen(path, "r");
+	if (!fp)
+		return -errno;
+
+	errno = 0;
+	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
+	if (ret < 2) {
+		ret = -ENODATA;
+		goto out;
+	}
+	if (errno) {
+		ret = -errno;
+		goto out;
+	}
+
+	/* single bit terms are listed without the "-<high>" part */
+	if (ret == 2)
+		high = low;
+
+	/* both values end up as shift counts in GENMASK_ULL(), keep them within a 64-bit word */
+	if (low < 0 || high > 63 || low > high) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Last character should be [012]. If the digit is missing 0 is implied. */
+	last = (unsigned char)config[strlen(config) - 1];
+	idx = isdigit(last) ? last - '0' : 0;
+	if (idx > 2) {
+		/* callers index a three element array with this value */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	*mask = GENMASK_ULL(high, low);
+	*num = idx;
+
+	ret = 0;
+out:
+	free(config);
+	fclose(fp);
+
+	return ret;
+}
+
+/*
+ * Translate an event description, as read from <pmu>/events/<event>, into config words.
+ *
+ * The description is a comma separated list of "<term>=<value>" items; a term without a
+ * value is treated as <term>=1. Each term is placed according to get_term_format().
+ *
+ * @param buf
+ *   NUL terminated description. It is modified while being tokenized.
+ * @param config
+ *   Output, zeroed first and then filled with the encoded terms.
+ * @return
+ *   0 in case of success, negative errno value otherwise.
+ */
+static int
+parse_event(char *buf, uint64_t config[3])
+{
+	char *token, *term, *saveptr = NULL;
+	int num, ret, val;
+	uint64_t mask;
+
+	config[0] = config[1] = config[2] = 0;
+
+	/*
+	 * Newline is a delimiter too, so that a trailing line break does not become part of the
+	 * name of a value-less last term.
+	 */
+	for (token = strtok_r(buf, ",\n", &saveptr); token != NULL;
+	     token = strtok_r(NULL, ",\n", &saveptr)) {
+		term = NULL;
+		errno = 0;
+		/* <term>=<value> */
+		ret = sscanf(token, "%m[^=]=%i", &term, &val);
+		if (ret < 1) {
+			free(term);
+			return -ENODATA;
+		}
+		if (errno) {
+			/* pick up errno before free() gets a chance to touch it */
+			ret = -errno;
+			free(term);
+			return ret;
+		}
+		if (ret == 1)
+			val = 1;
+
+		ret = get_term_format(term, &num, &mask);
+		free(term);
+		if (ret)
+			return ret;
+
+		/* config[] has three words and FIELD_PREP() needs a non-empty mask */
+		if (num < 0 || num > 2 || mask == 0)
+			return -EINVAL;
+
+		config[num] |= FIELD_PREP(mask, val);
+	}
+
+	return 0;
+}
+
+/*
+ * Read the description of an event from <pmu>/events/<name> and convert it to config words.
+ *
+ * @param name
+ *   Event name, i.e. name of a file under <pmu>/events/.
+ * @param config
+ *   Output, filled by parse_event().
+ * @return
+ *   0 in case of success, negative errno value otherwise.
+ */
+static int
+get_event_config(const char *name, uint64_t config[3])
+{
+	char path[PATH_MAX], buf[BUFSIZ];
+	size_t len;
+	FILE *fp;
+	int ret;
+
+	ret = snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s",
+		       pmu->name, name);
+	if (ret < 0 || (size_t)ret >= sizeof(path))
+		return -ENAMETOOLONG;
+
+	fp = fopen(path, "r");
+	if (!fp)
+		return -errno;
+
+	/* leave room for the terminator, a completely filled buffer must not overflow */
+	len = fread(buf, 1, sizeof(buf) - 1, fp);
+	fclose(fp);
+	if (len == 0)
+		return -EINVAL;
+
+	buf[len] = '\0';
+
+	return parse_event(buf, config);
+}
+
+/*
+ * Open a single raw, initially disabled, user-space only perf event for the calling thread.
+ *
+ * @param config
+ *   Config words of the event; run through pmu_arch_fixup_config() before use.
+ * @param lcore_id
+ *   Lcore whose CPU id is passed to the syscall.
+ * @param group_fd
+ *   Descriptor of the group leader, or -1 to create a new leader.
+ * @return
+ *   Result of the perf_event_open syscall.
+ */
+static int
+do_perf_event_open(uint64_t config[3], int lcore_id, int group_fd)
+{
+	struct perf_event_attr attr = {
+		.type = PERF_TYPE_RAW,
+		.size = sizeof(attr),
+		.disabled = 1,
+		.exclude_kernel = 1,
+		.exclude_hv = 1,
+	};
+	int tid, cpu;
+
+	/* arch code gets the last word on the encoding before it is handed to the kernel */
+	pmu_arch_fixup_config(config);
+	attr.config = config[0];
+	attr.config1 = config[1];
+	attr.config2 = config[2];
+
+	tid = rte_gettid();
+	cpu = rte_lcore_to_cpu_id(lcore_id);
+
+	return syscall(SYS_perf_event_open, &attr, tid, cpu, group_fd, 0);
+}
+
+/*
+ * Open a perf event for every entry of pmu->event_list on behalf of the given lcore.
+ *
+ * Descriptors are stored in group->fds[] at the index of the respective event. Whatever is
+ * found in group->fds[0] at the time of the call is used as group descriptor, i.e. -1 until
+ * the event with index 0 has been opened.
+ *
+ * @param lcore_id
+ *   Lcore the group belongs to.
+ * @return
+ *   0 in case of success, negative errno value otherwise. On failure all descriptors opened
+ *   so far are closed again and every slot is left at -1.
+ */
+static int
+open_events(int lcore_id)
+{
+	struct rte_pmu_event_group *group = &pmu->group[lcore_id];
+	struct rte_pmu_event *event;
+	uint64_t config[3];
+	int i, ret;
+
+	/*
+	 * Mark every slot as unused up front. Leaving slots at any other value would make later
+	 * stages map and close descriptors which were never opened here. This also makes the
+	 * first event the group leader, as it gets created with group fd = -1.
+	 */
+	for (i = 0; i < pmu->num_group_events; i++)
+		group->fds[i] = -1;
+
+	TAILQ_FOREACH(event, &pmu->event_list, next) {
+		ret = get_event_config(event->name, config);
+		if (ret) {
+			/* a group with holes cannot be mapped later on, so give up right away */
+			RTE_LOG(ERR, EAL, "failed to get %s event config\n", event->name);
+			goto out;
+		}
+
+		ret = do_perf_event_open(config, lcore_id, group->fds[0]);
+		if (ret == -1) {
+			/* save errno first, logging may overwrite it */
+			ret = -errno;
+			if (ret == -EOPNOTSUPP)
+				RTE_LOG(ERR, EAL, "64 bit counters not supported\n");
+
+			goto out;
+		}
+
+		group->fds[event->index] = ret;
+	}
+
+	return 0;
+out:
+	/* close by slot rather than by count, so only descriptors opened above are touched */
+	for (i = 0; i < pmu->num_group_events; i++) {
+		if (group->fds[i] == -1)
+			continue;
+
+		close(group->fds[i]);
+		group->fds[i] = -1;
+	}
+
+	return ret;
+}
+
+/*
+ * Map one read-only page for each of the group's event descriptors.
+ *
+ * @param lcore_id
+ *   Lcore the group belongs to.
+ * @return
+ *   0 in case of success, negative errno value of the failed mmap() otherwise. On failure
+ *   the pages mapped so far are unmapped again and their slots reset to NULL.
+ */
+static int
+mmap_events(int lcore_id)
+{
+	struct rte_pmu_event_group *group = &pmu->group[lcore_id];
+	int mapped = 0;
+	void *page;
+	int err;
+
+	while (mapped < pmu->num_group_events) {
+		page = mmap(0, rte_mem_page_size(), PROT_READ, MAP_SHARED, group->fds[mapped], 0);
+		if (page == MAP_FAILED) {
+			err = -errno;
+
+			/* roll back in reverse order of mapping */
+			while (mapped > 0) {
+				mapped--;
+				munmap(group->mmap_pages[mapped], rte_mem_page_size());
+				group->mmap_pages[mapped] = NULL;
+			}
+
+			return err;
+		}
+
+		group->mmap_pages[mapped] = page;
+		mapped++;
+	}
+
+	return 0;
+}
+
+/*
+ * Tear down the event group of the given lcore: disable the group through
+ * its leader, unmap user pages, close descriptors and free both arrays.
+ *
+ * Safe to call on a group that was never (or only partially) set up.
+ */
+static void
+cleanup_events(int lcore_id)
+{
+	struct rte_pmu_event_group *group = &pmu->group[lcore_id];
+	int i;
+
+	/* fds is allocated first, so without it there is nothing to release */
+	if (!group->fds)
+		return;
+
+	if (group->fds[0] != -1)
+		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
+
+	for (i = 0; i < pmu->num_group_events; i++) {
+		/* mmap_pages is NULL if its allocation failed after fds succeeded */
+		if (group->mmap_pages && group->mmap_pages[i]) {
+			munmap(group->mmap_pages[i], rte_mem_page_size());
+			group->mmap_pages[i] = NULL;
+		}
+
+		if (group->fds[i] != -1) {
+			close(group->fds[i]);
+			group->fds[i] = -1;
+		}
+	}
+
+	rte_free(group->mmap_pages);
+	rte_free(group->fds);
+
+	group->mmap_pages = NULL;
+	group->fds = NULL;
+	group->enabled = false;
+}
+
+/*
+ * Allocate the per-lcore descriptor and user page arrays, open and map all
+ * registered events and finally enable the whole group via its leader.
+ *
+ * Returns 0 on success (also when no events were registered) or a non-zero
+ * error, typically a negative errno. On failure everything set up so far is
+ * released again through cleanup_events().
+ */
+int __rte_noinline
+rte_pmu_enable_group(int lcore_id)
+{
+	struct rte_pmu_event_group *group = &pmu->group[lcore_id];
+	int ret, i;
+
+	if (pmu->num_group_events == 0) {
+		RTE_LOG(DEBUG, EAL, "no matching PMU events\n");
+
+		return 0;
+	}
+
+	/*
+	 * rte_calloc() takes (count, element size, alignment), whereas
+	 * rte_zmalloc() takes (size, alignment) and would allocate only
+	 * num_group_events bytes here.
+	 */
+	group->fds = rte_calloc(NULL, pmu->num_group_events, sizeof(*group->fds), 0);
+	if (!group->fds) {
+		RTE_LOG(ERR, EAL, "failed to alloc descriptor memory\n");
+
+		return -ENOMEM;
+	}
+
+	/* 0 is a valid descriptor, so flag all slots as unopened before cleanup can run */
+	for (i = 0; i < pmu->num_group_events; i++)
+		group->fds[i] = -1;
+
+	group->mmap_pages = rte_calloc(NULL, pmu->num_group_events, sizeof(*group->mmap_pages), 0);
+	if (!group->mmap_pages) {
+		RTE_LOG(ERR, EAL, "failed to alloc userpage memory\n");
+
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = open_events(lcore_id);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "failed to open events on lcore-worker-%d\n", lcore_id);
+		goto out;
+	}
+
+	ret = mmap_events(lcore_id);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "failed to map events on lcore-worker-%d\n", lcore_id);
+		goto out;
+	}
+
+	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
+		/* capture errno before logging, which may overwrite it */
+		ret = -errno;
+		RTE_LOG(ERR, EAL, "failed to enable events on lcore-worker-%d\n", lcore_id);
+
+		goto out;
+	}
+
+	return 0;
+
+out:
+	cleanup_events(lcore_id);
+
+	return ret;
+}
+
+/*
+ * Find the core PMU under EVENT_SOURCE_DEVICES_PATH and store a copy of its
+ * name in pmu->name.
+ *
+ * An entry qualifies if it is called "cpu" or if it contains a "cpus" file.
+ * Returns 0 on success, -ENODEV if no entry qualifies, -ENOMEM if the name
+ * could not be duplicated, or a negative errno if the directory cannot be
+ * opened.
+ */
+static int
+scan_pmus(void)
+{
+	char path[PATH_MAX];
+	struct dirent *dent;
+	const char *name;
+	int ret = -ENODEV;
+	DIR *dirp;
+
+	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
+	if (!dirp)
+		return -errno;
+
+	while ((dent = readdir(dirp))) {
+		name = dent->d_name;
+		if (name[0] == '.')
+			continue;
+
+		/* sysfs entry should either contain cpus or be a cpu */
+		if (strcmp(name, "cpu")) {
+			snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
+			if (access(path, F_OK) != 0)
+				continue;
+		}
+
+		/*
+		 * Duplicate the name while the directory stream is still open:
+		 * d_name points into storage that closedir() releases.
+		 */
+		pmu->name = strdup(name);
+		ret = pmu->name ? 0 : -ENOMEM;
+		break;
+	}
+
+	closedir(dirp);
+
+	return ret;
+}
+
+/*
+ * Register an event, identified by its sysfs name, for reading.
+ *
+ * The event must exist and be readable under
+ * EVENT_SOURCE_DEVICES_PATH/<pmu name>/events. Adding an already registered
+ * event is not an error and returns its existing index.
+ *
+ * Returns the event index (>= 0) on success, -EINVAL if name is NULL,
+ * -ENODEV if the PMU state is not available or the event is not accessible
+ * in sysfs, or -ENOMEM on allocation failure.
+ */
+int
+rte_pmu_add_event(const char *name)
+{
+	struct rte_pmu_event *event;
+	char path[PATH_MAX];
+
+	if (!name)
+		return -EINVAL;
+
+	/* PMU state is absent if eal_pmu_init() could not allocate it */
+	if (!pmu)
+		return -ENODEV;
+
+	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", pmu->name, name);
+	if (access(path, R_OK))
+		return -ENODEV;
+
+	TAILQ_FOREACH(event, &pmu->event_list, next) {
+		if (!strcmp(event->name, name))
+			return event->index;
+	}
+
+	/*
+	 * rte_calloc() takes (count, element size, alignment), whereas
+	 * rte_zmalloc(NULL, 1, sizeof(*event)) would allocate a single byte.
+	 */
+	event = rte_calloc(NULL, 1, sizeof(*event), 0);
+	if (!event)
+		return -ENOMEM;
+
+	event->name = strdup(name);
+	if (!event->name) {
+		rte_free(event);
+
+		return -ENOMEM;
+	}
+
+	/* indices are handed out sequentially and index the per-lcore fds/mmap_pages arrays */
+	event->index = pmu->num_group_events++;
+	TAILQ_INSERT_TAIL(&pmu->event_list, event, next);
+
+	RTE_LOG(DEBUG, EAL, "%s event added at index %d\n", name, event->index);
+
+	return event->index;
+}
+
+/*
+ * Allocate the global PMU state, locate the core PMU in sysfs and run the
+ * architecture specific setup. Called from rte_eal_init().
+ *
+ * Failures are only logged. In that case the global pmu pointer is left as
+ * NULL, which is what rte_pmu_read() checks before touching any state.
+ */
+void
+eal_pmu_init(void)
+{
+	int ret;
+
+	pmu = rte_calloc(NULL, 1, sizeof(*pmu), RTE_CACHE_LINE_SIZE);
+	if (!pmu) {
+		RTE_LOG(ERR, EAL, "failed to alloc PMU\n");
+
+		return;
+	}
+
+	TAILQ_INIT(&pmu->event_list);
+
+	ret = scan_pmus();
+	if (ret) {
+		RTE_LOG(ERR, EAL, "failed to find core pmu\n");
+		goto out;
+	}
+
+	ret = pmu_arch_init();
+	if (ret) {
+		RTE_LOG(ERR, EAL, "failed to setup arch for PMU\n");
+		goto out;
+	}
+
+	return;
+out:
+	free(pmu->name);
+	rte_free(pmu);
+	/* do not leave a dangling pointer behind; readers test it against NULL */
+	pmu = NULL;
+}
+
+/*
+ * Release everything set up by eal_pmu_init() and by later event
+ * registration or group enabling. Called from rte_eal_cleanup().
+ *
+ * Does nothing if the PMU state was never allocated.
+ */
+void
+eal_pmu_fini(void)
+{
+	struct rte_pmu_event *event, *tmp;
+	int lcore_id;
+
+	if (!pmu)
+		return;
+
+	RTE_TAILQ_FOREACH_SAFE(event, &pmu->event_list, next, tmp) {
+		TAILQ_REMOVE(&pmu->event_list, event, next);
+		free(event->name);
+		rte_free(event);
+	}
+
+	/*
+	 * Walk every group slot rather than worker lcores only: rte_pmu_read()
+	 * enables the group of whichever lcore calls it, the main one included.
+	 * cleanup_events() returns early for groups that were never set up.
+	 */
+	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
+		cleanup_events(lcore_id);
+
+	pmu_arch_fini();
+	free(pmu->name);
+	rte_free(pmu);
+	/* do not leave a dangling pointer behind; readers test it against NULL */
+	pmu = NULL;
+}
diff --git a/lib/eal/include/meson.build b/lib/eal/include/meson.build
index cfcd40aaed..3bf830adee 100644
--- a/lib/eal/include/meson.build
+++ b/lib/eal/include/meson.build
@@ -36,6 +36,7 @@  headers += files(
         'rte_pci_dev_features.h',
         'rte_per_lcore.h',
         'rte_pflock.h',
+        'rte_pmu.h',
         'rte_random.h',
         'rte_reciprocal.h',
         'rte_seqcount.h',
diff --git a/lib/eal/include/rte_pmu.h b/lib/eal/include/rte_pmu.h
new file mode 100644
index 0000000000..5955c22779
--- /dev/null
+++ b/lib/eal/include/rte_pmu.h
@@ -0,0 +1,204 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Marvell
+ */
+
+#ifndef _RTE_PMU_H_
+#define _RTE_PMU_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+#include <rte_compat.h>
+
+#ifdef RTE_EXEC_ENV_LINUX
+
+#include <linux/perf_event.h>
+
+#include <rte_atomic.h>
+#include <rte_branch_prediction.h>
+#include <rte_lcore.h>
+#include <rte_log.h>
+
+/**
+ * @file
+ *
+ * PMU event tracing operations
+ *
+ * This file defines generic API and types necessary to setup PMU and
+ * read selected counters in runtime.
+ */
+
+/**
+ * A structure describing a group of events.
+ *
+ * Both arrays are indexed by the index stored in struct rte_pmu_event.
+ */
+struct rte_pmu_event_group {
+	int *fds; /**< array of event descriptors */
+	void **mmap_pages; /**< array of pointers to mmapped perf_event_mmap_page structures */
+	bool enabled; /**< true if group was enabled on particular lcore */
+};
+
+/**
+ * A structure describing an event.
+ */
+struct rte_pmu_event {
+	char *name; /**< name of an event */
+	int index; /**< event index into fds/mmap_pages */
+	TAILQ_ENTRY(rte_pmu_event) next; /**< list entry */
+};
+
+/**
+ * A PMU state container.
+ */
+struct rte_pmu {
+	char *name; /**< name of core PMU listed under /sys/bus/event_source/devices */
+	struct rte_pmu_event_group group[RTE_MAX_LCORE]; /**< per lcore event group data */
+	int num_group_events; /**< number of events in a group */
+	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
+};
+
+/**
+ * Pointer to the PMU state container.
+ *
+ * Assigned by eal_pmu_init(); rte_pmu_read() returns 0 while it is NULL.
+ */
+extern struct rte_pmu *pmu;
+
+/**
+ * Each architecture supporting PMU needs to provide its own version.
+ * This fallback is used when none was defined and always evaluates to 0.
+ */
+#ifndef rte_pmu_pmc_read
+#define rte_pmu_pmc_read(index) ({ 0; })
+#endif
+
+/**
+ * @internal
+ *
+ * Read PMU counter.
+ *
+ * The user page is sampled between two reads of its lock field; the sample
+ * is accepted only if the lock did not change in between, otherwise the read
+ * is retried (up to 100 times).
+ *
+ * @param pc
+ *   Pointer to the mmapped user page.
+ * @return
+ *   Counter value read from hardware plus the offset published in the user
+ *   page, or 0 if no consistent sample could be taken.
+ */
+__rte_internal
+static __rte_always_inline uint64_t
+rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
+{
+	uint64_t offset, width, pmc;
+	uint32_t seq, index;
+	int tries = 100;
+
+	for (;;) {
+		/*
+		 * Start every attempt from scratch, otherwise a counter value
+		 * read during an invalidated attempt could be added to the
+		 * offset of a later one in which the counter is not readable.
+		 */
+		pmc = 0;
+
+		seq = pc->lock;
+		rte_compiler_barrier();
+		index = pc->index;
+		offset = pc->offset;
+		width = pc->pmc_width;
+
+		if (likely(pc->cap_user_rdpmc && index)) {
+			/* index is 1-based; 0 means the counter cannot be read directly */
+			pmc = rte_pmu_pmc_read(index - 1);
+			/* keep only the low 'width' bits of the raw counter */
+			pmc <<= 64 - width;
+			pmc >>= 64 - width;
+		}
+
+		rte_compiler_barrier();
+
+		if (likely(pc->lock == seq))
+			return pmc + offset;
+
+		if (--tries == 0) {
+			RTE_LOG(DEBUG, EAL, "failed to get perf_event_mmap_page lock\n");
+			break;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * @internal
+ *
+ * Enable group of events for a given lcore.
+ *
+ * Opens and maps all events added so far and enables them as one group.
+ * If no events were added, nothing is set up and 0 is returned.
+ *
+ * @param lcore_id
+ *   The identifier of the lcore.
+ * @return
+ *   0 in case of success, negative value otherwise.
+ */
+__rte_internal
+int
+rte_pmu_enable_group(int lcore_id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Add event to the group of enabled events.
+ *
+ * Adding an event that is already part of the group returns the index it
+ * was originally given.
+ *
+ * @param name
+ *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
+ * @return
+ *   Event index in case of success, negative value otherwise.
+ */
+__rte_experimental
+int
+rte_pmu_add_event(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Read hardware counter configured to count occurrences of an event.
+ *
+ * The event group of the calling lcore is enabled on first use. Threads
+ * without a valid lcore id cannot read counters and always get 0.
+ *
+ * @param index
+ *   Index of an event to be read.
+ * @return
+ *   Event value read from register. In case of errors or lack of support
+ *   0 is returned. In other words, stream of zeros in a trace file
+ *   indicates problem with reading particular PMU event register.
+ */
+__rte_experimental
+static __rte_always_inline uint64_t
+rte_pmu_read(int index)
+{
+	unsigned int lcore_id = rte_lcore_id();
+	struct rte_pmu_event_group *group;
+	int ret;
+
+	if (!pmu)
+		return 0;
+
+	/* threads without an lcore id (LCORE_ID_ANY) have no slot in pmu->group[] */
+	if (unlikely(lcore_id >= RTE_MAX_LCORE))
+		return 0;
+
+	group = &pmu->group[lcore_id];
+	if (!group->enabled) {
+		ret = rte_pmu_enable_group(lcore_id);
+		if (ret)
+			return 0;
+
+		group->enabled = true;
+	}
+
+	if (index < 0 || index >= pmu->num_group_events)
+		return 0;
+
+	return rte_pmu_read_userpage(group->mmap_pages[index]);
+}
+
+#else /* !RTE_EXEC_ENV_LINUX */
+
+__rte_experimental
+static int __rte_unused
+rte_pmu_add_event(__rte_unused const char *name)
+{
+	return -1; /* stub for non-Linux builds: no event can be added */
+}
+
+__rte_experimental
+static __rte_always_inline uint64_t
+rte_pmu_read(__rte_unused int index)
+{
+	return 0; /* stub for non-Linux builds: same value the Linux variant uses for lack of support */
+}
+
+#endif /* RTE_EXEC_ENV_LINUX */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PMU_H_ */
diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c
index 8c118d0d9f..751a13b597 100644
--- a/lib/eal/linux/eal.c
+++ b/lib/eal/linux/eal.c
@@ -53,6 +53,7 @@ 
 #include "eal_options.h"
 #include "eal_vfio.h"
 #include "hotplug_mp.h"
+#include "pmu_private.h"
 
 #define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL)
 
@@ -1206,6 +1207,8 @@  rte_eal_init(int argc, char **argv)
 		return -1;
 	}
 
+	eal_pmu_init();
+
 	if (rte_eal_tailqs_init() < 0) {
 		rte_eal_init_alert("Cannot init tail queues for objects");
 		rte_errno = EFAULT;
@@ -1372,6 +1375,7 @@  rte_eal_cleanup(void)
 	eal_bus_cleanup();
 	rte_trace_save();
 	eal_trace_fini();
+	eal_pmu_fini();
 	/* after this point, any DPDK pointers will become dangling */
 	rte_eal_memory_detach();
 	eal_mp_dev_hotplug_cleanup();
diff --git a/lib/eal/version.map b/lib/eal/version.map
index 7ad12a7dc9..e870c87493 100644
--- a/lib/eal/version.map
+++ b/lib/eal/version.map
@@ -432,6 +432,8 @@  EXPERIMENTAL {
 	rte_thread_set_priority;
 
 	# added in 22.11
+	rte_pmu_add_event; # WINDOWS_NO_EXPORT
+	rte_pmu_read; # WINDOWS_NO_EXPORT
 	rte_thread_attr_get_affinity;
 	rte_thread_attr_init;
 	rte_thread_attr_set_affinity;
@@ -483,4 +485,5 @@  INTERNAL {
 	rte_mem_map;
 	rte_mem_page_size;
 	rte_mem_unmap;
+	rte_pmu_enable_group;
 };