[v5,1/4] eal: add generic support for reading PMU events

Message ID 20230110234642.1188550-2-tduszynski@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series add support for self monitoring

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Tomasz Duszynski Jan. 10, 2023, 11:46 p.m. UTC
  Add support for programming PMU counters and reading their values
at runtime, bypassing the kernel completely.

This is especially useful in cases where CPU cores are isolated
(nohz_full), i.e. run dedicated tasks. In such cases one cannot use
the standard perf utility without sacrificing latency and performance.

Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
---
 app/test/meson.build                  |   1 +
 app/test/test_pmu.c                   |  41 +++
 doc/guides/prog_guide/profile_app.rst |   8 +
 lib/eal/common/meson.build            |   3 +
 lib/eal/common/pmu_private.h          |  41 +++
 lib/eal/common/rte_pmu.c              | 435 ++++++++++++++++++++++++++
 lib/eal/include/meson.build           |   1 +
 lib/eal/include/rte_pmu.h             | 199 ++++++++++++
 lib/eal/linux/eal.c                   |   4 +
 lib/eal/version.map                   |   6 +
 10 files changed, 739 insertions(+)
 create mode 100644 app/test/test_pmu.c
 create mode 100644 lib/eal/common/pmu_private.h
 create mode 100644 lib/eal/common/rte_pmu.c
 create mode 100644 lib/eal/include/rte_pmu.h
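
For context, a minimal usage sketch of the API added by this patch. The
event name "cpu_cycles" and the setup flow are illustrative assumptions;
actual event names depend on what the core PMU exposes under
/sys/bus/event_source/devices, and error handling is omitted.

#include <inttypes.h>
#include <stdio.h>

#include <rte_eal.h>
#include <rte_launch.h>
#include <rte_lcore.h>
#include <rte_pmu.h>

static int cycles_idx;

static int
worker(void *arg __rte_unused)
{
	uint64_t start, end;

	start = rte_pmu_read(cycles_idx);
	/* ... code being measured ... */
	end = rte_pmu_read(cycles_idx);

	printf("lcore %u: %" PRIu64 " events\n", rte_lcore_id(), end - start);

	return 0;
}

int
main(int argc, char **argv)
{
	rte_eal_init(argc, argv);

	/* register events up front; the group is lazily enabled per lcore on first read */
	cycles_idx = rte_pmu_add_event("cpu_cycles");

	rte_eal_mp_remote_launch(worker, NULL, SKIP_MAIN);
	rte_eal_mp_wait_lcore();

	return rte_eal_cleanup();
}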
  

Comments

Morten Brørup Jan. 11, 2023, 9:05 a.m. UTC | #1
> From: Tomasz Duszynski [mailto:tduszynski@marvell.com]
> Sent: Wednesday, 11 January 2023 00.47
> 
> Add support for programming PMU counters and reading their values
> in runtime bypassing kernel completely.
> 
> This is especially useful in cases where CPU cores are isolated
> (nohz_full) i.e run dedicated tasks. In such cases one cannot use
> standard perf utility without sacrificing latency and performance.
> 
> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
> ---

[...]

> +static int
> +do_perf_event_open(uint64_t config[3], unsigned int lcore_id, int
> group_fd)
> +{
> +	struct perf_event_attr attr = {
> +		.size = sizeof(struct perf_event_attr),
> +		.type = PERF_TYPE_RAW,
> +		.exclude_kernel = 1,
> +		.exclude_hv = 1,
> +		.disabled = 1,
> +	};
> +
> +	pmu_arch_fixup_config(config);
> +
> +	attr.config = config[0];
> +	attr.config1 = config[1];
> +	attr.config2 = config[2];
> +
> +	return syscall(SYS_perf_event_open, &attr, 0,
> rte_lcore_to_cpu_id(lcore_id), group_fd, 0);
> +}

If SYS_perf_event_open() must be called from the worker thread itself, then lcore_id must not be passed as a parameter to do_perf_event_open(). Otherwise, I would expect to be able to call do_perf_event_open() from the main thread and pass any lcore_id of a worker thread.
This comment applies to all functions that must be called from the worker thread itself. It also applies to the functions that call such functions.
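
I.e. something along these lines (just a sketch, not part of the patch: the
lcore_id parameter is dropped and the CPU is derived from the calling thread,
so the function is only meaningful when called from the lcore being
configured):

static int
do_perf_event_open(uint64_t config[3], int group_fd)
{
	struct perf_event_attr attr = {
		.size = sizeof(struct perf_event_attr),
		.type = PERF_TYPE_RAW,
		.exclude_kernel = 1,
		.exclude_hv = 1,
		.disabled = 1,
	};

	pmu_arch_fixup_config(config);

	attr.config = config[0];
	attr.config1 = config[1];
	attr.config2 = config[2];

	/* pid = 0: calling thread; cpu derived from the lcore this thread runs on */
	return syscall(SYS_perf_event_open, &attr, 0,
		       rte_lcore_to_cpu_id(rte_lcore_id()), group_fd, 0);
}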

[...]

> +/**
> + * A structure describing a group of events.
> + */
> +struct rte_pmu_event_group {
> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
> +	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS];
> /**< array of user pages */
> +	bool enabled; /**< true if group was enabled on particular lcore
> */
> +};
> +
> +/**
> + * A structure describing an event.
> + */
> +struct rte_pmu_event {
> +	char *name; /** name of an event */
> +	unsigned int index; /** event index into fds/mmap_pages */
> +	TAILQ_ENTRY(rte_pmu_event) next; /** list entry */
> +};

Move the "enabled" field up, making it the first field in the rte_pmu_event_group structure. This might reduce the number of instructions required to check (!group->enabled) in rte_pmu_read().

Also, each instance of the structure is used individually per lcore, so the structure should be cache line aligned to avoid unnecessarily crossing cache lines.

I.e.:

struct rte_pmu_event_group {
	bool enabled; /**< true if group was enabled on particular lcore */
	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
} __rte_cache_aligned;

> +
> +/**
> + * A PMU state container.
> + */
> +struct rte_pmu {
> +	char *name; /** name of core PMU listed under
> /sys/bus/event_source/devices */
> +	struct rte_pmu_event_group group[RTE_MAX_LCORE]; /**< per lcore
> event group data */
> +	unsigned int num_group_events; /**< number of events in a group
> */
> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching
> events */
> +};
> +
> +/** Pointer to the PMU state container */
> +extern struct rte_pmu rte_pmu;

Just "The PMU state container". It is not a pointer anymore. :-)

[...]

> +/**
> + * @internal
> + *
> + * Read PMU counter.
> + *
> + * @param pc
> + *   Pointer to the mmapped user page.
> + * @return
> + *   Counter value read from hardware.
> + */
> +__rte_internal
> +static __rte_always_inline uint64_t
> +rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
> +{
> +	uint64_t width, offset;
> +	uint32_t seq, index;
> +	int64_t pmc;
> +
> +	for (;;) {
> +		seq = pc->lock;
> +		rte_compiler_barrier();
> +		index = pc->index;
> +		offset = pc->offset;
> +		width = pc->pmc_width;
> +

Please add a comment here about the special meaning of index == 0.

> +		if (likely(pc->cap_user_rdpmc && index)) {
> +			pmc = rte_pmu_pmc_read(index - 1);
> +			pmc <<= 64 - width;
> +			pmc >>= 64 - width;
> +			offset += pmc;
> +		}
> +
> +		rte_compiler_barrier();
> +
> +		if (likely(pc->lock == seq))
> +			return offset;
> +	}
> +
> +	return 0;
> +}
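
Regarding that comment: assuming the usual perf_event_mmap_page semantics,
where index holds the hardware counter number plus one, it could read
roughly as follows:

		/*
		 * pc->index is the hardware counter number + 1; 0 means the
		 * event is not currently scheduled on a counter (or user-space
		 * rdpmc is not permitted), in which case only the kernel
		 * maintained pc->offset is returned.
		 */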

[...]

> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Read hardware counter configured to count occurrences of an event.
> + *
> + * @param index
> + *   Index of an event to be read.
> + * @return
> + *   Event value read from register. In case of errors or lack of
> support
> + *   0 is returned. In other words, stream of zeros in a trace file
> + *   indicates problem with reading particular PMU event register.
> + */
> +__rte_experimental
> +static __rte_always_inline uint64_t
> +rte_pmu_read(unsigned int index)
> +{
> +	struct rte_pmu_event_group *group;
> +	int ret, lcore_id = rte_lcore_id();
> +
> +	group = &rte_pmu.group[lcore_id];
> +	if (unlikely(!group->enabled)) {
> +		ret = rte_pmu_enable_group(lcore_id);
> +		if (ret)
> +			return 0;
> +
> +		group->enabled = true;

Group->enabled should be set inside rte_pmu_enable_group(), not here.

> +	}
> +
> +	if (unlikely(index >= rte_pmu.num_group_events))
> +		return 0;
> +
> +	return rte_pmu_read_userpage(group->mmap_pages[index]);
> +}
  
Tomasz Duszynski Jan. 11, 2023, 4:20 p.m. UTC | #2
>-----Original Message-----
>From: Morten Brørup <mb@smartsharesystems.com>
>Sent: Wednesday, January 11, 2023 10:06 AM
>To: Tomasz Duszynski <tduszynski@marvell.com>; dev@dpdk.org
>Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Ruifeng.Wang@arm.com;
>mattias.ronnblom@ericsson.com; zhoumin@loongson.cn
>Subject: [EXT] RE: [PATCH v5 1/4] eal: add generic support for reading PMU events
>
>External Email
>
>----------------------------------------------------------------------
>> From: Tomasz Duszynski [mailto:tduszynski@marvell.com]
>> Sent: Wednesday, 11 January 2023 00.47
>>
>> Add support for programming PMU counters and reading their values in
>> runtime bypassing kernel completely.
>>
>> This is especially useful in cases where CPU cores are isolated
>> (nohz_full) i.e run dedicated tasks. In such cases one cannot use
>> standard perf utility without sacrificing latency and performance.
>>
>> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
>> ---
>
>[...]
>
>> +static int
>> +do_perf_event_open(uint64_t config[3], unsigned int lcore_id, int
>> group_fd)
>> +{
>> +	struct perf_event_attr attr = {
>> +		.size = sizeof(struct perf_event_attr),
>> +		.type = PERF_TYPE_RAW,
>> +		.exclude_kernel = 1,
>> +		.exclude_hv = 1,
>> +		.disabled = 1,
>> +	};
>> +
>> +	pmu_arch_fixup_config(config);
>> +
>> +	attr.config = config[0];
>> +	attr.config1 = config[1];
>> +	attr.config2 = config[2];
>> +
>> +	return syscall(SYS_perf_event_open, &attr, 0,
>> rte_lcore_to_cpu_id(lcore_id), group_fd, 0);
>> +}
>
>If SYS_perf_event_open() must be called from the worker thread itself, then lcore_id must not be
>passed as a parameter to do_perf_event_open(). Otherwise, I would expect to be able to call
>do_perf_event_open() from the main thread and pass any lcore_id of a worker thread.
>This comment applies to all functions that must be called from the worker thread itself. It also
>applies to the functions that call such functions.
>

Lcore_id is being passed around so that we don't need to call rte_lcore_id() each and every time. 

>[...]
>
>> +/**
>> + * A structure describing a group of events.
>> + */
>> +struct rte_pmu_event_group {
>> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
>> +	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS];
>> /**< array of user pages */
>> +	bool enabled; /**< true if group was enabled on particular lcore
>> */
>> +};
>> +
>> +/**
>> + * A structure describing an event.
>> + */
>> +struct rte_pmu_event {
>> +	char *name; /** name of an event */
>> +	unsigned int index; /** event index into fds/mmap_pages */
>> +	TAILQ_ENTRY(rte_pmu_event) next; /** list entry */ };
>
>Move the "enabled" field up, making it the first field in this structure. This might reduce the
>number of instructions required to check (!group->enabled) in rte_pmu_read().
>

This will be called once and, no, this will not produce more instructions. Why should it?
In both cases the compiler will need to load data at some offset, and archs do have instructions for that.

>Also, each instance of the structure is used individually per lcore, so the structure should be
>cache line aligned to avoid unnecessarily crossing cache lines.
>
>I.e.:
>
>struct rte_pmu_event_group {
>	bool enabled; /**< true if group was enabled on particular lcore */
>	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
>	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */ }
>__rte_cache_aligned;

Yes, this can be aligned. While at it, I'd be more inclined to move mmap_pages up instead of enabled.
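
I.e. roughly (just a sketch):

struct rte_pmu_event_group {
	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
	bool enabled; /**< true if group was enabled on particular lcore */
} __rte_cache_aligned;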

>
>> +
>> +/**
>> + * A PMU state container.
>> + */
>> +struct rte_pmu {
>> +	char *name; /** name of core PMU listed under
>> /sys/bus/event_source/devices */
>> +	struct rte_pmu_event_group group[RTE_MAX_LCORE]; /**< per lcore
>> event group data */
>> +	unsigned int num_group_events; /**< number of events in a group
>> */
>> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching
>> events */
>> +};
>> +
>> +/** Pointer to the PMU state container */ extern struct rte_pmu
>> +rte_pmu;
>
>Just "The PMU state container". It is not a pointer anymore. :-)
>

Good catch.

>[...]
>
>> +/**
>> + * @internal
>> + *
>> + * Read PMU counter.
>> + *
>> + * @param pc
>> + *   Pointer to the mmapped user page.
>> + * @return
>> + *   Counter value read from hardware.
>> + */
>> +__rte_internal
>> +static __rte_always_inline uint64_t
>> +rte_pmu_read_userpage(struct perf_event_mmap_page *pc) {
>> +	uint64_t width, offset;
>> +	uint32_t seq, index;
>> +	int64_t pmc;
>> +
>> +	for (;;) {
>> +		seq = pc->lock;
>> +		rte_compiler_barrier();
>> +		index = pc->index;
>> +		offset = pc->offset;
>> +		width = pc->pmc_width;
>> +
>
>Please add a comment here about the special meaning of index == 0.

Okay. 

>
>> +		if (likely(pc->cap_user_rdpmc && index)) {
>> +			pmc = rte_pmu_pmc_read(index - 1);
>> +			pmc <<= 64 - width;
>> +			pmc >>= 64 - width;
>> +			offset += pmc;
>> +		}
>> +
>> +		rte_compiler_barrier();
>> +
>> +		if (likely(pc->lock == seq))
>> +			return offset;
>> +	}
>> +
>> +	return 0;
>> +}
>
>[...]
>
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice
>> + *
>> + * Read hardware counter configured to count occurrences of an event.
>> + *
>> + * @param index
>> + *   Index of an event to be read.
>> + * @return
>> + *   Event value read from register. In case of errors or lack of
>> support
>> + *   0 is returned. In other words, stream of zeros in a trace file
>> + *   indicates problem with reading particular PMU event register.
>> + */
>> +__rte_experimental
>> +static __rte_always_inline uint64_t
>> +rte_pmu_read(unsigned int index)
>> +{
>> +	struct rte_pmu_event_group *group;
>> +	int ret, lcore_id = rte_lcore_id();
>> +
>> +	group = &rte_pmu.group[lcore_id];
>> +	if (unlikely(!group->enabled)) {
>> +		ret = rte_pmu_enable_group(lcore_id);
>> +		if (ret)
>> +			return 0;
>> +
>> +		group->enabled = true;
>
>Group->enabled should be set inside rte_pmu_enable_group(), not here.
>

This is easier to follow IMO and not against coding guidelines, so I prefer to leave it as is.

>> +	}
>> +
>> +	if (unlikely(index >= rte_pmu.num_group_events))
>> +		return 0;
>> +
>> +	return rte_pmu_read_userpage(group->mmap_pages[index]);
>> +}
>
  
Morten Brørup Jan. 11, 2023, 4:54 p.m. UTC | #3
> From: Tomasz Duszynski [mailto:tduszynski@marvell.com]
> Sent: Wednesday, 11 January 2023 17.21
> 
> >From: Morten Brørup <mb@smartsharesystems.com>
> >Sent: Wednesday, January 11, 2023 10:06 AM
> >
> >> From: Tomasz Duszynski [mailto:tduszynski@marvell.com]
> >> Sent: Wednesday, 11 January 2023 00.47
> >>
> >> Add support for programming PMU counters and reading their values in
> >> runtime bypassing kernel completely.
> >>
> >> This is especially useful in cases where CPU cores are isolated
> >> (nohz_full) i.e run dedicated tasks. In such cases one cannot use
> >> standard perf utility without sacrificing latency and performance.
> >>
> >> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
> >> ---
> >
> >[...]
> >
> >> +static int
> >> +do_perf_event_open(uint64_t config[3], unsigned int lcore_id, int
> >> group_fd)
> >> +{
> >> +	struct perf_event_attr attr = {
> >> +		.size = sizeof(struct perf_event_attr),
> >> +		.type = PERF_TYPE_RAW,
> >> +		.exclude_kernel = 1,
> >> +		.exclude_hv = 1,
> >> +		.disabled = 1,
> >> +	};
> >> +
> >> +	pmu_arch_fixup_config(config);
> >> +
> >> +	attr.config = config[0];
> >> +	attr.config1 = config[1];
> >> +	attr.config2 = config[2];
> >> +
> >> +	return syscall(SYS_perf_event_open, &attr, 0,
> >> rte_lcore_to_cpu_id(lcore_id), group_fd, 0);
> >> +}
> >
> >If SYS_perf_event_open() must be called from the worker thread itself,
> then lcore_id must not be
> >passed as a parameter to do_perf_event_open(). Otherwise, I would
> expect to be able to call
> >do_perf_event_open() from the main thread and pass any lcore_id of a
> worker thread.
> >This comment applies to all functions that must be called from the
> worker thread itself. It also
> >applies to the functions that call such functions.
> >
> 
> Lcore_id is being passed around so that we don't need to call
> rte_lcore_id() each and every time.

Please take a look at the rte_lcore_id() implementation. :-)

Regardless, my argument still stands: If a function cannot be called with the lcore_id parameter set to any valid lcore id, it should not be a parameter to the function.

> 
> >[...]
> >
> >> +/**
> >> + * A structure describing a group of events.
> >> + */
> >> +struct rte_pmu_event_group {
> >> +	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
> >> +	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS];
> >> /**< array of user pages */
> >> +	bool enabled; /**< true if group was enabled on particular lcore
> >> */
> >> +};
> >> +
> >> +/**
> >> + * A structure describing an event.
> >> + */
> >> +struct rte_pmu_event {
> >> +	char *name; /** name of an event */
> >> +	unsigned int index; /** event index into fds/mmap_pages */
> >> +	TAILQ_ENTRY(rte_pmu_event) next; /** list entry */ };
> >
> >Move the "enabled" field up, making it the first field in this
> structure. This might reduce the
> >number of instructions required to check (!group->enabled) in
> rte_pmu_read().
> >
> 
> This will be called once and no this will not produce more
> instructions. Why should it?

It seems I was not clearly describing my intention here. rte_pmu_read() is a hot function, where the comparison "if (!group->enabled)" itself will be executed many times.

> In both cases compiler will need to load data at some offset and archs
> do have instructions for that.

Yes, the instructions are: address = BASE + sizeof(struct rte_pmu_event_group) * lcore_id + offsetof(struct rte_pmu_event_group, enabled).

I meant you could avoid the extra instructions stemming from the addition: "+ offsetof()". But you are right... Both BASE and offsetof(struct rte_pmu_event_group, enabled) are known in advance, and can be merged at compile time to avoid the addition.

> 
> >Also, each instance of the structure is used individually per lcore,
> so the structure should be
> >cache line aligned to avoid unnecessarily crossing cache lines.
> >
> >I.e.:
> >
> >struct rte_pmu_event_group {
> >	bool enabled; /**< true if group was enabled on particular lcore
> */
> >	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
> >	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS];
> /**< array of user pages */ }
> >__rte_cache_aligned;
> 
> Yes, this can be aligned. While at it, I'd be more inclined to move
> mmap_pages up instead of enable.

Yes, moving up mmap_pages is better.

> 
> >
> >> +
> >> +/**
> >> + * A PMU state container.
> >> + */
> >> +struct rte_pmu {
> >> +	char *name; /** name of core PMU listed under
> >> /sys/bus/event_source/devices */
> >> +	struct rte_pmu_event_group group[RTE_MAX_LCORE]; /**< per lcore
> >> event group data */
> >> +	unsigned int num_group_events; /**< number of events in a group
> >> */
> >> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching
> >> events */
> >> +};
> >> +
> >> +/** Pointer to the PMU state container */ extern struct rte_pmu
> >> +rte_pmu;
> >
> >Just "The PMU state container". It is not a pointer anymore. :-)
> >
> 
> Good catch.
> 
> >[...]
> >
> >> +/**
> >> + * @internal
> >> + *
> >> + * Read PMU counter.
> >> + *
> >> + * @param pc
> >> + *   Pointer to the mmapped user page.
> >> + * @return
> >> + *   Counter value read from hardware.
> >> + */
> >> +__rte_internal
> >> +static __rte_always_inline uint64_t
> >> +rte_pmu_read_userpage(struct perf_event_mmap_page *pc) {
> >> +	uint64_t width, offset;
> >> +	uint32_t seq, index;
> >> +	int64_t pmc;
> >> +
> >> +	for (;;) {
> >> +		seq = pc->lock;
> >> +		rte_compiler_barrier();
> >> +		index = pc->index;
> >> +		offset = pc->offset;
> >> +		width = pc->pmc_width;
> >> +
> >
> >Please add a comment here about the special meaning of index == 0.
> 
> Okay.
> 
> >
> >> +		if (likely(pc->cap_user_rdpmc && index)) {
> >> +			pmc = rte_pmu_pmc_read(index - 1);
> >> +			pmc <<= 64 - width;
> >> +			pmc >>= 64 - width;
> >> +			offset += pmc;
> >> +		}
> >> +
> >> +		rte_compiler_barrier();
> >> +
> >> +		if (likely(pc->lock == seq))
> >> +			return offset;
> >> +	}
> >> +
> >> +	return 0;
> >> +}
> >
> >[...]
> >
> >> +/**
> >> + * @warning
> >> + * @b EXPERIMENTAL: this API may change without prior notice
> >> + *
> >> + * Read hardware counter configured to count occurrences of an
> event.
> >> + *
> >> + * @param index
> >> + *   Index of an event to be read.
> >> + * @return
> >> + *   Event value read from register. In case of errors or lack of
> >> support
> >> + *   0 is returned. In other words, stream of zeros in a trace file
> >> + *   indicates problem with reading particular PMU event register.
> >> + */
> >> +__rte_experimental
> >> +static __rte_always_inline uint64_t
> >> +rte_pmu_read(unsigned int index)
> >> +{
> >> +	struct rte_pmu_event_group *group;
> >> +	int ret, lcore_id = rte_lcore_id();
> >> +
> >> +	group = &rte_pmu.group[lcore_id];
> >> +	if (unlikely(!group->enabled)) {
> >> +		ret = rte_pmu_enable_group(lcore_id);
> >> +		if (ret)
> >> +			return 0;
> >> +
> >> +		group->enabled = true;
> >
> >Group->enabled should be set inside rte_pmu_enable_group(), not here.
> >
> 
> This is easier to follow imo and not against coding guidelines so I
> prefer to leave it as is.

OK. It makes the rte_pmu_read() source code slightly shorter, but probably has zero effect on the generated code. No strong preference - feel free to follow your personal preference on this.

> 
> >> +	}
> >> +
> >> +	if (unlikely(index >= rte_pmu.num_group_events))
> >> +		return 0;
> >> +
> >> +	return rte_pmu_read_userpage(group->mmap_pages[index]);
> >> +}
> >
>
  

Patch

diff --git a/app/test/meson.build b/app/test/meson.build
index f34d19e3c3..93b3300309 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -143,6 +143,7 @@  test_sources = files(
         'test_timer_racecond.c',
         'test_timer_secondary.c',
         'test_ticketlock.c',
+        'test_pmu.c',
         'test_trace.c',
         'test_trace_register.c',
         'test_trace_perf.c',
diff --git a/app/test/test_pmu.c b/app/test/test_pmu.c
new file mode 100644
index 0000000000..9a90aaffdb
--- /dev/null
+++ b/app/test/test_pmu.c
@@ -0,0 +1,41 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell International Ltd.
+ */
+
+#include <rte_pmu.h>
+
+#include "test.h"
+
+static int
+test_pmu_read(void)
+{
+	uint64_t val = 0;
+	int tries = 10;
+	int event = -1;
+
+	while (tries--)
+		val += rte_pmu_read(event);
+
+	if (val == 0)
+		return TEST_FAILED;
+
+	return TEST_SUCCESS;
+}
+
+static struct unit_test_suite pmu_tests = {
+	.suite_name = "pmu autotest",
+	.setup = NULL,
+	.teardown = NULL,
+	.unit_test_cases = {
+		TEST_CASE(test_pmu_read),
+		TEST_CASES_END()
+	}
+};
+
+static int
+test_pmu(void)
+{
+	return unit_test_suite_runner(&pmu_tests);
+}
+
+REGISTER_TEST_COMMAND(pmu_autotest, test_pmu);
diff --git a/doc/guides/prog_guide/profile_app.rst b/doc/guides/prog_guide/profile_app.rst
index 14292d4c25..a8b501fe0c 100644
--- a/doc/guides/prog_guide/profile_app.rst
+++ b/doc/guides/prog_guide/profile_app.rst
@@ -7,6 +7,14 @@  Profile Your Application
 The following sections describe methods of profiling DPDK applications on
 different architectures.
 
+Performance counter based profiling
+-----------------------------------
+
+The majority of architectures support some sort of hardware measurement unit which provides a set
+of programmable counters that monitor specific events. There are different tools which can gather
+that information, perf being an example here. However, in some scenarios, e.g. when CPU cores are
+isolated (nohz_full) and run dedicated tasks, using perf is less than ideal. In such cases one can
+read specific events directly from the application via ``rte_pmu_read()``.
 
 Profiling on x86
 ----------------
diff --git a/lib/eal/common/meson.build b/lib/eal/common/meson.build
index 917758cc65..d6d05b56f3 100644
--- a/lib/eal/common/meson.build
+++ b/lib/eal/common/meson.build
@@ -38,6 +38,9 @@  sources += files(
         'rte_service.c',
         'rte_version.c',
 )
+if is_linux
+    sources += files('rte_pmu.c')
+endif
 if is_linux or is_windows
     sources += files('eal_common_dynmem.c')
 endif
diff --git a/lib/eal/common/pmu_private.h b/lib/eal/common/pmu_private.h
new file mode 100644
index 0000000000..cade4245e6
--- /dev/null
+++ b/lib/eal/common/pmu_private.h
@@ -0,0 +1,41 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Marvell
+ */
+
+#ifndef _PMU_PRIVATE_H_
+#define _PMU_PRIVATE_H_
+
+/**
+ * Architecture specific PMU init callback.
+ *
+ * @return
+ *   0 in case of success, negative value otherwise.
+ */
+int
+pmu_arch_init(void);
+
+/**
+ * Architecture specific PMU cleanup callback.
+ */
+void
+pmu_arch_fini(void);
+
+/**
+ * Apply architecture specific settings to config before passing it to syscall.
+ */
+void
+pmu_arch_fixup_config(uint64_t config[3]);
+
+/**
+ * Initialize PMU tracing internals.
+ */
+void
+eal_pmu_init(void);
+
+/**
+ * Cleanup PMU internals.
+ */
+void
+eal_pmu_fini(void);
+
+#endif /* _PMU_PRIVATE_H_ */
diff --git a/lib/eal/common/rte_pmu.c b/lib/eal/common/rte_pmu.c
new file mode 100644
index 0000000000..67e8ffefb2
--- /dev/null
+++ b/lib/eal/common/rte_pmu.c
@@ -0,0 +1,435 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2023 Marvell International Ltd.
+ */
+
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <regex.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <rte_eal_paging.h>
+#include <rte_pmu.h>
+#include <rte_tailq.h>
+
+#include "pmu_private.h"
+
+#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
+
+#ifndef GENMASK_ULL
+#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> ((64 - 1 - (h)))))
+#endif
+
+#ifndef FIELD_PREP
+#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) & (m))
+#endif
+
+struct rte_pmu rte_pmu;
+
+/*
+ * Following __rte_weak functions provide default no-op. Architectures should override them if
+ * necessary.
+ */
+
+int
+__rte_weak pmu_arch_init(void)
+{
+	return 0;
+}
+
+void
+__rte_weak pmu_arch_fini(void)
+{
+}
+
+void
+__rte_weak pmu_arch_fixup_config(uint64_t config[3])
+{
+	RTE_SET_USED(config);
+}
+
+static int
+get_term_format(const char *name, int *num, uint64_t *mask)
+{
+	char *config = NULL;
+	char path[PATH_MAX];
+	int high, low, ret;
+	FILE *fp;
+
+	/* quiesce -Wmaybe-uninitialized warning */
+	*num = 0;
+	*mask = 0;
+
+	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu.name, name);
+	fp = fopen(path, "r");
+	if (fp == NULL)
+		return -errno;
+
+	errno = 0;
+	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
+	if (ret < 2) {
+		ret = -ENODATA;
+		goto out;
+	}
+	if (errno) {
+		ret = -errno;
+		goto out;
+	}
+
+	if (ret == 2)
+		high = low;
+
+	*mask = GENMASK_ULL(high, low);
+	/* Last digit should be [012]. If last digit is missing 0 is implied. */
+	*num = config[strlen(config) - 1];
+	*num = isdigit(*num) ? *num - '0' : 0;
+
+	ret = 0;
+out:
+	free(config);
+	fclose(fp);
+
+	return ret;
+}
+
+static int
+parse_event(char *buf, uint64_t config[3])
+{
+	char *token, *term;
+	int num, ret, val;
+	uint64_t mask;
+
+	config[0] = config[1] = config[2] = 0;
+
+	token = strtok(buf, ",");
+	while (token) {
+		errno = 0;
+		/* <term>=<value> */
+		ret = sscanf(token, "%m[^=]=%i", &term, &val);
+		if (ret < 1)
+			return -ENODATA;
+		if (errno)
+			return -errno;
+		if (ret == 1)
+			val = 1;
+
+		ret = get_term_format(term, &num, &mask);
+		free(term);
+		if (ret)
+			return ret;
+
+		config[num] |= FIELD_PREP(mask, val);
+		token = strtok(NULL, ",");
+	}
+
+	return 0;
+}
+
+static int
+get_event_config(const char *name, uint64_t config[3])
+{
+	char path[PATH_MAX], buf[BUFSIZ];
+	FILE *fp;
+	int ret;
+
+	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
+	fp = fopen(path, "r");
+	if (fp == NULL)
+		return -errno;
+
+	ret = fread(buf, 1, sizeof(buf), fp);
+	if (ret == 0) {
+		fclose(fp);
+
+		return -EINVAL;
+	}
+	fclose(fp);
+	buf[ret] = '\0';
+
+	return parse_event(buf, config);
+}
+
+static int
+do_perf_event_open(uint64_t config[3], unsigned int lcore_id, int group_fd)
+{
+	struct perf_event_attr attr = {
+		.size = sizeof(struct perf_event_attr),
+		.type = PERF_TYPE_RAW,
+		.exclude_kernel = 1,
+		.exclude_hv = 1,
+		.disabled = 1,
+	};
+
+	pmu_arch_fixup_config(config);
+
+	attr.config = config[0];
+	attr.config1 = config[1];
+	attr.config2 = config[2];
+
+	return syscall(SYS_perf_event_open, &attr, 0, rte_lcore_to_cpu_id(lcore_id), group_fd, 0);
+}
+
+static int
+open_events(unsigned int lcore_id)
+{
+	struct rte_pmu_event_group *group = &rte_pmu.group[lcore_id];
+	struct rte_pmu_event *event;
+	uint64_t config[3];
+	int num = 0, ret;
+
+	/* group leader gets created first, with fd = -1 */
+	group->fds[0] = -1;
+
+	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
+		ret = get_event_config(event->name, config);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "failed to get %s event config\n", event->name);
+			continue;
+		}
+
+		ret = do_perf_event_open(config, lcore_id, group->fds[0]);
+		if (ret == -1) {
+			if (errno == EOPNOTSUPP)
+				RTE_LOG(ERR, EAL, "64 bit counters not supported\n");
+
+			ret = -errno;
+			goto out;
+		}
+
+		group->fds[event->index] = ret;
+		num++;
+	}
+
+	return 0;
+out:
+	for (--num; num >= 0; num--) {
+		close(group->fds[num]);
+		group->fds[num] = -1;
+	}
+
+
+	return ret;
+}
+
+static int
+mmap_events(unsigned int lcore_id)
+{
+	struct rte_pmu_event_group *group = &rte_pmu.group[lcore_id];
+	unsigned int i;
+	void *addr;
+	int ret;
+
+	for (i = 0; i < rte_pmu.num_group_events; i++) {
+		addr = mmap(0, rte_mem_page_size(), PROT_READ, MAP_SHARED, group->fds[i], 0);
+		if (addr == MAP_FAILED) {
+			ret = -errno;
+			goto out;
+		}
+
+		group->mmap_pages[i] = addr;
+	}
+
+	return 0;
+out:
+	for (; i; i--) {
+		munmap(group->mmap_pages[i - 1], rte_mem_page_size());
+		group->mmap_pages[i - 1] = NULL;
+	}
+
+	return ret;
+}
+
+static void
+cleanup_events(unsigned int lcore_id)
+{
+	struct rte_pmu_event_group *group = &rte_pmu.group[lcore_id];
+	unsigned int i;
+
+	if (group->fds == NULL)
+		return;
+
+	if (group->fds[0] != -1)
+		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
+
+	for (i = 0; i < rte_pmu.num_group_events; i++) {
+		if (group->mmap_pages[i]) {
+			munmap(group->mmap_pages[i], rte_mem_page_size());
+			group->mmap_pages[i] = NULL;
+		}
+
+		if (group->fds[i] != -1) {
+			close(group->fds[i]);
+			group->fds[i] = -1;
+		}
+	}
+
+	group->enabled = false;
+}
+
+int __rte_noinline
+rte_pmu_enable_group(unsigned int lcore_id)
+{
+	struct rte_pmu_event_group *group = &rte_pmu.group[lcore_id];
+	int ret;
+
+	if (rte_pmu.num_group_events == 0) {
+		RTE_LOG(DEBUG, EAL, "no matching PMU events\n");
+
+		return 0;
+	}
+
+	ret = open_events(lcore_id);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "failed to open events on lcore-worker-%d\n", lcore_id);
+		goto out;
+	}
+
+	ret = mmap_events(lcore_id);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "failed to map events on lcore-worker-%d\n", lcore_id);
+		goto out;
+	}
+
+	if (ioctl(group->fds[0], PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
+		RTE_LOG(ERR, EAL, "failed to reset events on lcore-worker-%d\n", lcore_id);
+
+		ret = -errno;
+		goto out;
+	}
+
+	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
+		RTE_LOG(ERR, EAL, "failed to enable events on lcore-worker-%d\n", lcore_id);
+
+		ret = -errno;
+		goto out;
+	}
+
+	return 0;
+
+out:
+	cleanup_events(lcore_id);
+
+	return ret;
+}
+
+static int
+scan_pmus(void)
+{
+	char path[PATH_MAX];
+	struct dirent *dent;
+	const char *name;
+	DIR *dirp;
+
+	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
+	if (dirp == NULL)
+		return -errno;
+
+	while ((dent = readdir(dirp))) {
+		name = dent->d_name;
+		if (name[0] == '.')
+			continue;
+
+		/* sysfs entry should either contain cpus or be a cpu */
+		if (!strcmp(name, "cpu"))
+			break;
+
+		snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
+		if (access(path, F_OK) == 0)
+			break;
+	}
+
+	closedir(dirp);
+
+	if (dent) {
+		rte_pmu.name = strdup(name);
+		if (rte_pmu.name == NULL)
+			return -ENOMEM;
+	}
+
+	return rte_pmu.name ? 0 : -ENODEV;
+}
+
+int
+rte_pmu_add_event(const char *name)
+{
+	struct rte_pmu_event *event;
+	char path[PATH_MAX];
+
+	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu.name, name);
+	if (access(path, R_OK))
+		return -ENODEV;
+
+	TAILQ_FOREACH(event, &rte_pmu.event_list, next) {
+		if (!strcmp(event->name, name))
+			return event->index;
+		continue;
+	}
+
+	event = calloc(1, sizeof(*event));
+	if (!event)
+		return -ENOMEM;
+
+	event->name = strdup(name);
+	if (!event->name) {
+		free(event);
+
+		return -ENOMEM;
+	}
+
+	event->index = rte_pmu.num_group_events++;
+	TAILQ_INSERT_TAIL(&rte_pmu.event_list, event, next);
+
+	RTE_LOG(DEBUG, EAL, "%s event added at index %d\n", name, event->index);
+
+	return event->index;
+}
+
+void
+eal_pmu_init(void)
+{
+	int ret;
+
+	TAILQ_INIT(&rte_pmu.event_list);
+
+	ret = scan_pmus();
+	if (ret) {
+		RTE_LOG(ERR, EAL, "failed to find core pmu\n");
+		goto out;
+	}
+
+	ret = pmu_arch_init();
+	if (ret) {
+		RTE_LOG(ERR, EAL, "failed to setup arch for PMU\n");
+		goto out;
+	}
+
+	return;
+out:
+	free(rte_pmu.name);
+	rte_pmu.name = NULL;
+}
+
+void
+eal_pmu_fini(void)
+{
+	struct rte_pmu_event *event, *tmp;
+	unsigned int lcore_id;
+
+	RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu.event_list, next, tmp) {
+		TAILQ_REMOVE(&rte_pmu.event_list, event, next);
+		free(event->name);
+		free(event);
+	}
+
+	RTE_LCORE_FOREACH(lcore_id)
+		cleanup_events(lcore_id);
+
+	pmu_arch_fini();
+	free(rte_pmu.name);
+}
diff --git a/lib/eal/include/meson.build b/lib/eal/include/meson.build
index cfcd40aaed..3bf830adee 100644
--- a/lib/eal/include/meson.build
+++ b/lib/eal/include/meson.build
@@ -36,6 +36,7 @@  headers += files(
         'rte_pci_dev_features.h',
         'rte_per_lcore.h',
         'rte_pflock.h',
+        'rte_pmu.h',
         'rte_random.h',
         'rte_reciprocal.h',
         'rte_seqcount.h',
diff --git a/lib/eal/include/rte_pmu.h b/lib/eal/include/rte_pmu.h
new file mode 100644
index 0000000000..6968b35545
--- /dev/null
+++ b/lib/eal/include/rte_pmu.h
@@ -0,0 +1,199 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Marvell
+ */
+
+#ifndef _RTE_PMU_H_
+#define _RTE_PMU_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+#include <rte_compat.h>
+
+#ifdef RTE_EXEC_ENV_LINUX
+
+#include <linux/perf_event.h>
+
+#include <rte_atomic.h>
+#include <rte_branch_prediction.h>
+#include <rte_lcore.h>
+#include <rte_log.h>
+
+/**
+ * @file
+ *
+ * PMU event tracing operations
+ *
+ * This file defines generic API and types necessary to setup PMU and
+ * read selected counters in runtime.
+ */
+
+/** Maximum number of events in a group */
+#define MAX_NUM_GROUP_EVENTS 16
+
+/**
+ * A structure describing a group of events.
+ */
+struct rte_pmu_event_group {
+	int fds[MAX_NUM_GROUP_EVENTS]; /**< array of event descriptors */
+	struct perf_event_mmap_page *mmap_pages[MAX_NUM_GROUP_EVENTS]; /**< array of user pages */
+	bool enabled; /**< true if group was enabled on particular lcore */
+};
+
+/**
+ * A structure describing an event.
+ */
+struct rte_pmu_event {
+	char *name; /** name of an event */
+	unsigned int index; /** event index into fds/mmap_pages */
+	TAILQ_ENTRY(rte_pmu_event) next; /** list entry */
+};
+
+/**
+ * A PMU state container.
+ */
+struct rte_pmu {
+	char *name; /** name of core PMU listed under /sys/bus/event_source/devices */
+	struct rte_pmu_event_group group[RTE_MAX_LCORE]; /**< per lcore event group data */
+	unsigned int num_group_events; /**< number of events in a group */
+	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
+};
+
+/** Pointer to the PMU state container */
+extern struct rte_pmu rte_pmu;
+
+/** Each architecture supporting PMU needs to provide its own version */
+#ifndef rte_pmu_pmc_read
+#define rte_pmu_pmc_read(index) ({ 0; })
+#endif
+
+/**
+ * @internal
+ *
+ * Read PMU counter.
+ *
+ * @param pc
+ *   Pointer to the mmapped user page.
+ * @return
+ *   Counter value read from hardware.
+ */
+__rte_internal
+static __rte_always_inline uint64_t
+rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
+{
+	uint64_t width, offset;
+	uint32_t seq, index;
+	int64_t pmc;
+
+	for (;;) {
+		seq = pc->lock;
+		rte_compiler_barrier();
+		index = pc->index;
+		offset = pc->offset;
+		width = pc->pmc_width;
+
+		if (likely(pc->cap_user_rdpmc && index)) {
+			pmc = rte_pmu_pmc_read(index - 1);
+			pmc <<= 64 - width;
+			pmc >>= 64 - width;
+			offset += pmc;
+		}
+
+		rte_compiler_barrier();
+
+		if (likely(pc->lock == seq))
+			return offset;
+	}
+
+	return 0;
+}
+
+/**
+ * @internal
+ *
+ * Enable group of events for a given lcore.
+ *
+ * @param lcore_id
+ *   The identifier of the lcore.
+ * @return
+ *   0 in case of success, negative value otherwise.
+ */
+__rte_internal
+int
+rte_pmu_enable_group(unsigned int lcore_id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Add event to the group of enabled events.
+ *
+ * @param name
+ *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
+ * @return
+ *   Event index in case of success, negative value otherwise.
+ */
+__rte_experimental
+int
+rte_pmu_add_event(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Read hardware counter configured to count occurrences of an event.
+ *
+ * @param index
+ *   Index of an event to be read.
+ * @return
+ *   Event value read from register. In case of errors or lack of support
+ *   0 is returned. In other words, stream of zeros in a trace file
+ *   indicates problem with reading particular PMU event register.
+ */
+__rte_experimental
+static __rte_always_inline uint64_t
+rte_pmu_read(unsigned int index)
+{
+	struct rte_pmu_event_group *group;
+	int ret, lcore_id = rte_lcore_id();
+
+	group = &rte_pmu.group[lcore_id];
+	if (unlikely(!group->enabled)) {
+		ret = rte_pmu_enable_group(lcore_id);
+		if (ret)
+			return 0;
+
+		group->enabled = true;
+	}
+
+	if (unlikely(index >= rte_pmu.num_group_events))
+		return 0;
+
+	return rte_pmu_read_userpage(group->mmap_pages[index]);
+}
+
+#else /* !RTE_EXEC_ENV_LINUX */
+
+__rte_experimental
+static int __rte_unused
+rte_pmu_add_event(__rte_unused const char *name)
+{
+	return -1;
+}
+
+__rte_experimental
+static __rte_always_inline uint64_t
+rte_pmu_read(__rte_unused unsigned int index)
+{
+	return 0;
+}
+
+#endif /* RTE_EXEC_ENV_LINUX */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PMU_H_ */
diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c
index 8c118d0d9f..751a13b597 100644
--- a/lib/eal/linux/eal.c
+++ b/lib/eal/linux/eal.c
@@ -53,6 +53,7 @@ 
 #include "eal_options.h"
 #include "eal_vfio.h"
 #include "hotplug_mp.h"
+#include "pmu_private.h"
 
 #define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL)
 
@@ -1206,6 +1207,8 @@  rte_eal_init(int argc, char **argv)
 		return -1;
 	}
 
+	eal_pmu_init();
+
 	if (rte_eal_tailqs_init() < 0) {
 		rte_eal_init_alert("Cannot init tail queues for objects");
 		rte_errno = EFAULT;
@@ -1372,6 +1375,7 @@  rte_eal_cleanup(void)
 	eal_bus_cleanup();
 	rte_trace_save();
 	eal_trace_fini();
+	eal_pmu_fini();
 	/* after this point, any DPDK pointers will become dangling */
 	rte_eal_memory_detach();
 	eal_mp_dev_hotplug_cleanup();
diff --git a/lib/eal/version.map b/lib/eal/version.map
index 7ad12a7dc9..1717b221b4 100644
--- a/lib/eal/version.map
+++ b/lib/eal/version.map
@@ -440,6 +440,11 @@  EXPERIMENTAL {
 	rte_thread_detach;
 	rte_thread_equal;
 	rte_thread_join;
+
+	# added in 23.03
+	rte_pmu; # WINDOWS_NO_EXPORT
+	rte_pmu_add_event; # WINDOWS_NO_EXPORT
+	rte_pmu_read; # WINDOWS_NO_EXPORT
 };
 
 INTERNAL {
@@ -483,4 +488,5 @@  INTERNAL {
 	rte_mem_map;
 	rte_mem_page_size;
 	rte_mem_unmap;
+	rte_pmu_enable_group; # WINDOWS_NO_EXPORT
 };