[v4,1/4] eal: add generic support for reading PMU events

Message ID 20221213104350.3218167-2-tduszynski@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series: add support for self monitoring

Checks

ci/checkpatch: warning (coding style issues)

Commit Message

Tomasz Duszynski Dec. 13, 2022, 10:43 a.m. UTC
  Add support for programming PMU counters and reading their values
at runtime, bypassing the kernel completely.

This is especially useful in cases where CPU cores are isolated
(nohz_full), i.e. they run dedicated tasks. In such cases one cannot use
the standard perf utility without sacrificing latency and performance.
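
A minimal usage sketch of the API (the event name below is hypothetical;
available names are listed under /sys/bus/event_source/devices/<pmu>/events
on the target):

    #include <rte_pmu.h>

    /* setup (main lcore): register the event and remember its index */
    int index = rte_pmu_add_event("cpu_cycles"); /* hypothetical name */

    /* hot path (worker lcore): read the counter; errors or lack of
     * support yield 0 */
    uint64_t cycles = rte_pmu_read(index);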

Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
---
 app/test/meson.build                  |   1 +
 app/test/test_pmu.c                   |  41 +++
 doc/guides/prog_guide/profile_app.rst |   8 +
 lib/eal/common/meson.build            |   3 +
 lib/eal/common/pmu_private.h          |  41 +++
 lib/eal/common/rte_pmu.c              | 456 ++++++++++++++++++++++++++
 lib/eal/include/meson.build           |   1 +
 lib/eal/include/rte_pmu.h             | 204 ++++++++++++
 lib/eal/linux/eal.c                   |   4 +
 lib/eal/version.map                   |   6 +
 10 files changed, 765 insertions(+)
 create mode 100644 app/test/test_pmu.c
 create mode 100644 lib/eal/common/pmu_private.h
 create mode 100644 lib/eal/common/rte_pmu.c
 create mode 100644 lib/eal/include/rte_pmu.h
  

Comments

Morten Brørup Dec. 13, 2022, 11:52 a.m. UTC | #1
> From: Tomasz Duszynski [mailto:tduszynski@marvell.com]
> Sent: Tuesday, 13 December 2022 11.44
> 
> Add support for programming PMU counters and reading their values
> in runtime bypassing kernel completely.
> 
> This is especially useful in cases where CPU cores are isolated
> (nohz_full) i.e run dedicated tasks. In such cases one cannot use
> standard perf utility without sacrificing latency and performance.
> 
> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
> ---


> +++ b/lib/eal/common/rte_pmu.c
> @@ -0,0 +1,456 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2022 Marvell International Ltd.
> + */
> +
> +#include <ctype.h>
> +#include <dirent.h>
> +#include <errno.h>
> +#include <regex.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <sys/queue.h>
> +#include <sys/syscall.h>
> +#include <unistd.h>
> +
> +#include <rte_eal_paging.h>
> +#include <rte_pmu.h>
> +#include <rte_tailq.h>
> +
> +#include "pmu_private.h"
> +
> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
> +
> +#ifndef GENMASK_ULL
> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> ((64 - 1 - (h)))))
> +#endif
> +
> +#ifndef FIELD_PREP
> +#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) & (m))
> +#endif
> +
> +struct rte_pmu *rte_pmu;
> +
> +/*
> + * Following __rte_weak functions provide default no-op. Architectures should override them if
> + * necessary.
> + */
> +
> +int
> +__rte_weak pmu_arch_init(void)
> +{
> +	return 0;
> +}
> +
> +void
> +__rte_weak pmu_arch_fini(void)
> +{
> +}
> +
> +void
> +__rte_weak pmu_arch_fixup_config(uint64_t config[3])
> +{
> +	RTE_SET_USED(config);
> +}
> +
> +static int
> +get_term_format(const char *name, int *num, uint64_t *mask)
> +{
> +	char *config = NULL;
> +	char path[PATH_MAX];
> +	int high, low, ret;
> +	FILE *fp;
> +
> +	/* quiesce -Wmaybe-uninitialized warning */
> +	*num = 0;
> +	*mask = 0;
> +
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu->name, name);
> +	fp = fopen(path, "r");
> +	if (!fp)
> +		return -errno;
> +
> +	errno = 0;
> +	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
> +	if (ret < 2) {
> +		ret = -ENODATA;
> +		goto out;
> +	}
> +	if (errno) {
> +		ret = -errno;
> +		goto out;
> +	}
> +
> +	if (ret == 2)
> +		high = low;
> +
> +	*mask = GENMASK_ULL(high, low);
> +	/* Last digit should be [012]. If last digit is missing 0 is implied. */
> +	*num = config[strlen(config) - 1];
> +	*num = isdigit(*num) ? *num - '0' : 0;
> +
> +	ret = 0;
> +out:
> +	free(config);
> +	fclose(fp);
> +
> +	return ret;
> +}
> +
> +static int
> +parse_event(char *buf, uint64_t config[3])
> +{
> +	char *token, *term;
> +	int num, ret, val;
> +	uint64_t mask;
> +
> +	config[0] = config[1] = config[2] = 0;
> +
> +	token = strtok(buf, ",");
> +	while (token) {
> +		errno = 0;
> +		/* <term>=<value> */
> +		ret = sscanf(token, "%m[^=]=%i", &term, &val);
> +		if (ret < 1)
> +			return -ENODATA;
> +		if (errno)
> +			return -errno;
> +		if (ret == 1)
> +			val = 1;
> +
> +		ret = get_term_format(term, &num, &mask);
> +		free(term);
> +		if (ret)
> +			return ret;
> +
> +		config[num] |= FIELD_PREP(mask, val);
> +		token = strtok(NULL, ",");
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +get_event_config(const char *name, uint64_t config[3])
> +{
> +	char path[PATH_MAX], buf[BUFSIZ];
> +	FILE *fp;
> +	int ret;
> +
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu->name, name);
> +	fp = fopen(path, "r");
> +	if (!fp)
> +		return -errno;
> +
> +	ret = fread(buf, 1, sizeof(buf), fp);
> +	if (ret == 0) {
> +		fclose(fp);
> +
> +		return -EINVAL;
> +	}
> +	fclose(fp);
> +	buf[ret] = '\0';
> +
> +	return parse_event(buf, config);
> +}
> +
> +static int
> +do_perf_event_open(uint64_t config[3], int lcore_id, int group_fd)
> +{
> +	struct perf_event_attr attr = {
> +		.size = sizeof(struct perf_event_attr),
> +		.type = PERF_TYPE_RAW,
> +		.exclude_kernel = 1,
> +		.exclude_hv = 1,
> +		.disabled = 1,
> +	};
> +
> +	pmu_arch_fixup_config(config);
> +
> +	attr.config = config[0];
> +	attr.config1 = config[1];
> +	attr.config2 = config[2];
> +
> +	return syscall(SYS_perf_event_open, &attr, rte_gettid(), rte_lcore_to_cpu_id(lcore_id),
> +		       group_fd, 0);
> +}
> +
> +static int
> +open_events(int lcore_id)
> +{
> +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> +	struct rte_pmu_event *event;
> +	uint64_t config[3];
> +	int num = 0, ret;
> +
> +	/* group leader gets created first, with fd = -1 */
> +	group->fds[0] = -1;
> +
> +	TAILQ_FOREACH(event, &rte_pmu->event_list, next) {
> +		ret = get_event_config(event->name, config);
> +		if (ret) {
> +			RTE_LOG(ERR, EAL, "failed to get %s event config\n", event->name);
> +			continue;
> +		}
> +
> +		ret = do_perf_event_open(config, lcore_id, group->fds[0]);
> +		if (ret == -1) {
> +			if (errno == EOPNOTSUPP)
> +				RTE_LOG(ERR, EAL, "64 bit counters not supported\n");
> +
> +			ret = -errno;
> +			goto out;
> +		}
> +
> +		group->fds[event->index] = ret;
> +		num++;
> +	}
> +
> +	return 0;
> +out:
> +	for (--num; num >= 0; num--) {
> +		close(group->fds[num]);
> +		group->fds[num] = -1;
> +	}
> +
> +
> +	return ret;
> +}
> +
> +static int
> +mmap_events(int lcore_id)
> +{
> +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> +	void *addr;
> +	int ret, i;
> +
> +	for (i = 0; i < rte_pmu->num_group_events; i++) {
> +		addr = mmap(0, rte_mem_page_size(), PROT_READ, MAP_SHARED, group->fds[i], 0);
> +		if (addr == MAP_FAILED) {
> +			ret = -errno;
> +			goto out;
> +		}
> +
> +		group->mmap_pages[i] = addr;
> +	}
> +
> +	return 0;
> +out:
> +	for (; i; i--) {
> +		munmap(group->mmap_pages[i - 1], rte_mem_page_size());
> +		group->mmap_pages[i - 1] = NULL;
> +	}
> +
> +	return ret;
> +}
> +
> +static void
> +cleanup_events(int lcore_id)
> +{
> +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> +	int i;
> +
> +	if (!group->fds)
> +		return;
> +
> +	if (group->fds[0] != -1)
> +		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
> +
> +	for (i = 0; i < rte_pmu->num_group_events; i++) {
> +		if (group->mmap_pages[i]) {
> +			munmap(group->mmap_pages[i], rte_mem_page_size());
> +			group->mmap_pages[i] = NULL;
> +		}
> +
> +		if (group->fds[i] != -1) {
> +			close(group->fds[i]);
> +			group->fds[i] = -1;
> +		}
> +	}
> +
> +	free(group->mmap_pages);
> +	free(group->fds);
> +
> +	group->mmap_pages = NULL;
> +	group->fds = NULL;
> +	group->enabled = false;
> +}
> +
> +int __rte_noinline
> +rte_pmu_enable_group(int lcore_id)
> +{
> +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> +	int ret;
> +
> +	if (rte_pmu->num_group_events == 0) {
> +		RTE_LOG(DEBUG, EAL, "no matching PMU events\n");
> +
> +		return 0;
> +	}
> +
> +	group->fds = calloc(rte_pmu->num_group_events, sizeof(*group->fds));
> +	if (!group->fds) {
> +		RTE_LOG(ERR, EAL, "failed to alloc descriptor memory\n");
> +
> +		return -ENOMEM;
> +	}
> +
> +	group->mmap_pages = calloc(rte_pmu->num_group_events, sizeof(*group->mmap_pages));
> +	if (!group->mmap_pages) {
> +		RTE_LOG(ERR, EAL, "failed to alloc userpage memory\n");
> +
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	ret = open_events(lcore_id);
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "failed to open events on lcore-worker-%d\n", lcore_id);
> +		goto out;
> +	}
> +
> +	ret = mmap_events(lcore_id);
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "failed to map events on lcore-worker-%d\n", lcore_id);
> +		goto out;
> +	}
> +
> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
> +		RTE_LOG(ERR, EAL, "failed to enable events on lcore-worker-%d\n", lcore_id);
> +
> +		ret = -errno;
> +		goto out;
> +	}
> +
> +	return 0;
> +
> +out:
> +	cleanup_events(lcore_id);
> +
> +	return ret;
> +}
> +
> +static int
> +scan_pmus(void)
> +{
> +	char path[PATH_MAX];
> +	struct dirent *dent;
> +	const char *name;
> +	DIR *dirp;
> +
> +	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
> +	if (!dirp)
> +		return -errno;
> +
> +	while ((dent = readdir(dirp))) {
> +		name = dent->d_name;
> +		if (name[0] == '.')
> +			continue;
> +
> +		/* sysfs entry should either contain cpus or be a cpu */
> +		if (!strcmp(name, "cpu"))
> +			break;
> +
> +		snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
> +		if (access(path, F_OK) == 0)
> +			break;
> +	}
> +
> +	closedir(dirp);
> +
> +	if (dent) {
> +		rte_pmu->name = strdup(name);
> +		if (!rte_pmu->name)
> +			return -ENOMEM;
> +	}
> +
> +	return rte_pmu->name ? 0 : -ENODEV;
> +}
> +
> +int
> +rte_pmu_add_event(const char *name)
> +{
> +	struct rte_pmu_event *event;
> +	char path[PATH_MAX];
> +
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu->name, name);
> +	if (access(path, R_OK))
> +		return -ENODEV;
> +
> +	TAILQ_FOREACH(event, &rte_pmu->event_list, next) {
> +		if (!strcmp(event->name, name))
> +			return event->index;
> +		continue;
> +	}
> +
> +	event = calloc(1, sizeof(*event));
> +	if (!event)
> +		return -ENOMEM;
> +
> +	event->name = strdup(name);
> +	if (!event->name) {
> +		free(event);
> +
> +		return -ENOMEM;
> +	}
> +
> +	event->index = rte_pmu->num_group_events++;
> +	TAILQ_INSERT_TAIL(&rte_pmu->event_list, event, next);
> +
> +	RTE_LOG(DEBUG, EAL, "%s event added at index %d\n", name, event->index);
> +
> +	return event->index;
> +}
> +
> +void
> +eal_pmu_init(void)
> +{
> +	int ret;
> +
> +	rte_pmu = calloc(1, sizeof(*rte_pmu));
> +	if (!rte_pmu) {
> +		RTE_LOG(ERR, EAL, "failed to alloc PMU\n");
> +
> +		return;
> +	}
> +
> +	TAILQ_INIT(&rte_pmu->event_list);
> +
> +	ret = scan_pmus();
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "failed to find core pmu\n");
> +		goto out;
> +	}
> +
> +	ret = pmu_arch_init();
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "failed to setup arch for PMU\n");
> +		goto out;
> +	}
> +
> +	return;
> +out:
> +	free(rte_pmu->name);
> +	free(rte_pmu);
> +}
> +
> +void
> +eal_pmu_fini(void)
> +{
> +	struct rte_pmu_event *event, *tmp;
> +	int lcore_id;
> +
> +	RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu->event_list, next, tmp) {
> +		TAILQ_REMOVE(&rte_pmu->event_list, event, next);
> +		free(event->name);
> +		free(event);
> +	}
> +
> +	RTE_LCORE_FOREACH_WORKER(lcore_id)
> +		cleanup_events(lcore_id);
> +
> +	pmu_arch_fini();
> +	free(rte_pmu->name);
> +	free(rte_pmu);
> +}
> diff --git a/lib/eal/include/meson.build b/lib/eal/include/meson.build
> index cfcd40aaed..3bf830adee 100644
> --- a/lib/eal/include/meson.build
> +++ b/lib/eal/include/meson.build
> @@ -36,6 +36,7 @@ headers += files(
>          'rte_pci_dev_features.h',
>          'rte_per_lcore.h',
>          'rte_pflock.h',
> +        'rte_pmu.h',
>          'rte_random.h',
>          'rte_reciprocal.h',
>          'rte_seqcount.h',
> diff --git a/lib/eal/include/rte_pmu.h b/lib/eal/include/rte_pmu.h
> new file mode 100644
> index 0000000000..e4b4f6b052
> --- /dev/null
> +++ b/lib/eal/include/rte_pmu.h
> @@ -0,0 +1,204 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Marvell
> + */
> +
> +#ifndef _RTE_PMU_H_
> +#define _RTE_PMU_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <rte_common.h>
> +#include <rte_compat.h>
> +
> +#ifdef RTE_EXEC_ENV_LINUX
> +
> +#include <linux/perf_event.h>
> +
> +#include <rte_atomic.h>
> +#include <rte_branch_prediction.h>
> +#include <rte_lcore.h>
> +#include <rte_log.h>
> +
> +/**
> + * @file
> + *
> + * PMU event tracing operations
> + *
> + * This file defines generic API and types necessary to setup PMU and
> + * read selected counters in runtime.
> + */
> +
> +/**
> + * A structure describing a group of events.
> + */
> +struct rte_pmu_event_group {
> +	int *fds; /**< array of event descriptors */
> +	void **mmap_pages; /**< array of pointers to mmapped perf_event_attr structures */

There seems to be a lot of indirection involved here. Why are these arrays not statically sized, instead of dynamically allocated?

Also, what is the reason for hiding the type, i.e. keeping struct perf_event_mmap_page **mmap_pages opaque by using void **mmap_pages instead?
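
For reference, the non-opaque declaration being suggested would simply be:

    struct perf_event_mmap_page **mmap_pages; /**< per-event mmapped user pages */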

> +	bool enabled; /**< true if group was enabled on particular lcore */
> +};
> +
> +/**
> + * A structure describing an event.
> + */
> +struct rte_pmu_event {
> +	char *name; /** name of an event */
> +	int index; /** event index into fds/mmap_pages */
> +	TAILQ_ENTRY(rte_pmu_event) next; /** list entry */
> +};
> +
> +/**
> + * A PMU state container.
> + */
> +struct rte_pmu {
> +	char *name; /** name of core PMU listed under /sys/bus/event_source/devices */
> +	struct rte_pmu_event_group group[RTE_MAX_LCORE]; /**< per lcore event group data */
> +	int num_group_events; /**< number of events in a group */
> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
> +};
> +
> +/** Pointer to the PMU state container */
> +extern struct rte_pmu *rte_pmu;

Again, why not just extern struct rte_pmu, instead of dynamic allocation?

> +
> +/** Each architecture supporting PMU needs to provide its own version */
> +#ifndef rte_pmu_pmc_read
> +#define rte_pmu_pmc_read(index) ({ 0; })
> +#endif
> +
> +/**
> + * @internal
> + *
> + * Read PMU counter.
> + *
> + * @param pc
> + *   Pointer to the mmapped user page.
> + * @return
> + *   Counter value read from hardware.
> + */
> +__rte_internal
> +static __rte_always_inline uint64_t
> +rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
> +{
> +	uint64_t offset, width, pmc = 0;
> +	uint32_t seq, index;
> +	int tries = 100;
> +
> +	for (;;) {
> +		seq = pc->lock;
> +		rte_compiler_barrier();
> +		index = pc->index;
> +		offset = pc->offset;
> +		width = pc->pmc_width;
> +
> +		if (likely(pc->cap_user_rdpmc && index)) {
> +			pmc = rte_pmu_pmc_read(index - 1);
> +			pmc <<= 64 - width;
> +			pmc >>= 64 - width;
> +		}
> +
> +		rte_compiler_barrier();
> +
> +		if (likely(pc->lock == seq))
> +			return pmc + offset;
> +
> +		if (--tries == 0) {
> +			RTE_LOG(DEBUG, EAL, "failed to get perf_event_mmap_page lock\n");
> +			break;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * @internal
> + *
> + * Enable group of events for a given lcore.
> + *
> + * @param lcore_id
> + *   The identifier of the lcore.
> + * @return
> + *   0 in case of success, negative value otherwise.
> + */
> +__rte_internal
> +int
> +rte_pmu_enable_group(int lcore_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Add event to the group of enabled events.
> + *
> + * @param name
> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
> + * @return
> + *   Event index in case of success, negative value otherwise.
> + */
> +__rte_experimental
> +int
> +rte_pmu_add_event(const char *name);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Read hardware counter configured to count occurrences of an event.
> + *
> + * @param index
> + *   Index of an event to be read.
> + * @return
> + *   Event value read from register. In case of errors or lack of support
> + *   0 is returned. In other words, stream of zeros in a trace file
> + *   indicates problem with reading particular PMU event register.
> + */
> +__rte_experimental
> +static __rte_always_inline uint64_t
> +rte_pmu_read(int index)
> +{
> +	int lcore_id = rte_lcore_id();
> +	struct rte_pmu_event_group *group;
> +	int ret;
> +
> +	if (!rte_pmu)
> +		return 0;
> +
> +	group = &rte_pmu->group[lcore_id];
> +	if (!group->enabled) {
> +		ret = rte_pmu_enable_group(lcore_id);
> +		if (ret)
> +			return 0;
> +
> +		group->enabled = true;
> +	}

Why is the group not enabled in the setup function, rte_pmu_add_event(), instead of here, in the hot path?

> +
> +	if (index < 0 || index >= rte_pmu->num_group_events)
> +		return 0;
> +
> +	return rte_pmu_read_userpage((struct perf_event_mmap_page *)group->mmap_pages[index]);

Using fixed size arrays instead of multiple indirections via pointers is faster. It could be:

return rte_pmu_read_userpage((struct perf_event_mmap_page *)rte_pmu.group[lcore_id].mmap_pages[index]);

With or without the suggested performance improvements...

Series-acked-by: Morten Brørup <mb@smartsharesystems.com>
  
Tomasz Duszynski Dec. 14, 2022, 9:38 a.m. UTC | #2
Hello Morten, 

Thanks for the review. Answers inline.

[...]

> > +/**
> > + * @file
> > + *
> > + * PMU event tracing operations
> > + *
> > + * This file defines generic API and types necessary to setup PMU and
> > + * read selected counters in runtime.
> > + */
> > +
> > +/**
> > + * A structure describing a group of events.
> > + */
> > +struct rte_pmu_event_group {
> > +	int *fds; /**< array of event descriptors */
> > +	void **mmap_pages; /**< array of pointers to mmapped
> > perf_event_attr structures */
> 
> There seems to be a lot of indirection involved here. Why are these arrays not statically sized,
> instead of dynamically allocated?
> 

Different architectures/PMUs impose limits on the number of simultaneously enabled counters. So in order
to relieve the pain of thinking about it and adding macros for each and every arch, I decided to allocate
the number the user wants dynamically. Also, the assumption holds that the user knows about the tradeoffs
of using too many counters and hence will not enable too many events at once.

> Also, what is the reason for hiding the type struct perf_event_mmap_page **mmap_pages opaque by
> using void **mmap_pages instead?

I think that part doing mmap/munmap was written first, hence void ** was chosen in the first place.

> 
> > +	bool enabled; /**< true if group was enabled on particular lcore
> > */
> > +};
> > +
> > +/**
> > + * A structure describing an event.
> > + */
> > +struct rte_pmu_event {
> > +	char *name; /** name of an event */
> > +	int index; /** event index into fds/mmap_pages */
> > +	TAILQ_ENTRY(rte_pmu_event) next; /** list entry */
> > +};
> > +
> > +/**
> > + * A PMU state container.
> > + */
> > +struct rte_pmu {
> > +	char *name; /** name of core PMU listed under
> > /sys/bus/event_source/devices */
> > +	struct rte_pmu_event_group group[RTE_MAX_LCORE]; /**< per lcore
> > event group data */
> > +	int num_group_events; /**< number of events in a group */
> > +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching
> > events */
> > +};
> > +
> > +/** Pointer to the PMU state container */
> > +extern struct rte_pmu *rte_pmu;
> 
> Again, why not just extern struct rte_pmu, instead of dynamic allocation?
> 

No strong opinions here since this is a matter of personal preference. Can be removed
in the next version. 
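
For reference, dropping the indirection would mean declaring the container
directly, along these lines (a sketch):

    /* rte_pmu.h */
    extern struct rte_pmu rte_pmu;

    /* rte_pmu.c */
    struct rte_pmu rte_pmu;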

> > +
> > +/** Each architecture supporting PMU needs to provide its own version
> > */
> > +#ifndef rte_pmu_pmc_read
> > +#define rte_pmu_pmc_read(index) ({ 0; })
> > +#endif
> > +
> > +/**
> > + * @internal
> > + *
> > + * Read PMU counter.
> > + *
> > + * @param pc
> > + *   Pointer to the mmapped user page.
> > + * @return
> > + *   Counter value read from hardware.
> > + */
> > +__rte_internal
> > +static __rte_always_inline uint64_t
> > +rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
> > +{
> > +	uint64_t offset, width, pmc = 0;
> > +	uint32_t seq, index;
> > +	int tries = 100;
> > +
> > +	for (;;) {
> > +		seq = pc->lock;
> > +		rte_compiler_barrier();
> > +		index = pc->index;
> > +		offset = pc->offset;
> > +		width = pc->pmc_width;
> > +
> > +		if (likely(pc->cap_user_rdpmc && index)) {
> > +			pmc = rte_pmu_pmc_read(index - 1);
> > +			pmc <<= 64 - width;
> > +			pmc >>= 64 - width;
> > +		}
> > +
> > +		rte_compiler_barrier();
> > +
> > +		if (likely(pc->lock == seq))
> > +			return pmc + offset;
> > +
> > +		if (--tries == 0) {
> > +			RTE_LOG(DEBUG, EAL, "failed to get
> > perf_event_mmap_page lock\n");
> > +			break;
> > +		}
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * @internal
> > + *
> > + * Enable group of events for a given lcore.
> > + *
> > + * @param lcore_id
> > + *   The identifier of the lcore.
> > + * @return
> > + *   0 in case of success, negative value otherwise.
> > + */
> > +__rte_internal
> > +int
> > +rte_pmu_enable_group(int lcore_id);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * Add event to the group of enabled events.
> > + *
> > + * @param name
> > + *   Name of an event listed under
> > /sys/bus/event_source/devices/pmu/events.
> > + * @return
> > + *   Event index in case of success, negative value otherwise.
> > + */
> > +__rte_experimental
> > +int
> > +rte_pmu_add_event(const char *name);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * Read hardware counter configured to count occurrences of an event.
> > + *
> > + * @param index
> > + *   Index of an event to be read.
> > + * @return
> > + *   Event value read from register. In case of errors or lack of
> > support
> > + *   0 is returned. In other words, stream of zeros in a trace file
> > + *   indicates problem with reading particular PMU event register.
> > + */
> > +__rte_experimental
> > +static __rte_always_inline uint64_t
> > +rte_pmu_read(int index)
> > +{
> > +	int lcore_id = rte_lcore_id();
> > +	struct rte_pmu_event_group *group;
> > +	int ret;
> > +
> > +	if (!rte_pmu)
> > +		return 0;
> > +
> > +	group = &rte_pmu->group[lcore_id];
> > +	if (!group->enabled) {
> > +		ret = rte_pmu_enable_group(lcore_id);
> > +		if (ret)
> > +			return 0;
> > +
> > +		group->enabled = true;
> > +	}
> 
> Why is the group not enabled in the setup function, rte_pmu_add_event(), instead of here, in the
> hot path?
> 

When this is executed for the very first time the CPU will obviously have more work to do,
but afterwards the setup path is not taken, hence far fewer CPU cycles are required.

Setup is executed solely by the main lcore, before worker lcores start running, hence some info passed to
the SYS_perf_event_open syscall is missing, the pid (via rte_gettid()) being an example here.
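
For context, the constraint comes from the syscall itself; glibc provides no
perf_event_open() wrapper, hence the raw syscall() in the patch. A sketch of
the call being discussed:

    /* pid must identify the thread being measured, so the worker's own
     * tid (rte_gettid()) is required; it is not known on the main lcore
     * at setup time */
    ret = syscall(SYS_perf_event_open, &attr, rte_gettid(),
                  rte_lcore_to_cpu_id(lcore_id), group_fd, 0);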

> > +
> > +	if (index < 0 || index >= rte_pmu->num_group_events)
> > +		return 0;
> > +
> > +	return rte_pmu_read_userpage((struct perf_event_mmap_page
> > *)group->mmap_pages[index]);
> 
> Using fixed size arrays instead of multiple indirections via pointers is faster. It could be:
> 
> return rte_pmu_read_userpage((struct perf_event_mmap_page
> *)rte_pmu.group[lcore_id].mmap_pages[index]);
> 
> > With or without the suggested performance improvements...
> 
> Series-acked-by: Morten Brørup <mb@smartsharesystems.com>
  
Morten Brørup Dec. 14, 2022, 10:41 a.m. UTC | #3
+CC: Mattias, see my comment below about per-thread constructor for this

> From: Tomasz Duszynski [mailto:tduszynski@marvell.com]
> Sent: Wednesday, 14 December 2022 10.39
> 
> Hello Morten,
> 
> Thanks for review. Answers inline.
> 
> [...]
> 
> > > +/**
> > > + * @file
> > > + *
> > > + * PMU event tracing operations
> > > + *
> > > + * This file defines generic API and types necessary to setup PMU
> and
> > > + * read selected counters in runtime.
> > > + */
> > > +
> > > +/**
> > > + * A structure describing a group of events.
> > > + */
> > > +struct rte_pmu_event_group {
> > > +	int *fds; /**< array of event descriptors */
> > > +	void **mmap_pages; /**< array of pointers to mmapped
> > > perf_event_attr structures */
> >
> > There seems to be a lot of indirection involved here. Why are these
> arrays not statically sized,
> > instead of dynamically allocated?
> >
> 
> Different architectures/pmus impose limits on number of simultaneously
> enabled counters. So in order
> relief the pain of thinking about it and adding macros for each and
> every arch I decided to allocate
> the number user wants dynamically. Also assumption holds that user
> knows about tradeoffs of using
> too many counters hence will not enable too many events at once.

The DPDK convention is to use fixed size arrays (with a maximum size, e.g. RTE_MAX_ETHPORTS) in the fast path, for performance reasons.

Please use fixed size arrays instead of dynamically allocated arrays.
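
For illustration, a fixed-size layout along those lines could look as follows
(RTE_MAX_NUM_GROUP_EVENTS is a hypothetical cap, not an existing DPDK macro):

    /* hypothetical upper bound on simultaneously enabled events */
    #define RTE_MAX_NUM_GROUP_EVENTS 8

    struct rte_pmu_event_group {
            struct perf_event_mmap_page *mmap_pages[RTE_MAX_NUM_GROUP_EVENTS];
            int fds[RTE_MAX_NUM_GROUP_EVENTS];
            bool enabled; /* true if group was enabled on this lcore */
    };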

> 
> > Also, what is the reason for hiding the type struct
> perf_event_mmap_page **mmap_pages opaque by
> > using void **mmap_pages instead?
> 
> I think, that part doing mmap/munmap was written first hence void **
> was chosen in the first place.

Please update it, so the actual type is reflected here.

> 
> >
> > > +	bool enabled; /**< true if group was enabled on particular lcore
> > > */
> > > +};
> > > +
> > > +/**
> > > + * A structure describing an event.
> > > + */
> > > +struct rte_pmu_event {
> > > +	char *name; /** name of an event */
> > > +	int index; /** event index into fds/mmap_pages */
> > > +	TAILQ_ENTRY(rte_pmu_event) next; /** list entry */
> > > +};
> > > +
> > > +/**
> > > + * A PMU state container.
> > > + */
> > > +struct rte_pmu {
> > > +	char *name; /** name of core PMU listed under
> > > /sys/bus/event_source/devices */
> > > +	struct rte_pmu_event_group group[RTE_MAX_LCORE]; /**< per lcore
> > > event group data */
> > > +	int num_group_events; /**< number of events in a group */
> > > +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching
> > > events */

The event_list is used in slow path only, so it can remain a list - i.e. no change requested here. :-)

> > > +};
> > > +
> > > +/** Pointer to the PMU state container */
> > > +extern struct rte_pmu *rte_pmu;
> >
> > Again, why not just extern struct rte_pmu, instead of dynamic
> allocation?
> >
> 
> No strong opinions here since this is a matter of personal preference.
> Can be removed
> in the next version.

Yes, please.

> 
> > > +
> > > +/** Each architecture supporting PMU needs to provide its own
> version
> > > */
> > > +#ifndef rte_pmu_pmc_read
> > > +#define rte_pmu_pmc_read(index) ({ 0; })
> > > +#endif
> > > +
> > > +/**
> > > + * @internal
> > > + *
> > > + * Read PMU counter.
> > > + *
> > > + * @param pc
> > > + *   Pointer to the mmapped user page.
> > > + * @return
> > > + *   Counter value read from hardware.
> > > + */
> > > +__rte_internal
> > > +static __rte_always_inline uint64_t
> > > +rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
> > > +{
> > > +	uint64_t offset, width, pmc = 0;
> > > +	uint32_t seq, index;
> > > +	int tries = 100;
> > > +
> > > +	for (;;) {

As a matter of personal preference, I would write this loop differently:

+ for (tries = 100; tries != 0; tries--) {

> > > +		seq = pc->lock;
> > > +		rte_compiler_barrier();
> > > +		index = pc->index;
> > > +		offset = pc->offset;
> > > +		width = pc->pmc_width;
> > > +
> > > +		if (likely(pc->cap_user_rdpmc && index)) {

Why "&& index"? The way I read [man perf_event_open], index 0 is perfectly valid.

[man perf_event_open]: https://man7.org/linux/man-pages/man2/perf_event_open.2.html

> > > +			pmc = rte_pmu_pmc_read(index - 1);
> > > +			pmc <<= 64 - width;
> > > +			pmc >>= 64 - width;
> > > +		}
> > > +
> > > +		rte_compiler_barrier();
> > > +
> > > +		if (likely(pc->lock == seq))
> > > +			return pmc + offset;
> > > +
> > > +		if (--tries == 0) {
> > > +			RTE_LOG(DEBUG, EAL, "failed to get
> > > perf_event_mmap_page lock\n");
> > > +			break;
> > > +		}

- Remove the 4 above lines of code, and move the debug log message to the end of the function instead.

> > > +	}

+ RTE_LOG(DEBUG, EAL, "failed to get perf_event_mmap_page lock\n");

> > > +
> > > +	return 0;
> > > +}
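
Putting the two suggestions together (bounded for-loop, single log statement
after the loop), the function might be reshaped roughly as follows; this is a
sketch of the proposal, not code from the series:

    static __rte_always_inline uint64_t
    rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
    {
            uint64_t offset, width, pmc = 0;
            uint32_t seq, index;
            int tries;

            for (tries = 100; tries != 0; tries--) {
                    seq = pc->lock;
                    rte_compiler_barrier();
                    index = pc->index;
                    offset = pc->offset;
                    width = pc->pmc_width;

                    if (likely(pc->cap_user_rdpmc && index)) {
                            /* trim the raw counter to pmc_width bits */
                            pmc = rte_pmu_pmc_read(index - 1);
                            pmc <<= 64 - width;
                            pmc >>= 64 - width;
                    }

                    rte_compiler_barrier();

                    /* seqlock check: kernel did not update the page */
                    if (likely(pc->lock == seq))
                            return pmc + offset;
            }

            RTE_LOG(DEBUG, EAL, "failed to get perf_event_mmap_page lock\n");

            return 0;
    }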
> > > +
> > > +/**
> > > + * @internal
> > > + *
> > > + * Enable group of events for a given lcore.
> > > + *
> > > + * @param lcore_id
> > > + *   The identifier of the lcore.
> > > + * @return
> > > + *   0 in case of success, negative value otherwise.
> > > + */
> > > +__rte_internal
> > > +int
> > > +rte_pmu_enable_group(int lcore_id);
> > > +
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > + *
> > > + * Add event to the group of enabled events.
> > > + *
> > > + * @param name
> > > + *   Name of an event listed under
> > > /sys/bus/event_source/devices/pmu/events.
> > > + * @return
> > > + *   Event index in case of success, negative value otherwise.
> > > + */
> > > +__rte_experimental
> > > +int
> > > +rte_pmu_add_event(const char *name);
> > > +
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > + *
> > > + * Read hardware counter configured to count occurrences of an
> event.
> > > + *
> > > + * @param index
> > > + *   Index of an event to be read.
> > > + * @return
> > > + *   Event value read from register. In case of errors or lack of
> > > support
> > > + *   0 is returned. In other words, stream of zeros in a trace
> file
> > > + *   indicates problem with reading particular PMU event register.
> > > + */
> > > +__rte_experimental
> > > +static __rte_always_inline uint64_t
> > > +rte_pmu_read(int index)

The index type can be changed from int to uint32_t. This also eliminates the "(index < 0" part of the comparison further below in this function.

> > > +{
> > > +	int lcore_id = rte_lcore_id();
> > > +	struct rte_pmu_event_group *group;
> > > +	int ret;
> > > +
> > > +	if (!rte_pmu)
> > > +		return 0;
> > > +
> > > +	group = &rte_pmu->group[lcore_id];
> > > +	if (!group->enabled) {

Optimized: if (unlikely(!group->enabled)) {

> > > +		ret = rte_pmu_enable_group(lcore_id);
> > > +		if (ret)
> > > +			return 0;
> > > +
> > > +		group->enabled = true;
> > > +	}
> >
> > Why is the group not enabled in the setup function,
> rte_pmu_add_event(), instead of here, in the
> > hot path?
> >
> 
> When this is executed for the very first time then cpu will have
> obviously more work to do
> but afterwards setup path is not taken hence much less cpu cycles are
> required.
> 
> Setup is executed by main lcore solely, before lcores are executed
> hence some info passed to
> SYS_perf_event_open ioctl() is missing, pid (via rte_gettid()) being an
> example here.

OK. Thank you for the explanation. Since impossible at setup, it has to be done at runtime.

@Mattias: Another good example of something that would belong in per-thread constructors, as my suggested feature creep in [1].

[1]: http://inbox.dpdk.org/dev/98CBD80474FA8B44BF855DF32C47DC35D87553@smartserver.smartshare.dk/

> 
> > > +
> > > +	if (index < 0 || index >= rte_pmu->num_group_events)

Optimized: if (unlikely(index >= rte_pmu.num_group_events))

> > > +		return 0;
> > > +
> > > +	return rte_pmu_read_userpage((struct perf_event_mmap_page
> > > *)group->mmap_pages[index]);
> >
> > Using fixed size arrays instead of multiple indirections via pointers
> is faster. It could be:
> >
> > return rte_pmu_read_userpage((struct perf_event_mmap_page
> > *)rte_pmu.group[lcore_id].mmap_pages[index]);
> >
> > > With or without the suggested performance improvements...
> >
> > Series-acked-by: Morten Brørup <mb@smartsharesystems.com>
>
  
Morten Brørup Dec. 15, 2022, 8:22 a.m. UTC | #4
> From: Morten Brørup [mailto:mb@smartsharesystems.com]
> Sent: Wednesday, 14 December 2022 11.41
> 
> +CC: Mattias, see my comment below about per-thread constructor for
> this
> 
> > From: Tomasz Duszynski [mailto:tduszynski@marvell.com]
> > Sent: Wednesday, 14 December 2022 10.39
> >
> > Hello Morten,
> >
> > Thanks for review. Answers inline.
> >
> > [...]
> >
> > > > +__rte_experimental
> > > > +static __rte_always_inline uint64_t
> > > > +rte_pmu_read(int index)
> 
> The index type can be changed from int to uint32_t. This also
> eliminates the "(index < 0" part of the comparison further below in
> this function.
> 
> > > > +{
> > > > +	int lcore_id = rte_lcore_id();
> > > > +	struct rte_pmu_event_group *group;
> > > > +	int ret;
> > > > +
> > > > +	if (!rte_pmu)
> > > > +		return 0;
> > > > +
> > > > +	group = &rte_pmu->group[lcore_id];
> > > > +	if (!group->enabled) {
> 
> Optimized: if (unlikely(!group->enabled)) {
> 
> > > > +		ret = rte_pmu_enable_group(lcore_id);
> > > > +		if (ret)
> > > > +			return 0;
> > > > +
> > > > +		group->enabled = true;
> > > > +	}
> > >
> > > Why is the group not enabled in the setup function,
> > rte_pmu_add_event(), instead of here, in the
> > > hot path?
> > >
> >
> > When this is executed for the very first time then cpu will have
> > obviously more work to do
> > but afterwards setup path is not taken hence much less cpu cycles are
> > required.
> >
> > Setup is executed by main lcore solely, before lcores are executed
> > hence some info passed to
> > SYS_perf_event_open ioctl() is missing, pid (via rte_gettid()) being
> an
> > example here.
> 
> OK. Thank you for the explanation. Since impossible at setup, it has to
> be done at runtime.
> 
> @Mattias: Another good example of something that would belong in per-
> thread constructors, as my suggested feature creep in [1].
> 
> [1]:
> http://inbox.dpdk.org/dev/98CBD80474FA8B44BF855DF32C47DC35D87553@smartserver.smartshare.dk/

I just realized that this initialization is per-lcore (not per thread), so you can use rte_lcore_callback_register() to register a per-lcore initialization function, and move rte_pmu_enable_group(lcore_id) there.

-Morten
  
Mattias Rönnblom Dec. 15, 2022, 8:46 a.m. UTC | #5
On 2022-12-13 11:43, Tomasz Duszynski wrote:
> Add support for programming PMU counters and reading their values
> in runtime bypassing kernel completely.
> 
> This is especially useful in cases where CPU cores are isolated
> (nohz_full) i.e run dedicated tasks. In such cases one cannot use
> standard perf utility without sacrificing latency and performance.
> 
> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
> ---
>   app/test/meson.build                  |   1 +
>   app/test/test_pmu.c                   |  41 +++
>   doc/guides/prog_guide/profile_app.rst |   8 +
>   lib/eal/common/meson.build            |   3 +
>   lib/eal/common/pmu_private.h          |  41 +++
>   lib/eal/common/rte_pmu.c              | 456 ++++++++++++++++++++++++++
>   lib/eal/include/meson.build           |   1 +
>   lib/eal/include/rte_pmu.h             | 204 ++++++++++++
>   lib/eal/linux/eal.c                   |   4 +
>   lib/eal/version.map                   |   6 +
>   10 files changed, 765 insertions(+)
>   create mode 100644 app/test/test_pmu.c
>   create mode 100644 lib/eal/common/pmu_private.h
>   create mode 100644 lib/eal/common/rte_pmu.c
>   create mode 100644 lib/eal/include/rte_pmu.h
> 
> diff --git a/app/test/meson.build b/app/test/meson.build
> index f34d19e3c3..93b3300309 100644
> --- a/app/test/meson.build
> +++ b/app/test/meson.build
> @@ -143,6 +143,7 @@ test_sources = files(
>           'test_timer_racecond.c',
>           'test_timer_secondary.c',
>           'test_ticketlock.c',
> +        'test_pmu.c',
>           'test_trace.c',
>           'test_trace_register.c',
>           'test_trace_perf.c',
> diff --git a/app/test/test_pmu.c b/app/test/test_pmu.c
> new file mode 100644
> index 0000000000..fd331af9ee
> --- /dev/null
> +++ b/app/test/test_pmu.c
> @@ -0,0 +1,41 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2022 Marvell International Ltd.
> + */
> +
> +#include <rte_pmu.h>
> +
> +#include "test.h"
> +
> +static int
> +test_pmu_read(void)
> +{
> +	uint64_t val = 0;
> +	int tries = 10;
> +	int event = -1;
> +
> +	while (tries--)
> +		val += rte_pmu_read(event);
> +
> +	if (val == 0)
> +		return TEST_FAILED;
> +
> +	return TEST_SUCCESS;
> +}
> +
> +static struct unit_test_suite pmu_tests = {
> +	.suite_name = "pmu autotest",
> +	.setup = NULL,
> +	.teardown = NULL,
> +	.unit_test_cases = {
> +		TEST_CASE(test_pmu_read),
> +		TEST_CASES_END()
> +	}
> +};
> +
> +static int
> +test_pmu(void)
> +{
> +	return unit_test_suite_runner(&pmu_tests);
> +}
> +
> +REGISTER_TEST_COMMAND(pmu_autotest, test_pmu);
> diff --git a/doc/guides/prog_guide/profile_app.rst b/doc/guides/prog_guide/profile_app.rst
> index 14292d4c25..a8b501fe0c 100644
> --- a/doc/guides/prog_guide/profile_app.rst
> +++ b/doc/guides/prog_guide/profile_app.rst
> @@ -7,6 +7,14 @@ Profile Your Application
>   The following sections describe methods of profiling DPDK applications on
>   different architectures.
>   
> +Performance counter based profiling
> +-----------------------------------
> +
> +The majority of architectures support some sort of hardware measurement unit which provides a set
> +of programmable counters that monitor specific events. There are different tools which can gather
> +that information, perf being an example here. Though in some scenarios, e.g. when CPU cores are
> +isolated (nohz_full) and run dedicated tasks, using perf is less than ideal. In such cases one can
> +read specific events directly from the application via ``rte_pmu_read()``.
>   
>   Profiling on x86
>   ----------------
> diff --git a/lib/eal/common/meson.build b/lib/eal/common/meson.build
> index 917758cc65..d6d05b56f3 100644
> --- a/lib/eal/common/meson.build
> +++ b/lib/eal/common/meson.build
> @@ -38,6 +38,9 @@ sources += files(
>           'rte_service.c',
>           'rte_version.c',
>   )
> +if is_linux
> +    sources += files('rte_pmu.c')
> +endif
>   if is_linux or is_windows
>       sources += files('eal_common_dynmem.c')
>   endif
> diff --git a/lib/eal/common/pmu_private.h b/lib/eal/common/pmu_private.h
> new file mode 100644
> index 0000000000..cade4245e6
> --- /dev/null
> +++ b/lib/eal/common/pmu_private.h
> @@ -0,0 +1,41 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Marvell
> + */
> +
> +#ifndef _PMU_PRIVATE_H_
> +#define _PMU_PRIVATE_H_
> +
> +/**
> + * Architecture specific PMU init callback.
> + *
> + * @return
> + *   0 in case of success, negative value otherwise.
> + */
> +int
> +pmu_arch_init(void);
> +
> +/**
> + * Architecture specific PMU cleanup callback.
> + */
> +void
> +pmu_arch_fini(void);
> +
> +/**
> + * Apply architecture specific settings to config before passing it to syscall.
> + */
> +void
> +pmu_arch_fixup_config(uint64_t config[3]);
> +
> +/**
> + * Initialize PMU tracing internals.
> + */
> +void
> +eal_pmu_init(void);
> +
> +/**
> + * Cleanup PMU internals.
> + */
> +void
> +eal_pmu_fini(void);
> +
> +#endif /* _PMU_PRIVATE_H_ */
> diff --git a/lib/eal/common/rte_pmu.c b/lib/eal/common/rte_pmu.c
> new file mode 100644
> index 0000000000..049fe19fe3
> --- /dev/null
> +++ b/lib/eal/common/rte_pmu.c
> @@ -0,0 +1,456 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2022 Marvell International Ltd.
> + */
> +
> +#include <ctype.h>
> +#include <dirent.h>
> +#include <errno.h>
> +#include <regex.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <sys/queue.h>
> +#include <sys/syscall.h>
> +#include <unistd.h>
> +
> +#include <rte_eal_paging.h>
> +#include <rte_pmu.h>
> +#include <rte_tailq.h>
> +
> +#include "pmu_private.h"
> +
> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
> +
> +#ifndef GENMASK_ULL
> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> ((64 - 1 - (h)))))
> +#endif
> +
> +#ifndef FIELD_PREP
> +#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) & (m))
> +#endif
> +
> +struct rte_pmu *rte_pmu;
> +
> +/*
> + * Following __rte_weak functions provide default no-op. Architectures should override them if
> + * necessary.
> + */
> +
> +int
> +__rte_weak pmu_arch_init(void)
> +{
> +	return 0;
> +}
> +
> +void
> +__rte_weak pmu_arch_fini(void)
> +{
> +}
> +
> +void
> +__rte_weak pmu_arch_fixup_config(uint64_t config[3])
> +{
> +	RTE_SET_USED(config);
> +}
> +
> +static int
> +get_term_format(const char *name, int *num, uint64_t *mask)
> +{
> +	char *config = NULL;
> +	char path[PATH_MAX];
> +	int high, low, ret;
> +	FILE *fp;
> +
> +	/* quiesce -Wmaybe-uninitialized warning */
> +	*num = 0;
> +	*mask = 0;
> +
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu->name, name);

This code might crash in case a long name is supplied, which is maybe 
not what you want. A truncate and a "file not found" is probably better. 
I believe there is a snprintf lookalike with these properties in DPDK.
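
snprintf() truncates rather than overflows, so one way to turn an over-long
name into an explicit error is to check its return value, along these lines:

    ret = snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH
                   "/%s/format/%s", rte_pmu->name, name);
    if (ret < 0 || (size_t)ret >= sizeof(path))
            return -ENAMETOOLONG;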

> +	fp = fopen(path, "r");
> +	if (!fp)
> +		return -errno;
> +
> +	errno = 0;
> +	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
> +	if (ret < 2) {
> +		ret = -ENODATA;
> +		goto out;
> +	}
> +	if (errno) {
> +		ret = -errno;
> +		goto out;
> +	}
> +
> +	if (ret == 2)
> +		high = low;
> +
> +	*mask = GENMASK_ULL(high, low);
> +	/* Last digit should be [012]. If last digit is missing 0 is implied. */
> +	*num = config[strlen(config) - 1];
> +	*num = isdigit(*num) ? *num - '0' : 0;
> +
> +	ret = 0;
> +out:
> +	free(config);
> +	fclose(fp);
> +
> +	return ret;
> +}
> +
> +static int
> +parse_event(char *buf, uint64_t config[3])
> +{
> +	char *token, *term;
> +	int num, ret, val;
> +	uint64_t mask;
> +
> +	config[0] = config[1] = config[2] = 0;
> +
> +	token = strtok(buf, ",");
> +	while (token) {
> +		errno = 0;
> +		/* <term>=<value> */
> +		ret = sscanf(token, "%m[^=]=%i", &term, &val);
> +		if (ret < 1)
> +			return -ENODATA;
> +		if (errno)
> +			return -errno;
> +		if (ret == 1)
> +			val = 1;
> +
> +		ret = get_term_format(term, &num, &mask);
> +		free(term);
> +		if (ret)
> +			return ret;
> +
> +		config[num] |= FIELD_PREP(mask, val);
> +		token = strtok(NULL, ",");
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +get_event_config(const char *name, uint64_t config[3])
> +{
> +	char path[PATH_MAX], buf[BUFSIZ];
> +	FILE *fp;
> +	int ret;
> +
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu->name, name);
> +	fp = fopen(path, "r");
> +	if (!fp)
> +		return -errno;
> +
> +	ret = fread(buf, 1, sizeof(buf), fp);
> +	if (ret == 0) {
> +		fclose(fp);
> +
> +		return -EINVAL;
> +	}
> +	fclose(fp);
> +	buf[ret] = '\0';
> +
> +	return parse_event(buf, config);
> +}
> +
> +static int
> +do_perf_event_open(uint64_t config[3], int lcore_id, int group_fd)
> +{
> +	struct perf_event_attr attr = {
> +		.size = sizeof(struct perf_event_attr),
> +		.type = PERF_TYPE_RAW,
> +		.exclude_kernel = 1,
> +		.exclude_hv = 1,
> +		.disabled = 1,
> +	};
> +
> +	pmu_arch_fixup_config(config);
> +
> +	attr.config = config[0];
> +	attr.config1 = config[1];
> +	attr.config2 = config[2];
> +
> +	return syscall(SYS_perf_event_open, &attr, rte_gettid(), rte_lcore_to_cpu_id(lcore_id),
> +		       group_fd, 0);
> +}
> +
> +static int
> +open_events(int lcore_id)
> +{
> +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> +	struct rte_pmu_event *event;
> +	uint64_t config[3];
> +	int num = 0, ret;
> +
> +	/* group leader gets created first, with fd = -1 */
> +	group->fds[0] = -1;
> +
> +	TAILQ_FOREACH(event, &rte_pmu->event_list, next) {
> +		ret = get_event_config(event->name, config);
> +		if (ret) {
> +			RTE_LOG(ERR, EAL, "failed to get %s event config\n", event->name);
> +			continue;
> +		}
> +
> +		ret = do_perf_event_open(config, lcore_id, group->fds[0]);
> +		if (ret == -1) {
> +			if (errno == EOPNOTSUPP)
> +				RTE_LOG(ERR, EAL, "64 bit counters not supported\n");
> +
> +			ret = -errno;
> +			goto out;
> +		}
> +
> +		group->fds[event->index] = ret;
> +		num++;
> +	}
> +
> +	return 0;
> +out:
> +	for (--num; num >= 0; num--) {
> +		close(group->fds[num]);
> +		group->fds[num] = -1;
> +	}
> +
> +
> +	return ret;
> +}
> +
> +static int
> +mmap_events(int lcore_id)
> +{
> +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> +	void *addr;
> +	int ret, i;
> +
> +	for (i = 0; i < rte_pmu->num_group_events; i++) {
> +		addr = mmap(0, rte_mem_page_size(), PROT_READ, MAP_SHARED, group->fds[i], 0);
> +		if (addr == MAP_FAILED) {
> +			ret = -errno;
> +			goto out;
> +		}
> +
> +		group->mmap_pages[i] = addr;
> +	}
> +
> +	return 0;
> +out:
> +	for (; i; i--) {
> +		munmap(group->mmap_pages[i - 1], rte_mem_page_size());
> +		group->mmap_pages[i - 1] = NULL;
> +	}
> +
> +	return ret;
> +}
> +
> +static void
> +cleanup_events(int lcore_id)

lcore_id is an unsigned int.

> +{
> +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> +	int i;
> +
> +	if (!group->fds)
> +		return;

group->fds == NULL

This coding style violation appears throughout the patch set.
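
That is, the explicit comparison the DPDK style guide asks for:

    if (group->fds == NULL)
            return;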

> +
> +	if (group->fds[0] != -1)
> +		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
> +
> +	for (i = 0; i < rte_pmu->num_group_events; i++) {
> +		if (group->mmap_pages[i]) {
> +			munmap(group->mmap_pages[i], rte_mem_page_size());
> +			group->mmap_pages[i] = NULL;
> +		}
> +
> +		if (group->fds[i] != -1) {
> +			close(group->fds[i]);
> +			group->fds[i] = -1;
> +		}
> +	}
> +
> +	free(group->mmap_pages);
> +	free(group->fds);
> +
> +	group->mmap_pages = NULL;
> +	group->fds = NULL;
> +	group->enabled = false;
> +}
> +
> +int __rte_noinline
> +rte_pmu_enable_group(int lcore_id)

unsigned

> +{
> +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> +	int ret;
> +
> +	if (rte_pmu->num_group_events == 0) {
> +		RTE_LOG(DEBUG, EAL, "no matching PMU events\n");
> +
> +		return 0;
> +	}
> +
> +	group->fds = calloc(rte_pmu->num_group_events, sizeof(*group->fds));
> +	if (!group->fds) {
> +		RTE_LOG(ERR, EAL, "failed to alloc descriptor memory\n");
> +
> +		return -ENOMEM;
> +	}
> +
> +	group->mmap_pages = calloc(rte_pmu->num_group_events, sizeof(*group->mmap_pages));
> +	if (!group->mmap_pages) {
> +		RTE_LOG(ERR, EAL, "failed to alloc userpage memory\n");
> +
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	ret = open_events(lcore_id);
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "failed to open events on lcore-worker-%d\n", lcore_id);
> +		goto out;
> +	}
> +
> +	ret = mmap_events(lcore_id);
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "failed to map events on lcore-worker-%d\n", lcore_id);
> +		goto out;
> +	}
> +
> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
> +		RTE_LOG(ERR, EAL, "failed to enable events on lcore-worker-%d\n", lcore_id);
> +
> +		ret = -errno;
> +		goto out;
> +	}
> +
> +	return 0;
> +
> +out:
> +	cleanup_events(lcore_id);
> +
> +	return ret;
> +}
> +
> +static int
> +scan_pmus(void)
> +{
> +	char path[PATH_MAX];
> +	struct dirent *dent;
> +	const char *name;
> +	DIR *dirp;
> +
> +	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
> +	if (!dirp)
> +		return -errno;
> +
> +	while ((dent = readdir(dirp))) {
> +		name = dent->d_name;
> +		if (name[0] == '.')
> +			continue;
> +
> +		/* sysfs entry should either contain cpus or be a cpu */
> +		if (!strcmp(name, "cpu"))
> +			break;
> +
> +		snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
> +		if (access(path, F_OK) == 0)
> +			break;
> +	}
> +
> +	closedir(dirp);
> +
> +	if (dent) {
> +		rte_pmu->name = strdup(name);
> +		if (!rte_pmu->name)
> +			return -ENOMEM;
> +	}
> +
> +	return rte_pmu->name ? 0 : -ENODEV;
> +}
> +
> +int
> +rte_pmu_add_event(const char *name)
> +{
> +	struct rte_pmu_event *event;
> +	char path[PATH_MAX];
> +
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu->name, name);
> +	if (access(path, R_OK))
> +		return -ENODEV;
> +
> +	TAILQ_FOREACH(event, &rte_pmu->event_list, next) {
> +		if (!strcmp(event->name, name))
> +			return event->index;
> +		continue;
> +	}
> +
> +	event = calloc(1, sizeof(*event));
> +	if (!event)
> +		return -ENOMEM;
> +
> +	event->name = strdup(name);
> +	if (!event->name) {
> +		free(event);
> +
> +		return -ENOMEM;
> +	}
> +
> +	event->index = rte_pmu->num_group_events++;
> +	TAILQ_INSERT_TAIL(&rte_pmu->event_list, event, next);
> +
> +	RTE_LOG(DEBUG, EAL, "%s event added at index %d\n", name, event->index);
> +
> +	return event->index;
> +}
> +
> +void
> +eal_pmu_init(void)
> +{
> +	int ret;
> +
> +	rte_pmu = calloc(1, sizeof(*rte_pmu));
> +	if (!rte_pmu) {
> +		RTE_LOG(ERR, EAL, "failed to alloc PMU\n");
> +
> +		return;
> +	}
> +
> +	TAILQ_INIT(&rte_pmu->event_list);
> +
> +	ret = scan_pmus();
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "failed to find core pmu\n");
> +		goto out;
> +	}
> +
> +	ret = pmu_arch_init();
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "failed to setup arch for PMU\n");
> +		goto out;
> +	}
> +
> +	return;
> +out:
> +	free(rte_pmu->name);
> +	free(rte_pmu);
> +}
> +
> +void
> +eal_pmu_fini(void)
> +{
> +	struct rte_pmu_event *event, *tmp;
> +	int lcore_id;
> +
> +	RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu->event_list, next, tmp) {
> +		TAILQ_REMOVE(&rte_pmu->event_list, event, next);
> +		free(event->name);
> +		free(event);
> +	}
> +
> +	RTE_LCORE_FOREACH_WORKER(lcore_id)
> +		cleanup_events(lcore_id);

Why is the main lcore left out?

> +
> +	pmu_arch_fini();
> +	free(rte_pmu->name);
> +	free(rte_pmu);
> +}
> diff --git a/lib/eal/include/meson.build b/lib/eal/include/meson.build
> index cfcd40aaed..3bf830adee 100644
> --- a/lib/eal/include/meson.build
> +++ b/lib/eal/include/meson.build
> @@ -36,6 +36,7 @@ headers += files(
>           'rte_pci_dev_features.h',
>           'rte_per_lcore.h',
>           'rte_pflock.h',
> +        'rte_pmu.h',
>           'rte_random.h',
>           'rte_reciprocal.h',
>           'rte_seqcount.h',
> diff --git a/lib/eal/include/rte_pmu.h b/lib/eal/include/rte_pmu.h
> new file mode 100644
> index 0000000000..e4b4f6b052
> --- /dev/null
> +++ b/lib/eal/include/rte_pmu.h
> @@ -0,0 +1,204 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Marvell
> + */
> +
> +#ifndef _RTE_PMU_H_
> +#define _RTE_PMU_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <rte_common.h>
> +#include <rte_compat.h>
> +
> +#ifdef RTE_EXEC_ENV_LINUX
> +
> +#include <linux/perf_event.h>
> +
> +#include <rte_atomic.h>
> +#include <rte_branch_prediction.h>
> +#include <rte_lcore.h>
> +#include <rte_log.h>
> +
> +/**
> + * @file
> + *
> + * PMU event tracing operations
> + *
> + * This file defines generic API and types necessary to setup PMU and
> + * read selected counters in runtime.
> + */
> +
> +/**
> + * A structure describing a group of events.
> + */
> +struct rte_pmu_event_group {
> +	int *fds; /**< array of event descriptors */
> +	void **mmap_pages; /**< array of pointers to mmapped perf_event_attr structures */
> +	bool enabled; /**< true if group was enabled on particular lcore */
> +};
> +
> +/**
> + * A structure describing an event.
> + */
> +struct rte_pmu_event {
> +	char *name; /** name of an event */
> +	int index; /** event index into fds/mmap_pages */
> +	TAILQ_ENTRY(rte_pmu_event) next; /** list entry */
> +};
> +
> +/**
> + * A PMU state container.
> + */
> +struct rte_pmu {
> +	char *name; /** name of core PMU listed under /sys/bus/event_source/devices */
> +	struct rte_pmu_event_group group[RTE_MAX_LCORE]; /**< per lcore event group data */
> +	int num_group_events; /**< number of events in a group */
> +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
> +};
> +
> +/** Pointer to the PMU state container */
> +extern struct rte_pmu *rte_pmu;
> +
> +/** Each architecture supporting PMU needs to provide its own version */
> +#ifndef rte_pmu_pmc_read
> +#define rte_pmu_pmc_read(index) ({ 0; })
> +#endif
> +
> +/**
> + * @internal
> + *
> + * Read PMU counter.
> + *
> + * @param pc
> + *   Pointer to the mmapped user page.
> + * @return
> + *   Counter value read from hardware.
> + */
> +__rte_internal
> +static __rte_always_inline uint64_t
> +rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
> +{
> +	uint64_t offset, width, pmc = 0;
> +	uint32_t seq, index;
> +	int tries = 100;
> +
> +	for (;;) {
> +		seq = pc->lock;
> +		rte_compiler_barrier();
> +		index = pc->index;
> +		offset = pc->offset;
> +		width = pc->pmc_width;
> +
> +		if (likely(pc->cap_user_rdpmc && index)) {
> +			pmc = rte_pmu_pmc_read(index - 1);
> +			pmc <<= 64 - width;
> +			pmc >>= 64 - width;
> +		}
> +
> +		rte_compiler_barrier();
> +
> +		if (likely(pc->lock == seq))
> +			return pmc + offset;
> +
> +		if (--tries == 0) {
> +			RTE_LOG(DEBUG, EAL, "failed to get perf_event_mmap_page lock\n");
> +			break;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * @internal
> + *
> + * Enable group of events for a given lcore.
> + *
> + * @param lcore_id
> + *   The identifier of the lcore.
> + * @return
> + *   0 in case of success, negative value otherwise.
> + */
> +__rte_internal
> +int
> +rte_pmu_enable_group(int lcore_id);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Add event to the group of enabled events.
> + *
> + * @param name
> + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
> + * @return
> + *   Event index in case of success, negative value otherwise.
> + */
> +__rte_experimental
> +int
> +rte_pmu_add_event(const char *name);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Read hardware counter configured to count occurrences of an event.
> + *
> + * @param index
> + *   Index of an event to be read.
> + * @return
> + *   Event value read from register. In case of errors or lack of support
> + *   0 is returned. In other words, stream of zeros in a trace file
> + *   indicates problem with reading particular PMU event register.
> + */
> +__rte_experimental
> +static __rte_always_inline uint64_t
> +rte_pmu_read(int index)
> +{
> +	int lcore_id = rte_lcore_id();
> +	struct rte_pmu_event_group *group;
> +	int ret;
> +
> +	if (!rte_pmu)
> +		return 0;
> +
> +	group = &rte_pmu->group[lcore_id];
> +	if (!group->enabled) {
> +		ret = rte_pmu_enable_group(lcore_id);
> +		if (ret)
> +			return 0;
> +
> +		group->enabled = true;
> +	}
> +
> +	if (index < 0 || index >= rte_pmu->num_group_events)
> +		return 0;
> +
> +	return rte_pmu_read_userpage((struct perf_event_mmap_page *)group->mmap_pages[index]);
> +}
> +
> +#else /* !RTE_EXEC_ENV_LINUX */
> +
> +__rte_experimental
> +static int __rte_unused
> +rte_pmu_add_event(__rte_unused const char *name)
> +{
> +	return -1;
> +}
> +
> +__rte_experimental
> +static __rte_always_inline uint64_t
> +rte_pmu_read(__rte_unused int index)
> +{
> +	return 0;
> +}
> +
> +#endif /* RTE_EXEC_ENV_LINUX */
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_PMU_H_ */
> diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c
> index 8c118d0d9f..751a13b597 100644
> --- a/lib/eal/linux/eal.c
> +++ b/lib/eal/linux/eal.c
> @@ -53,6 +53,7 @@
>   #include "eal_options.h"
>   #include "eal_vfio.h"
>   #include "hotplug_mp.h"
> +#include "pmu_private.h"
>   
>   #define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL)
>   
> @@ -1206,6 +1207,8 @@ rte_eal_init(int argc, char **argv)
>   		return -1;
>   	}
>   
> +	eal_pmu_init();
> +
>   	if (rte_eal_tailqs_init() < 0) {
>   		rte_eal_init_alert("Cannot init tail queues for objects");
>   		rte_errno = EFAULT;
> @@ -1372,6 +1375,7 @@ rte_eal_cleanup(void)
>   	eal_bus_cleanup();
>   	rte_trace_save();
>   	eal_trace_fini();
> +	eal_pmu_fini();
>   	/* after this point, any DPDK pointers will become dangling */
>   	rte_eal_memory_detach();
>   	eal_mp_dev_hotplug_cleanup();
> diff --git a/lib/eal/version.map b/lib/eal/version.map
> index 7ad12a7dc9..9225f46f67 100644
> --- a/lib/eal/version.map
> +++ b/lib/eal/version.map
> @@ -440,6 +440,11 @@ EXPERIMENTAL {
>   	rte_thread_detach;
>   	rte_thread_equal;
>   	rte_thread_join;
> +
> +	# added in 23.03
> +	rte_pmu; # WINDOWS_NO_EXPORT
> +	rte_pmu_add_event; # WINDOWS_NO_EXPORT
> +	rte_pmu_read; # WINDOWS_NO_EXPORT
>   };
>   
>   INTERNAL {
> @@ -483,4 +488,5 @@ INTERNAL {
>   	rte_mem_map;
>   	rte_mem_page_size;
>   	rte_mem_unmap;
> +	rte_pmu_enable_group;
>   };
  
Morten Brørup Dec. 16, 2022, 7:33 a.m. UTC | #6
> From: Morten Brørup [mailto:mb@smartsharesystems.com]
> Sent: Thursday, 15 December 2022 09.22
> 
> > From: Morten Brørup [mailto:mb@smartsharesystems.com]
> > Sent: Wednesday, 14 December 2022 11.41
> >
> > +CC: Mattias, see my comment below about per-thread constructor for
> > this
> >
> > > From: Tomasz Duszynski [mailto:tduszynski@marvell.com]
> > > Sent: Wednesday, 14 December 2022 10.39
> > >
> > > Hello Morten,
> > >
> > > Thanks for review. Answers inline.
> > >
> > > [...]
> > >
> > > > > +__rte_experimental
> > > > > +static __rte_always_inline uint64_t
> > > > > +rte_pmu_read(int index)
> >
> > The index type can be changed from int to uint32_t. This also
> > eliminates the "(index < 0" part of the comparison further below in
> > this function.
> >
> > > > > +{
> > > > > +	int lcore_id = rte_lcore_id();
> > > > > +	struct rte_pmu_event_group *group;
> > > > > +	int ret;
> > > > > +
> > > > > +	if (!rte_pmu)
> > > > > +		return 0;
> > > > > +
> > > > > +	group = &rte_pmu->group[lcore_id];
> > > > > +	if (!group->enabled) {
> >
> > Optimized: if (unlikely(!group->enabled)) {
> >
> > > > > +		ret = rte_pmu_enable_group(lcore_id);
> > > > > +		if (ret)
> > > > > +			return 0;
> > > > > +
> > > > > +		group->enabled = true;
> > > > > +	}
> > > >
> > > > Why is the group not enabled in the setup function,
> > > rte_pmu_add_event(), instead of here, in the
> > > > hot path?
> > > >
> > >
> > > When this is executed for the very first time then cpu will have
> > > obviously more work to do
> > > but afterwards setup path is not taken hence much less cpu cycles
> are
> > > required.
> > >
> > > Setup is executed by main lcore solely, before lcores are executed
> > > hence some info passed to
> > > SYS_perf_event_open ioctl() is missing, pid (via rte_gettid())
> being
> > an
> > > example here.
> >
> > OK. Thank you for the explanation. Since impossible at setup, it has
> to
> > be done at runtime.
> >
> > @Mattias: Another good example of something that would belong in per-
> > thread constructors, as my suggested feature creep in [1].
> >
> > [1]: http://inbox.dpdk.org/dev/98CBD80474FA8B44BF855DF32C47DC35D87553@smartserver.smartshare.dk/
> 
> I just realized that this initialization is per-lcore (not per thread),
> so you can use rte_lcore_callback_register() to register a per-lcore
> initialization function, and move rte_pmu_enable_group(lcore_id) there.

Sorry, Tomasz!

You can't use rte_lcore_callback_register()... it doesn't provide per-lcore thread constructors/destructors the way I thought. :-(
  
Tomasz Duszynski Jan. 4, 2023, 3:47 p.m. UTC | #7
> -----Original Message-----
> From: Mattias Rönnblom <mattias.ronnblom@ericsson.com>
> Sent: Thursday, December 15, 2022 9:46 AM
> To: Tomasz Duszynski <tduszynski@marvell.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>; mb@smartsharesystems.com;
> zhoumin@loongson.cn
> Subject: [EXT] Re: [PATCH v4 1/4] eal: add generic support for reading PMU events
> 
> External Email
> 
> ----------------------------------------------------------------------
> On 2022-12-13 11:43, Tomasz Duszynski wrote:
> > Add support for programming PMU counters and reading their values in
> > runtime bypassing kernel completely.
> >
> > This is especially useful in cases where CPU cores are isolated
> > (nohz_full) i.e run dedicated tasks. In such cases one cannot use
> > standard perf utility without sacrificing latency and performance.
> >
> > Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
> > ---
> >   app/test/meson.build                  |   1 +
> >   app/test/test_pmu.c                   |  41 +++
> >   doc/guides/prog_guide/profile_app.rst |   8 +
> >   lib/eal/common/meson.build            |   3 +
> >   lib/eal/common/pmu_private.h          |  41 +++
> >   lib/eal/common/rte_pmu.c              | 456 ++++++++++++++++++++++++++
> >   lib/eal/include/meson.build           |   1 +
> >   lib/eal/include/rte_pmu.h             | 204 ++++++++++++
> >   lib/eal/linux/eal.c                   |   4 +
> >   lib/eal/version.map                   |   6 +
> >   10 files changed, 765 insertions(+)
> >   create mode 100644 app/test/test_pmu.c
> >   create mode 100644 lib/eal/common/pmu_private.h
> >   create mode 100644 lib/eal/common/rte_pmu.c
> >   create mode 100644 lib/eal/include/rte_pmu.h
> >
> > diff --git a/app/test/meson.build b/app/test/meson.build
> > index f34d19e3c3..93b3300309 100644
> > --- a/app/test/meson.build
> > +++ b/app/test/meson.build
> > @@ -143,6 +143,7 @@ test_sources = files(
> >           'test_timer_racecond.c',
> >           'test_timer_secondary.c',
> >           'test_ticketlock.c',
> > +        'test_pmu.c',
> >           'test_trace.c',
> >           'test_trace_register.c',
> >           'test_trace_perf.c',
> > diff --git a/app/test/test_pmu.c b/app/test/test_pmu.c
> > new file mode 100644
> > index 0000000000..fd331af9ee
> > --- /dev/null
> > +++ b/app/test/test_pmu.c
> > @@ -0,0 +1,41 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(C) 2022 Marvell International Ltd.
> > + */
> > +
> > +#include <rte_pmu.h>
> > +
> > +#include "test.h"
> > +
> > +static int
> > +test_pmu_read(void)
> > +{
> > +	uint64_t val = 0;
> > +	int tries = 10;
> > +	int event = -1;
> > +
> > +	while (tries--)
> > +		val += rte_pmu_read(event);
> > +
> > +	if (val == 0)
> > +		return TEST_FAILED;
> > +
> > +	return TEST_SUCCESS;
> > +}
> > +
> > +static struct unit_test_suite pmu_tests = {
> > +	.suite_name = "pmu autotest",
> > +	.setup = NULL,
> > +	.teardown = NULL,
> > +	.unit_test_cases = {
> > +		TEST_CASE(test_pmu_read),
> > +		TEST_CASES_END()
> > +	}
> > +};
> > +
> > +static int
> > +test_pmu(void)
> > +{
> > +	return unit_test_suite_runner(&pmu_tests);
> > +}
> > +
> > +REGISTER_TEST_COMMAND(pmu_autotest, test_pmu);
> > diff --git a/doc/guides/prog_guide/profile_app.rst b/doc/guides/prog_guide/profile_app.rst
> > index 14292d4c25..a8b501fe0c 100644
> > --- a/doc/guides/prog_guide/profile_app.rst
> > +++ b/doc/guides/prog_guide/profile_app.rst
> > @@ -7,6 +7,14 @@ Profile Your Application
> >   The following sections describe methods of profiling DPDK applications on
> >   different architectures.
> >
> > +Performance counter based profiling
> > +-----------------------------------
> > +
> > +The majority of architectures support some sort of hardware measurement unit
> > +which provides a set of programmable counters that monitor specific
> > +events. There are different tools which can gather that information,
> > +perf being an example here. Though in some scenarios, e.g. when CPU
> > +cores are isolated (nohz_full) and run dedicated tasks, using perf is less than ideal. In such
> cases one can read specific events directly from the application via ``rte_pmu_read()``.
> >
> >   Profiling on x86
> >   ----------------
> > diff --git a/lib/eal/common/meson.build b/lib/eal/common/meson.build
> > index 917758cc65..d6d05b56f3 100644
> > --- a/lib/eal/common/meson.build
> > +++ b/lib/eal/common/meson.build
> > @@ -38,6 +38,9 @@ sources += files(
> >           'rte_service.c',
> >           'rte_version.c',
> >   )
> > +if is_linux
> > +    sources += files('rte_pmu.c')
> > +endif
> >   if is_linux or is_windows
> >       sources += files('eal_common_dynmem.c')
> >   endif
> > diff --git a/lib/eal/common/pmu_private.h b/lib/eal/common/pmu_private.h
> > new file mode 100644
> > index 0000000000..cade4245e6
> > --- /dev/null
> > +++ b/lib/eal/common/pmu_private.h
> > @@ -0,0 +1,41 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2022 Marvell
> > + */
> > +
> > +#ifndef _PMU_PRIVATE_H_
> > +#define _PMU_PRIVATE_H_
> > +
> > +/**
> > + * Architecture specific PMU init callback.
> > + *
> > + * @return
> > + *   0 in case of success, negative value otherwise.
> > + */
> > +int
> > +pmu_arch_init(void);
> > +
> > +/**
> > + * Architecture specific PMU cleanup callback.
> > + */
> > +void
> > +pmu_arch_fini(void);
> > +
> > +/**
> > + * Apply architecture specific settings to config before passing it to syscall.
> > + */
> > +void
> > +pmu_arch_fixup_config(uint64_t config[3]);
> > +
> > +/**
> > + * Initialize PMU tracing internals.
> > + */
> > +void
> > +eal_pmu_init(void);
> > +
> > +/**
> > + * Cleanup PMU internals.
> > + */
> > +void
> > +eal_pmu_fini(void);
> > +
> > +#endif /* _PMU_PRIVATE_H_ */
> > diff --git a/lib/eal/common/rte_pmu.c b/lib/eal/common/rte_pmu.c
> > new file mode 100644
> > index 0000000000..049fe19fe3
> > --- /dev/null
> > +++ b/lib/eal/common/rte_pmu.c
> > @@ -0,0 +1,456 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(C) 2022 Marvell International Ltd.
> > + */
> > +
> > +#include <ctype.h>
> > +#include <dirent.h>
> > +#include <errno.h>
> > +#include <regex.h>
> > +#include <stdlib.h>
> > +#include <string.h>
> > +#include <sys/ioctl.h>
> > +#include <sys/mman.h>
> > +#include <sys/queue.h>
> > +#include <sys/syscall.h>
> > +#include <unistd.h>
> > +
> > +#include <rte_eal_paging.h>
> > +#include <rte_pmu.h>
> > +#include <rte_tailq.h>
> > +
> > +#include "pmu_private.h"
> > +
> > +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
> > +
> > +#ifndef GENMASK_ULL
> > +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> ((64 - 1 - (h)))))
> > +#endif
> > +
> > +#ifndef FIELD_PREP
> > +#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) & (m))
> > +#endif
> > +
> > +struct rte_pmu *rte_pmu;
> > +
> > +/*
> > + * Following __rte_weak functions provide default no-op.
> > +Architectures should override them if
> > + * necessary.
> > + */
> > +
> > +int
> > +__rte_weak pmu_arch_init(void)
> > +{
> > +	return 0;
> > +}
> > +
> > +void
> > +__rte_weak pmu_arch_fini(void)
> > +{
> > +}
> > +
> > +void
> > +__rte_weak pmu_arch_fixup_config(uint64_t config[3])
> > +{
> > +	RTE_SET_USED(config);
> > +}
> > +
> > +static int
> > +get_term_format(const char *name, int *num, uint64_t *mask)
> > +{
> > +	char *config = NULL;
> > +	char path[PATH_MAX];
> > +	int high, low, ret;
> > +	FILE *fp;
> > +
> > +	/* quiesce -Wmaybe-uninitialized warning */
> > +	*num = 0;
> > +	*mask = 0;
> > +
> > +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH
> > +"/%s/format/%s", rte_pmu->name, name)
> 
> This code might crash in case a long name is supplied, which is maybe not what you want. A
> truncate and a "file not found" is probably better.
> I believe there is a snprintf lookalike with these properties in DPDK.
> 

In the scenario where 'path' cannot accommodate the whole string, which is pretty unlikely
especially because sysfs files have sane names, snprintf() will truncate the output and
implicitly append a NUL. Hence fopen() will simply fail. Not sure how this could go wrong.
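
A minimal sketch of that failure mode (open_sysfs_format() is a made-up helper, not part
of the patch):

#include <limits.h>
#include <stdio.h>

/* snprintf() never writes past 'path' and always NUL-terminates, so an
 * overly long event name merely yields a truncated, nonexistent path.
 */
static FILE *
open_sysfs_format(const char *pmu, const char *name)
{
	char path[PATH_MAX];

	snprintf(path, sizeof(path),
		 "/sys/bus/event_source/devices/%s/format/%s", pmu, name);

	return fopen(path, "r"); /* truncated path -> NULL, errno set to ENOENT */
}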

> > +	fp = fopen(path, "r");
> > +	if (!fp)
> > +		return -errno;
> > +
> > +	errno = 0;
> > +	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
> > +	if (ret < 2) {
> > +		ret = -ENODATA;
> > +		goto out;
> > +	}
> > +	if (errno) {
> > +		ret = -errno;
> > +		goto out;
> > +	}
> > +
> > +	if (ret == 2)
> > +		high = low;
> > +
> > +	*mask = GENMASK_ULL(high, low);
> > +	/* Last digit should be [012]. If last digit is missing 0 is implied. */
> > +	*num = config[strlen(config) - 1];
> > +	*num = isdigit(*num) ? *num - '0' : 0;
> > +
> > +	ret = 0;
> > +out:
> > +	free(config);
> > +	fclose(fp);
> > +
> > +	return ret;
> > +}
> > +
> > +static int
> > +parse_event(char *buf, uint64_t config[3])
> > +{
> > +	char *token, *term;
> > +	int num, ret, val;
> > +	uint64_t mask;
> > +
> > +	config[0] = config[1] = config[2] = 0;
> > +
> > +	token = strtok(buf, ",");
> > +	while (token) {
> > +		errno = 0;
> > +		/* <term>=<value> */
> > +		ret = sscanf(token, "%m[^=]=%i", &term, &val);
> > +		if (ret < 1)
> > +			return -ENODATA;
> > +		if (errno)
> > +			return -errno;
> > +		if (ret == 1)
> > +			val = 1;
> > +
> > +		ret = get_term_format(term, &num, &mask);
> > +		free(term);
> > +		if (ret)
> > +			return ret;
> > +
> > +		config[num] |= FIELD_PREP(mask, val);
> > +		token = strtok(NULL, ",");
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static int
> > +get_event_config(const char *name, uint64_t config[3])
> > +{
> > +	char path[PATH_MAX], buf[BUFSIZ];
> > +	FILE *fp;
> > +	int ret;
> > +
> > +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu->name,
> name);
> > +	fp = fopen(path, "r");
> > +	if (!fp)
> > +		return -errno;
> > +
> > +	ret = fread(buf, 1, sizeof(buf), fp);
> > +	if (ret == 0) {
> > +		fclose(fp);
> > +
> > +		return -EINVAL;
> > +	}
> > +	fclose(fp);
> > +	buf[ret] = '\0';
> > +
> > +	return parse_event(buf, config);
> > +}
> > +
> > +static int
> > +do_perf_event_open(uint64_t config[3], int lcore_id, int group_fd)
> > +{
> > +	struct perf_event_attr attr = {
> > +		.size = sizeof(struct perf_event_attr),
> > +		.type = PERF_TYPE_RAW,
> > +		.exclude_kernel = 1,
> > +		.exclude_hv = 1,
> > +		.disabled = 1,
> > +	};
> > +
> > +	pmu_arch_fixup_config(config);
> > +
> > +	attr.config = config[0];
> > +	attr.config1 = config[1];
> > +	attr.config2 = config[2];
> > +
> > +	return syscall(SYS_perf_event_open, &attr, rte_gettid(), rte_lcore_to_cpu_id(lcore_id),
> > +		       group_fd, 0);
> > +}
> > +
> > +static int
> > +open_events(int lcore_id)
> > +{
> > +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> > +	struct rte_pmu_event *event;
> > +	uint64_t config[3];
> > +	int num = 0, ret;
> > +
> > +	/* group leader gets created first, with fd = -1 */
> > +	group->fds[0] = -1;
> > +
> > +	TAILQ_FOREACH(event, &rte_pmu->event_list, next) {
> > +		ret = get_event_config(event->name, config);
> > +		if (ret) {
> > +			RTE_LOG(ERR, EAL, "failed to get %s event config\n", event->name);
> > +			continue;
> > +		}
> > +
> > +		ret = do_perf_event_open(config, lcore_id, group->fds[0]);
> > +		if (ret == -1) {
> > +			if (errno == EOPNOTSUPP)
> > +				RTE_LOG(ERR, EAL, "64 bit counters not supported\n");
> > +
> > +			ret = -errno;
> > +			goto out;
> > +		}
> > +
> > +		group->fds[event->index] = ret;
> > +		num++;
> > +	}
> > +
> > +	return 0;
> > +out:
> > +	for (--num; num >= 0; num--) {
> > +		close(group->fds[num]);
> > +		group->fds[num] = -1;
> > +	}
> > +
> > +
> > +	return ret;
> > +}
> > +
> > +static int
> > +mmap_events(int lcore_id)
> > +{
> > +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> > +	void *addr;
> > +	int ret, i;
> > +
> > +	for (i = 0; i < rte_pmu->num_group_events; i++) {
> > +		addr = mmap(0, rte_mem_page_size(), PROT_READ, MAP_SHARED, group->fds[i], 0);
> > +		if (addr == MAP_FAILED) {
> > +			ret = -errno;
> > +			goto out;
> > +		}
> > +
> > +		group->mmap_pages[i] = addr;
> > +	}
> > +
> > +	return 0;
> > +out:
> > +	for (; i; i--) {
> > +		munmap(group->mmap_pages[i - 1], rte_mem_page_size());
> > +		group->mmap_pages[i - 1] = NULL;
> > +	}
> > +
> > +	return ret;
> > +}
> > +
> > +static void
> > +cleanup_events(int lcore_id)
> 
> lcore_id is an unsigned int.
> 

True, unsigned seems to be more common. 
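
For instance, the touched prototypes would become (a sketch, to be applied consistently
across the set):

static void
cleanup_events(unsigned int lcore_id);

__rte_internal
int
rte_pmu_enable_group(unsigned int lcore_id);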

> > +{
> > +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> > +	int i;
> > +
> > +	if (!group->fds)
> > +		return;
> 
> group->fds == NULL
> 
> This coding style violation appears throughout the patch set.
> 

Good point. 
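
Will switch to explicit NULL comparisons, e.g.:

	if (group->fds == NULL)
		return;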

> > +
> > +	if (group->fds[0] != -1)
> > +		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
> > +
> > +	for (i = 0; i < rte_pmu->num_group_events; i++) {
> > +		if (group->mmap_pages[i]) {
> > +			munmap(group->mmap_pages[i], rte_mem_page_size());
> > +			group->mmap_pages[i] = NULL;
> > +		}
> > +
> > +		if (group->fds[i] != -1) {
> > +			close(group->fds[i]);
> > +			group->fds[i] = -1;
> > +		}
> > +	}
> > +
> > +	free(group->mmap_pages);
> > +	free(group->fds);
> > +
> > +	group->mmap_pages = NULL;
> > +	group->fds = NULL;
> > +	group->enabled = false;
> > +}
> > +
> > +int __rte_noinline
> > +rte_pmu_enable_group(int lcore_id)
> 
> unsigned
> 
> > +{
> > +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> > +	int ret;
> > +
> > +	if (rte_pmu->num_group_events == 0) {
> > +		RTE_LOG(DEBUG, EAL, "no matching PMU events\n");
> > +
> > +		return 0;
> > +	}
> > +
> > +	group->fds = calloc(rte_pmu->num_group_events, sizeof(*group->fds));
> > +	if (!group->fds) {
> > +		RTE_LOG(ERR, EAL, "failed to alloc descriptor memory\n");
> > +
> > +		return -ENOMEM;
> > +	}
> > +
> > +	group->mmap_pages = calloc(rte_pmu->num_group_events, sizeof(*group->mmap_pages));
> > +	if (!group->mmap_pages) {
> > +		RTE_LOG(ERR, EAL, "failed to alloc userpage memory\n");
> > +
> > +		ret = -ENOMEM;
> > +		goto out;
> > +	}
> > +
> > +	ret = open_events(lcore_id);
> > +	if (ret) {
> > +		RTE_LOG(ERR, EAL, "failed to open events on lcore-worker-%d\n", lcore_id);
> > +		goto out;
> > +	}
> > +
> > +	ret = mmap_events(lcore_id);
> > +	if (ret) {
> > +		RTE_LOG(ERR, EAL, "failed to map events on lcore-worker-%d\n", lcore_id);
> > +		goto out;
> > +	}
> > +
> > +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
> > +		RTE_LOG(ERR, EAL, "failed to enable events on lcore-worker-%d\n",
> > +lcore_id);
> > +
> > +		ret = -errno;
> > +		goto out;
> > +	}
> > +
> > +	return 0;
> > +
> > +out:
> > +	cleanup_events(lcore_id);
> > +
> > +	return ret;
> > +}
> > +
> > +static int
> > +scan_pmus(void)
> > +{
> > +	char path[PATH_MAX];
> > +	struct dirent *dent;
> > +	const char *name;
> > +	DIR *dirp;
> > +
> > +	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
> > +	if (!dirp)
> > +		return -errno;
> > +
> > +	while ((dent = readdir(dirp))) {
> > +		name = dent->d_name;
> > +		if (name[0] == '.')
> > +			continue;
> > +
> > +		/* sysfs entry should either contain cpus or be a cpu */
> > +		if (!strcmp(name, "cpu"))
> > +			break;
> > +
> > +		snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
> > +		if (access(path, F_OK) == 0)
> > +			break;
> > +	}
> > +
> > +	closedir(dirp);
> > +
> > +	if (dent) {
> > +		rte_pmu->name = strdup(name);
> > +		if (!rte_pmu->name)
> > +			return -ENOMEM;
> > +	}
> > +
> > +	return rte_pmu->name ? 0 : -ENODEV;
> > +}
> > +
> > +int
> > +rte_pmu_add_event(const char *name)
> > +{
> > +	struct rte_pmu_event *event;
> > +	char path[PATH_MAX];
> > +
> > +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu->name,
> name);
> > +	if (access(path, R_OK))
> > +		return -ENODEV;
> > +
> > +	TAILQ_FOREACH(event, &rte_pmu->event_list, next) {
> > +		if (!strcmp(event->name, name))
> > +			return event->index;
> > +		continue;
> > +	}
> > +
> > +	event = calloc(1, sizeof(*event));
> > +	if (!event)
> > +		return -ENOMEM;
> > +
> > +	event->name = strdup(name);
> > +	if (!event->name) {
> > +		free(event);
> > +
> > +		return -ENOMEM;
> > +	}
> > +
> > +	event->index = rte_pmu->num_group_events++;
> > +	TAILQ_INSERT_TAIL(&rte_pmu->event_list, event, next);
> > +
> > +	RTE_LOG(DEBUG, EAL, "%s even added at index %d\n", name,
> > +event->index);
> > +
> > +	return event->index;
> > +}
> > +
> > +void
> > +eal_pmu_init(void)
> > +{
> > +	int ret;
> > +
> > +	rte_pmu = calloc(1, sizeof(*rte_pmu));
> > +	if (!rte_pmu) {
> > +		RTE_LOG(ERR, EAL, "failed to alloc PMU\n");
> > +
> > +		return;
> > +	}
> > +
> > +	TAILQ_INIT(&rte_pmu->event_list);
> > +
> > +	ret = scan_pmus();
> > +	if (ret) {
> > +		RTE_LOG(ERR, EAL, "failed to find core pmu\n");
> > +		goto out;
> > +	}
> > +
> > +	ret = pmu_arch_init();
> > +	if (ret) {
> > +		RTE_LOG(ERR, EAL, "failed to setup arch for PMU\n");
> > +		goto out;
> > +	}
> > +
> > +	return;
> > +out:
> > +	free(rte_pmu->name);
> > +	free(rte_pmu);
> > +}
> > +
> > +void
> > +eal_pmu_fini(void)
> > +{
> > +	struct rte_pmu_event *event, *tmp;
> > +	int lcore_id;
> > +
> > +	RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu->event_list, next, tmp) {
> > +		TAILQ_REMOVE(&rte_pmu->event_list, event, next);
> > +		free(event->name);
> > +		free(event);
> > +	}
> > +
> > +	RTE_LCORE_FOREACH_WORKER(lcore_id)
> > +		cleanup_events(lcore_id);
> 
> Why is the main lcore left out?
> 

The main lcore was omitted because it's pretty uncommon for it to do any heavy lifting, so
the usefulness of reading counters there is questionable. It can be added for completeness
though.
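
For completeness the cleanup loop could simply become (RTE_LCORE_FOREACH, unlike the
_WORKER variant, also covers the main lcore):

	RTE_LCORE_FOREACH(lcore_id)
		cleanup_events(lcore_id);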

Do you have any specific use case on your mind?

> > +
> > +	pmu_arch_fini();
> > +	free(rte_pmu->name);
> > +	free(rte_pmu);
> > +}
> > diff --git a/lib/eal/include/meson.build b/lib/eal/include/meson.build
> > index cfcd40aaed..3bf830adee 100644
> > --- a/lib/eal/include/meson.build
> > +++ b/lib/eal/include/meson.build
> > @@ -36,6 +36,7 @@ headers += files(
> >           'rte_pci_dev_features.h',
> >           'rte_per_lcore.h',
> >           'rte_pflock.h',
> > +        'rte_pmu.h',
> >           'rte_random.h',
> >           'rte_reciprocal.h',
> >           'rte_seqcount.h',
> > diff --git a/lib/eal/include/rte_pmu.h b/lib/eal/include/rte_pmu.h
> > new file mode 100644
> > index 0000000000..e4b4f6b052
> > --- /dev/null
> > +++ b/lib/eal/include/rte_pmu.h
> > @@ -0,0 +1,204 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2022 Marvell
> > + */
> > +
> > +#ifndef _RTE_PMU_H_
> > +#define _RTE_PMU_H_
> > +
> > +#ifdef __cplusplus
> > +extern "C" {
> > +#endif
> > +
> > +#include <rte_common.h>
> > +#include <rte_compat.h>
> > +
> > +#ifdef RTE_EXEC_ENV_LINUX
> > +
> > +#include <linux/perf_event.h>
> > +
> > +#include <rte_atomic.h>
> > +#include <rte_branch_prediction.h>
> > +#include <rte_lcore.h>
> > +#include <rte_log.h>
> > +
> > +/**
> > + * @file
> > + *
> > + * PMU event tracing operations
> > + *
> > + * This file defines generic API and types necessary to setup PMU and
> > + * read selected counters in runtime.
> > + */
> > +
> > +/**
> > + * A structure describing a group of events.
> > + */
> > +struct rte_pmu_event_group {
> > +	int *fds; /**< array of event descriptors */
> > +	void **mmap_pages; /**< array of pointers to mmapped perf_event_attr structures */
> > +	bool enabled; /**< true if group was enabled on particular lcore */
> > +};
> > +
> > +/**
> > + * A structure describing an event.
> > + */
> > +struct rte_pmu_event {
> > +	char *name; /** name of an event */
> > +	int index; /** event index into fds/mmap_pages */
> > +	TAILQ_ENTRY(rte_pmu_event) next; /** list entry */
> > +};
> > +
> > +/**
> > + * A PMU state container.
> > + */
> > +struct rte_pmu {
> > +	char *name; /** name of core PMU listed under /sys/bus/event_source/devices */
> > +	struct rte_pmu_event_group group[RTE_MAX_LCORE]; /**< per lcore event group data */
> > +	int num_group_events; /**< number of events in a group */
> > +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
> > +};
> > +
> > +/** Pointer to the PMU state container */
> > +extern struct rte_pmu *rte_pmu;
> > +
> > +/** Each architecture supporting PMU needs to provide its own version */
> > +#ifndef rte_pmu_pmc_read
> > +#define rte_pmu_pmc_read(index) ({ 0; })
> > +#endif
> > +
> > +/**
> > + * @internal
> > + *
> > + * Read PMU counter.
> > + *
> > + * @param pc
> > + *   Pointer to the mmapped user page.
> > + * @return
> > + *   Counter value read from hardware.
> > + */
> > +__rte_internal
> > +static __rte_always_inline uint64_t
> > +rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
> > +{
> > +	uint64_t offset, width, pmc = 0;
> > +	uint32_t seq, index;
> > +	int tries = 100;
> > +
> > +	for (;;) {
> > +		seq = pc->lock;
> > +		rte_compiler_barrier();
> > +		index = pc->index;
> > +		offset = pc->offset;
> > +		width = pc->pmc_width;
> > +
> > +		if (likely(pc->cap_user_rdpmc && index)) {
> > +			pmc = rte_pmu_pmc_read(index - 1);
> > +			pmc <<= 64 - width;
> > +			pmc >>= 64 - width;
> > +		}
> > +
> > +		rte_compiler_barrier();
> > +
> > +		if (likely(pc->lock == seq))
> > +			return pmc + offset;
> > +
> > +		if (--tries == 0) {
> > +			RTE_LOG(DEBUG, EAL, "failed to get perf_event_mmap_page lock\n");
> > +			break;
> > +		}
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +/**
> > + * @internal
> > + *
> > + * Enable group of events for a given lcore.
> > + *
> > + * @param lcore_id
> > + *   The identifier of the lcore.
> > + * @return
> > + *   0 in case of success, negative value otherwise.
> > + */
> > +__rte_internal
> > +int
> > +rte_pmu_enable_group(int lcore_id);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * Add event to the group of enabled events.
> > + *
> > + * @param name
> > + *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
> > + * @return
> > + *   Event index in case of success, negative value otherwise.
> > + */
> > +__rte_experimental
> > +int
> > +rte_pmu_add_event(const char *name);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * Read hardware counter configured to count occurrences of an event.
> > + *
> > + * @param index
> > + *   Index of an event to be read.
> > + * @return
> > + *   Event value read from register. In case of errors or lack of support
> > + *   0 is returned. In other words, stream of zeros in a trace file
> > + *   indicates problem with reading particular PMU event register.
> > + */
> > +__rte_experimental
> > +static __rte_always_inline uint64_t
> > +rte_pmu_read(int index)
> > +{
> > +	int lcore_id = rte_lcore_id();
> > +	struct rte_pmu_event_group *group;
> > +	int ret;
> > +
> > +	if (!rte_pmu)
> > +		return 0;
> > +
> > +	group = &rte_pmu->group[lcore_id];
> > +	if (!group->enabled) {
> > +		ret = rte_pmu_enable_group(lcore_id);
> > +		if (ret)
> > +			return 0;
> > +
> > +		group->enabled = true;
> > +	}
> > +
> > +	if (index < 0 || index >= rte_pmu->num_group_events)
> > +		return 0;
> > +
> > +	return rte_pmu_read_userpage((struct perf_event_mmap_page *)group->mmap_pages[index]);
> > +}
> > +
> > +#else /* !RTE_EXEC_ENV_LINUX */
> > +
> > +__rte_experimental
> > +static int __rte_unused
> > +rte_pmu_add_event(__rte_unused const char *name)
> > +{
> > +	return -1;
> > +}
> > +
> > +__rte_experimental
> > +static __rte_always_inline uint64_t
> > +rte_pmu_read(__rte_unused int index)
> > +{
> > +	return 0;
> > +}
> > +
> > +#endif /* RTE_EXEC_ENV_LINUX */
> > +
> > +#ifdef __cplusplus
> > +}
> > +#endif
> > +
> > +#endif /* _RTE_PMU_H_ */
> > diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c
> > index 8c118d0d9f..751a13b597 100644
> > --- a/lib/eal/linux/eal.c
> > +++ b/lib/eal/linux/eal.c
> > @@ -53,6 +53,7 @@
> >   #include "eal_options.h"
> >   #include "eal_vfio.h"
> >   #include "hotplug_mp.h"
> > +#include "pmu_private.h"
> >
> >   #define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL)
> >
> > @@ -1206,6 +1207,8 @@ rte_eal_init(int argc, char **argv)
> >   		return -1;
> >   	}
> >
> > +	eal_pmu_init();
> > +
> >   	if (rte_eal_tailqs_init() < 0) {
> >   		rte_eal_init_alert("Cannot init tail queues for objects");
> >   		rte_errno = EFAULT;
> > @@ -1372,6 +1375,7 @@ rte_eal_cleanup(void)
> >   	eal_bus_cleanup();
> >   	rte_trace_save();
> >   	eal_trace_fini();
> > +	eal_pmu_fini();
> >   	/* after this point, any DPDK pointers will become dangling */
> >   	rte_eal_memory_detach();
> >   	eal_mp_dev_hotplug_cleanup();
> > diff --git a/lib/eal/version.map b/lib/eal/version.map
> > index 7ad12a7dc9..9225f46f67 100644
> > --- a/lib/eal/version.map
> > +++ b/lib/eal/version.map
> > @@ -440,6 +440,11 @@ EXPERIMENTAL {
> >   	rte_thread_detach;
> >   	rte_thread_equal;
> >   	rte_thread_join;
> > +
> > +	# added in 23.03
> > +	rte_pmu; # WINDOWS_NO_EXPORT
> > +	rte_pmu_add_event; # WINDOWS_NO_EXPORT
> > +	rte_pmu_read; # WINDOWS_NO_EXPORT
> >   };
> >
> >   INTERNAL {
> > @@ -483,4 +488,5 @@ INTERNAL {
> >   	rte_mem_map;
> >   	rte_mem_page_size;
> >   	rte_mem_unmap;
> > +	rte_pmu_enable_group;
> >   };
  
Tomasz Duszynski Jan. 5, 2023, 9:14 p.m. UTC | #8
Hi Morten, 

A few comments inline. 

>-----Original Message-----
>From: Morten Brørup <mb@smartsharesystems.com>
>Sent: Wednesday, December 14, 2022 11:41 AM
>To: Tomasz Duszynski <tduszynski@marvell.com>; dev@dpdk.org
>Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>; zhoumin@loongson.cn;
>mattias.ronnblom@ericsson.com
>Subject: [EXT] RE: [PATCH v4 1/4] eal: add generic support for reading PMU events
>
>External Email
>
>----------------------------------------------------------------------
>+CC: Mattias, see my comment below about per-thread constructor for this
>
>> From: Tomasz Duszynski [mailto:tduszynski@marvell.com]
>> Sent: Wednesday, 14 December 2022 10.39
>>
>> Hello Morten,
>>
>> Thanks for review. Answers inline.
>>
>> [...]
>>
>> > > +/**
>> > > + * @file
>> > > + *
>> > > + * PMU event tracing operations
>> > > + *
>> > > + * This file defines generic API and types necessary to setup PMU
>> and
>> > > + * read selected counters in runtime.
>> > > + */
>> > > +
>> > > +/**
>> > > + * A structure describing a group of events.
>> > > + */
>> > > +struct rte_pmu_event_group {
>> > > +	int *fds; /**< array of event descriptors */
>> > > +	void **mmap_pages; /**< array of pointers to mmapped
>> > > perf_event_attr structures */
>> >
>> > There seems to be a lot of indirection involved here. Why are these
>> arrays not statically sized,
>> > instead of dynamically allocated?
>> >
>>
>> Different architectures/pmus impose limits on number of simultaneously
>> enabled counters. So in order relief the pain of thinking about it and
>> adding macros for each and every arch I decided to allocate the number
>> user wants dynamically. Also assumption holds that user knows about
>> tradeoffs of using too many counters hence will not enable too many
>> events at once.
>
>The DPDK convention is to use fixed size arrays (with a maximum size, e.g. RTE_MAX_ETHPORTS) in the
>fast path, for performance reasons.
>
>Please use fixed size arrays instead of dynamically allocated arrays.
>

I do agree that from a maintenance angle fixed arrays are much more convenient,
but once optimization kicks in that statement does not necessarily hold true anymore.

For example, in this case switching to fixed arrays dropped performance by ~0.3%, which
is insignificant imo. So, given the simpler code, the next patchset will use fixed arrays.
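
As a sketch of where this is heading (RTE_MAX_NUM_GROUP_EVENTS is a made-up name and
value here, to be settled in the next revision):

#define RTE_MAX_NUM_GROUP_EVENTS 8

struct rte_pmu_event_group {
	/* actual type instead of void **, as requested */
	struct perf_event_mmap_page *mmap_pages[RTE_MAX_NUM_GROUP_EVENTS];
	int fds[RTE_MAX_NUM_GROUP_EVENTS];
	bool enabled;
};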

>>
>> > Also, what is the reason for hiding the type struct
>> perf_event_mmap_page **mmap_pages opaque by
>> > using void **mmap_pages instead?
>>
>> I think, that part doing mmap/munmap was written first hence void **
>> was chosen in the first place.
>
>Please update it, so the actual type is reflected here.
>
>>
>> >
>> > > +	bool enabled; /**< true if group was enabled on particular lcore
>> > > */
>> > > +};
>> > > +
>> > > +/**
>> > > + * A structure describing an event.
>> > > + */
>> > > +struct rte_pmu_event {
>> > > +	char *name; /** name of an event */
>> > > +	int index; /** event index into fds/mmap_pages */
>> > > +	TAILQ_ENTRY(rte_pmu_event) next; /** list entry */ };
>> > > +
>> > > +/**
>> > > + * A PMU state container.
>> > > + */
>> > > +struct rte_pmu {
>> > > +	char *name; /** name of core PMU listed under
>> > > /sys/bus/event_source/devices */
>> > > +	struct rte_pmu_event_group group[RTE_MAX_LCORE]; /**< per lcore
>> > > event group data */
>> > > +	int num_group_events; /**< number of events in a group */
>> > > +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching
>> > > events */
>
>The event_list is used in slow path only, so it can remain a list - i.e. no change requested here.
>:-)
>
>> > > +};
>> > > +
>> > > +/** Pointer to the PMU state container */ extern struct rte_pmu
>> > > +*rte_pmu;
>> >
>> > Again, why not just extern struct rte_pmu, instead of dynamic
>> allocation?
>> >
>>
>> No strong opinions here since this is a matter of personal preference.
>> Can be removed
>> in the next version.
>
>Yes, please.
>
>>
>> > > +
>> > > +/** Each architecture supporting PMU needs to provide its own
>> version
>> > > */
>> > > +#ifndef rte_pmu_pmc_read
>> > > +#define rte_pmu_pmc_read(index) ({ 0; }) #endif
>> > > +
>> > > +/**
>> > > + * @internal
>> > > + *
>> > > + * Read PMU counter.
>> > > + *
>> > > + * @param pc
>> > > + *   Pointer to the mmapped user page.
>> > > + * @return
>> > > + *   Counter value read from hardware.
>> > > + */
>> > > +__rte_internal
>> > > +static __rte_always_inline uint64_t rte_pmu_read_userpage(struct
>> > > +perf_event_mmap_page *pc) {
>> > > +	uint64_t offset, width, pmc = 0;
>> > > +	uint32_t seq, index;
>> > > +	int tries = 100;
>> > > +
>> > > +	for (;;) {
>
>As a matter of personal preference, I would write this loop differently:
>
>+ for (tries = 100; tries != 0; tries--) {
>
>> > > +		seq = pc->lock;
>> > > +		rte_compiler_barrier();
>> > > +		index = pc->index;
>> > > +		offset = pc->offset;
>> > > +		width = pc->pmc_width;
>> > > +
>> > > +		if (likely(pc->cap_user_rdpmc && index)) {
>
>Why "&& index"? The way I read [man perf_event_open], index 0 is perfectly valid.
>

Valid indices start at 1; 0 means that the hw counter is either stopped or not active. Maybe this is not
initially clear from the man page, but there's an example later on showing how to get the actual number.
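
Roughly, the man page example boils down to this (a sketch; rdpmc() stands in for the
arch-specific counter read):

	uint32_t idx = pc->index;
	uint64_t count = pc->offset;

	if (pc->cap_user_rdpmc && idx != 0)	/* 0: stopped or not scheduled */
		count += rdpmc(idx - 1);	/* hw counter indices are 0-based */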

>[man perf_event_open]: https://man7.org/linux/man-pages/man2/perf_event_open.2.html
>
>> > > +			pmc = rte_pmu_pmc_read(index - 1);
>> > > +			pmc <<= 64 - width;
>> > > +			pmc >>= 64 - width;
>> > > +		}
>> > > +
>> > > +		rte_compiler_barrier();
>> > > +
>> > > +		if (likely(pc->lock == seq))
>> > > +			return pmc + offset;
>> > > +
>> > > +		if (--tries == 0) {
>> > > +			RTE_LOG(DEBUG, EAL, "failed to get
>> > > perf_event_mmap_page lock\n");
>> > > +			break;
>> > > +		}
>
>- Remove the 4 above lines of code, and move the debug log message to the end of the function
>instead.
>
>> > > +	}
>
>+ RTE_LOG(DEBUG, EAL, "failed to get perf_event_mmap_page lock\n");
>
>> > > +
>> > > +	return 0;
>> > > +}
>> > > +
>> > > +/**
>> > > + * @internal
>> > > + *
>> > > + * Enable group of events for a given lcore.
>> > > + *
>> > > + * @param lcore_id
>> > > + *   The identifier of the lcore.
>> > > + * @return
>> > > + *   0 in case of success, negative value otherwise.
>> > > + */
>> > > +__rte_internal
>> > > +int
>> > > +rte_pmu_enable_group(int lcore_id);
>> > > +
>> > > +/**
>> > > + * @warning
>> > > + * @b EXPERIMENTAL: this API may change without prior notice
>> > > + *
>> > > + * Add event to the group of enabled events.
>> > > + *
>> > > + * @param name
>> > > + *   Name of an event listed under
>> > > /sys/bus/event_source/devices/pmu/events.
>> > > + * @return
>> > > + *   Event index in case of success, negative value otherwise.
>> > > + */
>> > > +__rte_experimental
>> > > +int
>> > > +rte_pmu_add_event(const char *name);
>> > > +
>> > > +/**
>> > > + * @warning
>> > > + * @b EXPERIMENTAL: this API may change without prior notice
>> > > + *
>> > > + * Read hardware counter configured to count occurrences of an
>> event.
>> > > + *
>> > > + * @param index
>> > > + *   Index of an event to be read.
>> > > + * @return
>> > > + *   Event value read from register. In case of errors or lack of
>> > > support
>> > > + *   0 is returned. In other words, stream of zeros in a trace
>> file
>> > > + *   indicates problem with reading particular PMU event register.
>> > > + */
>> > > +__rte_experimental
>> > > +static __rte_always_inline uint64_t rte_pmu_read(int index)
>
>The index type can be changed from int to uint32_t. This also eliminates the "(index < 0" part of
>the comparison further below in this function.
>

That's true. 
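
I.e. the prototype becomes (sketch):

static __rte_always_inline uint64_t
rte_pmu_read(uint32_t index);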

>> > > +{
>> > > +	int lcore_id = rte_lcore_id();
>> > > +	struct rte_pmu_event_group *group;
>> > > +	int ret;
>> > > +
>> > > +	if (!rte_pmu)
>> > > +		return 0;
>> > > +
>> > > +	group = &rte_pmu->group[lcore_id];
>> > > +	if (!group->enabled) {
>
>Optimized: if (unlikely(!group->enabled)) {
>

The compiler will already lay out this branch correctly on its own; an extra hint is not required.

>> > > +		ret = rte_pmu_enable_group(lcore_id);
>> > > +		if (ret)
>> > > +			return 0;
>> > > +
>> > > +		group->enabled = true;
>> > > +	}
>> >
>> > Why is the group not enabled in the setup function,
>> rte_pmu_add_event(), instead of here, in the
>> > hot path?
>> >
>>
>> When this is executed for the very first time then cpu will have
>> obviously more work to do but afterwards setup path is not taken hence
>> much less cpu cycles are required.
>>
>> Setup is executed by main lcore solely, before lcores are executed
>> hence some info passed to SYS_perf_event_open ioctl() is missing, pid
>> (via rte_gettid()) being an example here.
>
>OK. Thank you for the explanation. Since impossible at setup, it has to be done at runtime.
>
>@Mattias: Another good example of something that would belong in per-thread constructors, as my
>suggested feature creep in [1].
>
>[1]: http://inbox.dpdk.org/dev/98CBD80474FA8B44BF855DF32C47DC35D87553@smartserver.smartshare.dk/
>
>>
>> > > +
>> > > +	if (index < 0 || index >= rte_pmu->num_group_events)
>
>Optimized: if (unlikely(index >= rte_pmu.num_group_events))
>
>> > > +		return 0;
>> > > +
>> > > +	return rte_pmu_read_userpage((struct perf_event_mmap_page
>> > > *)group->mmap_pages[index]);
>> >
>> > Using fixed size arrays instead of multiple indirections via
>> > pointers
>> is faster. It could be:
>> >
>> > return rte_pmu_read_userpage((struct perf_event_mmap_page
>> > *)rte_pmu.group[lcore_id].mmap_pages[index]);
>> >
>> > With our without suggested performance improvements...
>> >
>> > Series-acked-by: Morten Brørup <mb@smartsharesystems.com>
>>
  
Morten Brørup Jan. 5, 2023, 10:07 p.m. UTC | #9
> From: Tomasz Duszynski [mailto:tduszynski@marvell.com]
> Sent: Thursday, 5 January 2023 22.14
> 
> Hi Morten,
> 
> A few comments inline.
> 
> >From: Morten Brørup <mb@smartsharesystems.com>
> >Sent: Wednesday, December 14, 2022 11:41 AM
> >
> >External Email
> >
> >----------------------------------------------------------------------
> >+CC: Mattias, see my comment below about per-thread constructor for
> this
> >
> >> From: Tomasz Duszynski [mailto:tduszynski@marvell.com]
> >> Sent: Wednesday, 14 December 2022 10.39
> >>
> >> Hello Morten,
> >>
> >> Thanks for review. Answers inline.
> >>
> >> [...]
> >>
> >> > > +/**
> >> > > + * @file
> >> > > + *
> >> > > + * PMU event tracing operations
> >> > > + *
> >> > > + * This file defines generic API and types necessary to setup
> PMU
> >> and
> >> > > + * read selected counters in runtime.
> >> > > + */
> >> > > +
> >> > > +/**
> >> > > + * A structure describing a group of events.
> >> > > + */
> >> > > +struct rte_pmu_event_group {
> >> > > +	int *fds; /**< array of event descriptors */
> >> > > +	void **mmap_pages; /**< array of pointers to mmapped
> >> > > perf_event_attr structures */
> >> >
> >> > There seems to be a lot of indirection involved here. Why are
> these
> >> arrays not statically sized,
> >> > instead of dynamically allocated?
> >> >
> >>
> >> Different architectures/pmus impose limits on number of
> simultaneously
> >> enabled counters. So in order relief the pain of thinking about it
> and
> >> adding macros for each and every arch I decided to allocate the
> number
> >> user wants dynamically. Also assumption holds that user knows about
> >> tradeoffs of using too many counters hence will not enable too many
> >> events at once.
> >
> >The DPDK convention is to use fixed size arrays (with a maximum size,
> e.g. RTE_MAX_ETHPORTS) in the
> >fast path, for performance reasons.
> >
> >Please use fixed size arrays instead of dynamically allocated arrays.
> >
> 
> I do agree that from maintenance angle fixed arrays are much more
> convenient
> but when optimization kicks in then that statement does not necessarily
> hold true anymore.
> 
> For example, in this case performance dropped by ~0.3% which is
> insignificant imo. So
> given simpler code, next patchset will use fixed arrays.

I fail to understand how pointer chasing can perform better than obtaining an address by multiplying by a constant. Modern CPUs work in mysterious ways, and you obviously tested this, so I believe your test results. But in theory, pointer chasing touches more cache lines, and should perform worse in a loaded system where pointers in the chain have been evicted from the cache.
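
To illustrate with the two variants side by side (sketch):

/* dynamic: three dependent loads (rte_pmu, the mmap_pages pointer, the element) */
pc = (struct perf_event_mmap_page *)rte_pmu->group[lcore_id].mmap_pages[index];

/* fixed: the element address is plain arithmetic on &rte_pmu; a single load */
pc = rte_pmu.group[lcore_id].mmap_pages[index];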

Anyway, you agreed to use fixed arrays, so I am happy. :-)

> 
> >>
> >> > Also, what is the reason for hiding the type struct
> >> perf_event_mmap_page **mmap_pages opaque by
> >> > using void **mmap_pages instead?
> >>
> >> I think, that part doing mmap/munmap was written first hence void **
> >> was chosen in the first place.
> >
> >Please update it, so the actual type is reflected here.
> >
> >>
> >> >
> >> > > +	bool enabled; /**< true if group was enabled on particular
> lcore
> >> > > */
> >> > > +};
> >> > > +
> >> > > +/**
> >> > > + * A structure describing an event.
> >> > > + */
> >> > > +struct rte_pmu_event {
> >> > > +	char *name; /** name of an event */
> >> > > +	int index; /** event index into fds/mmap_pages */
> >> > > +	TAILQ_ENTRY(rte_pmu_event) next; /** list entry */ };
> >> > > +
> >> > > +/**
> >> > > + * A PMU state container.
> >> > > + */
> >> > > +struct rte_pmu {
> >> > > +	char *name; /** name of core PMU listed under
> >> > > /sys/bus/event_source/devices */
> >> > > +	struct rte_pmu_event_group group[RTE_MAX_LCORE]; /**< per
> lcore
> >> > > event group data */
> >> > > +	int num_group_events; /**< number of events in a group */
> >> > > +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of
> matching
> >> > > events */
> >
> >The event_list is used in slow path only, so it can remain a list -
> i.e. no change requested here.
> >:-)
> >
> >> > > +};
> >> > > +
> >> > > +/** Pointer to the PMU state container */ extern struct rte_pmu
> >> > > +*rte_pmu;
> >> >
> >> > Again, why not just extern struct rte_pmu, instead of dynamic
> >> allocation?
> >> >
> >>
> >> No strong opinions here since this is a matter of personal
> preference.
> >> Can be removed
> >> in the next version.
> >
> >Yes, please.
> >
> >>
> >> > > +
> >> > > +/** Each architecture supporting PMU needs to provide its own
> >> version
> >> > > */
> >> > > +#ifndef rte_pmu_pmc_read
> >> > > +#define rte_pmu_pmc_read(index) ({ 0; }) #endif
> >> > > +
> >> > > +/**
> >> > > + * @internal
> >> > > + *
> >> > > + * Read PMU counter.
> >> > > + *
> >> > > + * @param pc
> >> > > + *   Pointer to the mmapped user page.
> >> > > + * @return
> >> > > + *   Counter value read from hardware.
> >> > > + */
> >> > > +__rte_internal
> >> > > +static __rte_always_inline uint64_t
> rte_pmu_read_userpage(struct
> >> > > +perf_event_mmap_page *pc) {
> >> > > +	uint64_t offset, width, pmc = 0;
> >> > > +	uint32_t seq, index;
> >> > > +	int tries = 100;
> >> > > +
> >> > > +	for (;;) {
> >
> >As a matter of personal preference, I would write this loop
> differently:
> >
> >+ for (tries = 100; tries != 0; tries--) {
> >
> >> > > +		seq = pc->lock;
> >> > > +		rte_compiler_barrier();
> >> > > +		index = pc->index;
> >> > > +		offset = pc->offset;
> >> > > +		width = pc->pmc_width;
> >> > > +
> >> > > +		if (likely(pc->cap_user_rdpmc && index)) {
> >
> >Why "&& index"? The way I read [man perf_event_open], index 0 is
> perfectly valid.
> >
> 
> Valid index starts at 1. 0 means that either hw counter is stopped or
> isn't active. Maybe this is not
> initially clear from man but there's example later on how to get actual
> number.

OK. Thanks for the reference.

Please add a comment about the special meaning of index 0 in the code.

> 
> >[man perf_event_open]: https://man7.org/linux/man-pages/man2/perf_event_open.2.html
> >
> >> > > +			pmc = rte_pmu_pmc_read(index - 1);
> >> > > +			pmc <<= 64 - width;
> >> > > +			pmc >>= 64 - width;
> >> > > +		}
> >> > > +
> >> > > +		rte_compiler_barrier();
> >> > > +
> >> > > +		if (likely(pc->lock == seq))
> >> > > +			return pmc + offset;
> >> > > +
> >> > > +		if (--tries == 0) {
> >> > > +			RTE_LOG(DEBUG, EAL, "failed to get
> >> > > perf_event_mmap_page lock\n");
> >> > > +			break;
> >> > > +		}
> >
> >- Remove the 4 above lines of code, and move the debug log message to
> the end of the function
> >instead.
> >
> >> > > +	}
> >
> >+ RTE_LOG(DEBUG, EAL, "failed to get perf_event_mmap_page lock\n");
> >
> >> > > +
> >> > > +	return 0;
> >> > > +}
> >> > > +
> >> > > +/**
> >> > > + * @internal
> >> > > + *
> >> > > + * Enable group of events for a given lcore.
> >> > > + *
> >> > > + * @param lcore_id
> >> > > + *   The identifier of the lcore.
> >> > > + * @return
> >> > > + *   0 in case of success, negative value otherwise.
> >> > > + */
> >> > > +__rte_internal
> >> > > +int
> >> > > +rte_pmu_enable_group(int lcore_id);
> >> > > +
> >> > > +/**
> >> > > + * @warning
> >> > > + * @b EXPERIMENTAL: this API may change without prior notice
> >> > > + *
> >> > > + * Add event to the group of enabled events.
> >> > > + *
> >> > > + * @param name
> >> > > + *   Name of an event listed under
> >> > > /sys/bus/event_source/devices/pmu/events.
> >> > > + * @return
> >> > > + *   Event index in case of success, negative value otherwise.
> >> > > + */
> >> > > +__rte_experimental
> >> > > +int
> >> > > +rte_pmu_add_event(const char *name);
> >> > > +
> >> > > +/**
> >> > > + * @warning
> >> > > + * @b EXPERIMENTAL: this API may change without prior notice
> >> > > + *
> >> > > + * Read hardware counter configured to count occurrences of an
> >> event.
> >> > > + *
> >> > > + * @param index
> >> > > + *   Index of an event to be read.
> >> > > + * @return
> >> > > + *   Event value read from register. In case of errors or lack
> of
> >> > > support
> >> > > + *   0 is returned. In other words, stream of zeros in a trace
> >> file
> >> > > + *   indicates problem with reading particular PMU event
> register.
> >> > > + */
> >> > > +__rte_experimental
> >> > > +static __rte_always_inline uint64_t rte_pmu_read(int index)
> >
> >The index type can be changed from int to uint32_t. This also
> eliminates the "(index < 0" part of
> >the comparison further below in this function.
> >
> 
> That's true.
> 
> >> > > +{
> >> > > +	int lcore_id = rte_lcore_id();
> >> > > +	struct rte_pmu_event_group *group;
> >> > > +	int ret;
> >> > > +
> >> > > +	if (!rte_pmu)
> >> > > +		return 0;
> >> > > +
> >> > > +	group = &rte_pmu->group[lcore_id];
> >> > > +	if (!group->enabled) {
> >
> >Optimized: if (unlikely(!group->enabled)) {
> >
> 
> Compiler will optimize the branch itself correctly. Extra hint is not
> required.

I haven't reviewed the output from this, so I'll take your word for it. I suggested the unlikely() because I previously tested some very simple code, and it optimized for taking the "if":

void testb(bool b)
{
    if (!b)
        exit(1);
    
    exit(99);
}

I guess I should experiment with more realistic code, and update my optimization notes!

You could add the unlikely() for readability purposes. ;-)

> 
> >> > > +		ret = rte_pmu_enable_group(lcore_id);
> >> > > +		if (ret)
> >> > > +			return 0;
> >> > > +
> >> > > +		group->enabled = true;
> >> > > +	}
> >> >
> >> > Why is the group not enabled in the setup function,
> >> rte_pmu_add_event(), instead of here, in the
> >> > hot path?
> >> >
> >>
> >> When this is executed for the very first time then cpu will have
> >> obviously more work to do but afterwards setup path is not taken
> hence
> >> much less cpu cycles are required.
> >>
> >> Setup is executed by main lcore solely, before lcores are executed
> >> hence some info passed to SYS_perf_event_open ioctl() is missing,
> pid
> >> (via rte_gettid()) being an example here.
> >
> >OK. Thank you for the explanation. Since impossible at setup, it has
> to be done at runtime.
> >
> >@Mattias: Another good example of something that would belong in per-
> thread constructors, as my
> >suggested feature creep in [1].
> >
> >[1]: http://inbox.dpdk.org/dev/98CBD80474FA8B44BF855DF32C47DC35D87553@smartserver.smartshare.dk/
> >
> >>
> >> > > +
> >> > > +	if (index < 0 || index >= rte_pmu->num_group_events)
> >
> >Optimized: if (unlikely(index >= rte_pmu.num_group_events))
> >
> >> > > +		return 0;
> >> > > +
> >> > > +	return rte_pmu_read_userpage((struct perf_event_mmap_page
> >> > > *)group->mmap_pages[index]);
> >> >
> >> > Using fixed size arrays instead of multiple indirections via
> >> > pointers
> >> is faster. It could be:
> >> >
> >> > return rte_pmu_read_userpage((struct perf_event_mmap_page
> >> > *)rte_pmu.group[lcore_id].mmap_pages[index]);
> >> >
> >> > With our without suggested performance improvements...
> >> >
> >> > Series-acked-by: Morten Brørup <mb@smartsharesystems.com>
> >>
>
  
Tomasz Duszynski Jan. 8, 2023, 3:41 p.m. UTC | #10
>-----Original Message-----
>From: Morten Brørup <mb@smartsharesystems.com>
>Sent: Thursday, January 5, 2023 11:08 PM
>To: Tomasz Duszynski <tduszynski@marvell.com>; dev@dpdk.org
>Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>; zhoumin@loongson.cn;
>mattias.ronnblom@ericsson.com
>Subject: [EXT] RE: [PATCH v4 1/4] eal: add generic support for reading PMU events
>
>External Email
>
>----------------------------------------------------------------------
>> From: Tomasz Duszynski [mailto:tduszynski@marvell.com]
>> Sent: Thursday, 5 January 2023 22.14
>>
>> Hi Morten,
>>
>> A few comments inline.
>>
>> >From: Morten Brørup <mb@smartsharesystems.com>
>> >Sent: Wednesday, December 14, 2022 11:41 AM
>> >
>> >External Email
>> >
>> >---------------------------------------------------------------------
>> >-
>> >+CC: Mattias, see my comment below about per-thread constructor for
>> this
>> >
>> >> From: Tomasz Duszynski [mailto:tduszynski@marvell.com]
>> >> Sent: Wednesday, 14 December 2022 10.39
>> >>
>> >> Hello Morten,
>> >>
>> >> Thanks for review. Answers inline.
>> >>
>> >> [...]
>> >>
>> >> > > +/**
>> >> > > + * @file
>> >> > > + *
>> >> > > + * PMU event tracing operations
>> >> > > + *
>> >> > > + * This file defines generic API and types necessary to setup
>> PMU
>> >> and
>> >> > > + * read selected counters in runtime.
>> >> > > + */
>> >> > > +
>> >> > > +/**
>> >> > > + * A structure describing a group of events.
>> >> > > + */
>> >> > > +struct rte_pmu_event_group {
>> >> > > +	int *fds; /**< array of event descriptors */
>> >> > > +	void **mmap_pages; /**< array of pointers to mmapped
>> >> > > perf_event_attr structures */
>> >> >
>> >> > There seems to be a lot of indirection involved here. Why are
>> these
>> >> arrays not statically sized,
>> >> > instead of dynamically allocated?
>> >> >
>> >>
>> >> Different architectures/pmus impose limits on number of
>> simultaneously
>> >> enabled counters. So in order relief the pain of thinking about it
>> and
>> >> adding macros for each and every arch I decided to allocate the
>> number
>> >> user wants dynamically. Also assumption holds that user knows about
>> >> tradeoffs of using too many counters hence will not enable too many
>> >> events at once.
>> >
>> >The DPDK convention is to use fixed size arrays (with a maximum size,
>> e.g. RTE_MAX_ETHPORTS) in the
>> >fast path, for performance reasons.
>> >
>> >Please use fixed size arrays instead of dynamically allocated arrays.
>> >
>>
>> I do agree that from maintenance angle fixed arrays are much more
>> convenient but when optimization kicks in then that statement does not
>> necessarily hold true anymore.
>>
>> For example, in this case performance dropped by ~0.3% which is
>> insignificant imo. So given simpler code, next patchset will use fixed
>> arrays.
>
>I fail to understand how pointer chasing can perform better than obtaining an address by
>multiplying by a constant. Modern CPUs work in mysterious ways, and you obviously tested this, so I
>believe your test results. But in theory, pointer chasing touches more cache lines, and should
>perform worse in a loaded system where pointers in the chain have been evicted from the cache.
>
>Anyway, you agreed to use fixed arrays, so I am happy. :-)
>
>>
>> >>
>> >> > Also, what is the reason for hiding the type struct
>> >> perf_event_mmap_page **mmap_pages opaque by
>> >> > using void **mmap_pages instead?
>> >>
>> >> I think, that part doing mmap/munmap was written first hence void
>> >> ** was chosen in the first place.
>> >
>> >Please update it, so the actual type is reflected here.
>> >
>> >>
>> >> >
>> >> > > +	bool enabled; /**< true if group was enabled on particular
>> lcore
>> >> > > */
>> >> > > +};
>> >> > > +
>> >> > > +/**
>> >> > > + * A structure describing an event.
>> >> > > + */
>> >> > > +struct rte_pmu_event {
>> >> > > +	char *name; /** name of an event */
>> >> > > +	int index; /** event index into fds/mmap_pages */
>> >> > > +	TAILQ_ENTRY(rte_pmu_event) next; /** list entry */ };
>> >> > > +
>> >> > > +/**
>> >> > > + * A PMU state container.
>> >> > > + */
>> >> > > +struct rte_pmu {
>> >> > > +	char *name; /** name of core PMU listed under
>> >> > > /sys/bus/event_source/devices */
>> >> > > +	struct rte_pmu_event_group group[RTE_MAX_LCORE]; /**< per
>> lcore
>> >> > > event group data */
>> >> > > +	int num_group_events; /**< number of events in a group */
>> >> > > +	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of
>> matching
>> >> > > events */
>> >
>> >The event_list is used in slow path only, so it can remain a list -
>> i.e. no change requested here.
>> >:-)
>> >
>> >> > > +};
>> >> > > +
>> >> > > +/** Pointer to the PMU state container */ extern struct
>> >> > > +rte_pmu *rte_pmu;
>> >> >
>> >> > Again, why not just extern struct rte_pmu, instead of dynamic
>> >> allocation?
>> >> >
>> >>
>> >> No strong opinions here since this is a matter of personal
>> preference.
>> >> Can be removed
>> >> in the next version.
>> >
>> >Yes, please.
>> >
>> >>
>> >> > > +
>> >> > > +/** Each architecture supporting PMU needs to provide its own
>> >> version
>> >> > > */
>> >> > > +#ifndef rte_pmu_pmc_read
>> >> > > +#define rte_pmu_pmc_read(index) ({ 0; }) #endif
>> >> > > +
>> >> > > +/**
>> >> > > + * @internal
>> >> > > + *
>> >> > > + * Read PMU counter.
>> >> > > + *
>> >> > > + * @param pc
>> >> > > + *   Pointer to the mmapped user page.
>> >> > > + * @return
>> >> > > + *   Counter value read from hardware.
>> >> > > + */
>> >> > > +__rte_internal
>> >> > > +static __rte_always_inline uint64_t
>> rte_pmu_read_userpage(struct
>> >> > > +perf_event_mmap_page *pc) {
>> >> > > +	uint64_t offset, width, pmc = 0;
>> >> > > +	uint32_t seq, index;
>> >> > > +	int tries = 100;
>> >> > > +
>> >> > > +	for (;;) {
>> >
>> >As a matter of personal preference, I would write this loop
>> differently:
>> >
>> >+ for (tries = 100; tries != 0; tries--) {
>> >
>> >> > > +		seq = pc->lock;
>> >> > > +		rte_compiler_barrier();
>> >> > > +		index = pc->index;
>> >> > > +		offset = pc->offset;
>> >> > > +		width = pc->pmc_width;
>> >> > > +
>> >> > > +		if (likely(pc->cap_user_rdpmc && index)) {
>> >
>> >Why "&& index"? The way I read [man perf_event_open], index 0 is
>> perfectly valid.
>> >
>>
>> Valid index starts at 1. 0 means that either hw counter is stopped or
>> isn't active. Maybe this is not initially clear from man but there's
>> example later on how to get actual number.
>
>OK. Thanks for the reference.
>
>Please add a comment about the special meaning of index 0 in the code.
>
>>
>> >[man perf_event_open]: https://man7.org/linux/man-pages/man2/perf_event_open.2.html
>> >
>> >> > > +			pmc = rte_pmu_pmc_read(index - 1);
>> >> > > +			pmc <<= 64 - width;
>> >> > > +			pmc >>= 64 - width;
>> >> > > +		}
>> >> > > +
>> >> > > +		rte_compiler_barrier();
>> >> > > +
>> >> > > +		if (likely(pc->lock == seq))
>> >> > > +			return pmc + offset;
>> >> > > +
>> >> > > +		if (--tries == 0) {
>> >> > > +			RTE_LOG(DEBUG, EAL, "failed to get
>> >> > > perf_event_mmap_page lock\n");
>> >> > > +			break;
>> >> > > +		}
>> >
>> >- Remove the 4 above lines of code, and move the debug log message to
>> the end of the function
>> >instead.
>> >
>> >> > > +	}
>> >
>> >+ RTE_LOG(DEBUG, EAL, "failed to get perf_event_mmap_page lock\n");
>> >
>> >> > > +
>> >> > > +	return 0;
>> >> > > +}
>> >> > > +
>> >> > > +/**
>> >> > > + * @internal
>> >> > > + *
>> >> > > + * Enable group of events for a given lcore.
>> >> > > + *
>> >> > > + * @param lcore_id
>> >> > > + *   The identifier of the lcore.
>> >> > > + * @return
>> >> > > + *   0 in case of success, negative value otherwise.
>> >> > > + */
>> >> > > +__rte_internal
>> >> > > +int
>> >> > > +rte_pmu_enable_group(int lcore_id);
>> >> > > +
>> >> > > +/**
>> >> > > + * @warning
>> >> > > + * @b EXPERIMENTAL: this API may change without prior notice
>> >> > > + *
>> >> > > + * Add event to the group of enabled events.
>> >> > > + *
>> >> > > + * @param name
>> >> > > + *   Name of an event listed under
>> >> > > /sys/bus/event_source/devices/pmu/events.
>> >> > > + * @return
>> >> > > + *   Event index in case of success, negative value otherwise.
>> >> > > + */
>> >> > > +__rte_experimental
>> >> > > +int
>> >> > > +rte_pmu_add_event(const char *name);
>> >> > > +
>> >> > > +/**
>> >> > > + * @warning
>> >> > > + * @b EXPERIMENTAL: this API may change without prior notice
>> >> > > + *
>> >> > > + * Read hardware counter configured to count occurrences of an
>> >> event.
>> >> > > + *
>> >> > > + * @param index
>> >> > > + *   Index of an event to be read.
>> >> > > + * @return
>> >> > > + *   Event value read from register. In case of errors or lack
>> of
>> >> > > support
>> >> > > + *   0 is returned. In other words, stream of zeros in a trace
>> >> file
>> >> > > + *   indicates problem with reading particular PMU event
>> register.
>> >> > > + */
>> >> > > +__rte_experimental
>> >> > > +static __rte_always_inline uint64_t rte_pmu_read(int index)
>> >
>> >The index type can be changed from int to uint32_t. This also
>> eliminates the "(index < 0" part of
>> >the comparison further below in this function.
>> >
>>
>> That's true.
>>
>> >> > > +{
>> >> > > +	int lcore_id = rte_lcore_id();
>> >> > > +	struct rte_pmu_event_group *group;
>> >> > > +	int ret;
>> >> > > +
>> >> > > +	if (!rte_pmu)
>> >> > > +		return 0;
>> >> > > +
>> >> > > +	group = &rte_pmu->group[lcore_id];
>> >> > > +	if (!group->enabled) {
>> >
>> >Optimized: if (unlikely(!group->enabled)) {
>> >
>>
>> The compiler will optimize the branch itself correctly. An extra hint
>> is not required.
>
>I haven't reviewed the output from this, so I'll take your word for it. I suggested the unlikely()
>because I previously tested some very simple code, and it optimized for taking the "if":
>
>void testb(bool b)
>{
>    if (!b)
>        exit(1);
>
>    exit(99);
>}
>
>I guess I should experiment with more realistic code, and update my optimization notes!
>

I think this may be too simple to draw far-reaching conclusions from. The
compiler will make the fall-through path the likely one. If I recall
correctly, the Intel Optimization Reference Manual has some more info on
this.

Let's take a different example.

int main(int argc, char *argv[])
{
        int *p;

        p = malloc(sizeof(*p));
        if (!p)
                return 1;
        *p = atoi(argv[1]);
        if (*p < 0)
                return 2;
        free(p);

        return 0;
}

If compiled with -O3 and disassembled, I got the output below.

00000000000010a0 <main>:
    10a0:       f3 0f 1e fa             endbr64
    10a4:       55                      push   %rbp
    10a5:       bf 04 00 00 00          mov    $0x4,%edi
    10aa:       53                      push   %rbx
    10ab:       48 89 f3                mov    %rsi,%rbx
    10ae:       48 83 ec 08             sub    $0x8,%rsp
    10b2:       e8 d9 ff ff ff          call   1090 <malloc@plt>
    10b7:       48 85 c0                test   %rax,%rax
    10ba:       74 31                   je     10ed <main+0x4d>
    10bc:       48 8b 7b 08             mov    0x8(%rbx),%rdi
    10c0:       ba 0a 00 00 00          mov    $0xa,%edx
    10c5:       31 f6                   xor    %esi,%esi
    10c7:       48 89 c5                mov    %rax,%rbp
    10ca:       e8 b1 ff ff ff          call   1080 <strtol@plt>
    10cf:       49 89 c0                mov    %rax,%r8
    10d2:       b8 02 00 00 00          mov    $0x2,%eax
    10d7:       45 85 c0                test   %r8d,%r8d
    10da:       78 0a                   js     10e6 <main+0x46>
    10dc:       48 89 ef                mov    %rbp,%rdi
    10df:       e8 8c ff ff ff          call   1070 <free@plt>
    10e4:       31 c0                   xor    %eax,%eax
    10e6:       48 83 c4 08             add    $0x8,%rsp
    10ea:       5b                      pop    %rbx
    10eb:       5d                      pop    %rbp
    10ec:       c3                      ret
    10ed:       b8 01 00 00 00          mov    $0x1,%eax
    10f2:       eb f2                   jmp    10e6 <main+0x46>

Looking at both 10ba and 10da suggests that the code was laid out so the
expected path falls through rather than jumps. Also, the potentially least
frequently executed code (at 10ed) is pushed further down in memory, hence
optimizing cache line usage.

That said, each and every scenario needs analysis on its own. 

>You could add the unlikely() for readability purposes. ;-)
>

Sure. That won't hurt performance.   
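
To be concrete, that would be a one-line change in rte_pmu_read() (a
sketch against this patch):

	if (unlikely(!group->enabled)) {
		ret = rte_pmu_enable_group(lcore_id);
		if (ret)
			return 0;

		group->enabled = true;
	}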

>>
>> >> > > +		ret = rte_pmu_enable_group(lcore_id);
>> >> > > +		if (ret)
>> >> > > +			return 0;
>> >> > > +
>> >> > > +		group->enabled = true;
>> >> > > +	}
>> >> >
>> >> > Why is the group not enabled in the setup function,
>> >> rte_pmu_add_event(), instead of here, in the
>> >> > hot path?
>> >> >
>> >>
>> >> When this is executed for the very first time the cpu obviously has
>> >> more work to do, but afterwards the setup path is not taken, hence
>> >> far fewer cpu cycles are required.
>> >>
>> >> Setup is executed solely by the main lcore, before worker lcores are
>> >> launched, hence some info passed to the SYS_perf_event_open syscall
>> >> is missing, pid (via rte_gettid()) being an example here.
>> >
>> >OK. Thank you for the explanation. Since impossible at setup, it has
>> to be done at runtime.
>> >
>> >@Mattias: Another good example of something that would belong in per-
>> thread constructors, as my
>> >suggested feature creep in [1].
>> >
>> >[1]: http://inbox.dpdk.org/dev/98CBD80474FA8B44BF855DF32C47DC35D87553@smartserver.smartshare.dk/
>> >
>> >>
>> >> > > +
>> >> > > +	if (index < 0 || index >= rte_pmu->num_group_events)
>> >
>> >Optimized: if (unlikely(index >= rte_pmu.num_group_events))
>> >
>> >> > > +		return 0;
>> >> > > +
>> >> > > +	return rte_pmu_read_userpage((struct perf_event_mmap_page
>> >> > > *)group->mmap_pages[index]);
>> >> >
>> >> > Using fixed size arrays instead of multiple indirections via
>> >> > pointers
>> >> is faster. It could be:
>> >> >
>> >> > return rte_pmu_read_userpage((struct perf_event_mmap_page
>> >> > *)rte_pmu.group[lcore_id].mmap_pages[index]);
>> >> >
>> >> > With our without suggested performance improvements...
>> >> >
>> >> > Series-acked-by: Morten Brørup <mb@smartsharesystems.com>
>> >>
>>
  
Morten Brørup Jan. 8, 2023, 4:30 p.m. UTC | #11
> From: Tomasz Duszynski [mailto:tduszynski@marvell.com]
> Sent: Sunday, 8 January 2023 16.41
> 
> >From: Morten Brørup <mb@smartsharesystems.com>
> >Sent: Thursday, January 5, 2023 11:08 PM
> >
> >> From: Tomasz Duszynski [mailto:tduszynski@marvell.com]
> >> Sent: Thursday, 5 January 2023 22.14
> >>
> >> Hi Morten,
> >>
> >> A few comments inline.
> >>
> >> >From: Morten Brørup <mb@smartsharesystems.com>
> >> >Sent: Wednesday, December 14, 2022 11:41 AM
> >> >
> >> >External Email
> >> >
> >> >-------------------------------------------------------------------
> --
> >> >-
> >> >+CC: Mattias, see my comment below about per-thread constructor for
> >> this
> >> >
> >> >> From: Tomasz Duszynski [mailto:tduszynski@marvell.com]
> >> >> Sent: Wednesday, 14 December 2022 10.39
> >> >>
> >> >> Hello Morten,
> >> >>
> >> >> Thanks for review. Answers inline.
> >> >>

[...]

> >> >> > > +{
> >> >> > > +	int lcore_id = rte_lcore_id();
> >> >> > > +	struct rte_pmu_event_group *group;
> >> >> > > +	int ret;
> >> >> > > +
> >> >> > > +	if (!rte_pmu)
> >> >> > > +		return 0;
> >> >> > > +
> >> >> > > +	group = &rte_pmu->group[lcore_id];
> >> >> > > +	if (!group->enabled) {
> >> >
> >> >Optimized: if (unlikely(!group->enabled)) {
> >> >
> >>
> >> The compiler will optimize the branch itself correctly. An extra
> >> hint is not required.
> >
> >I haven't reviewed the output from this, so I'll take your word for
> it. I suggested the unlikely()
> >because I previously tested some very simple code, and it optimized
> for taking the "if":
> >
> >void testb(bool b)
> >{
> >    if (!b)
> >        exit(1);
> >
> >    exit(99);
> >}
> >
> >I guess I should experiment with more realistic code, and update my
> optimization notes!
> >
> 
> I think this may be too simple to draw far-reaching conclusions from.
> The compiler will make the fall-through path the likely one. If I recall
> correctly, the Intel Optimization Reference Manual has some more info on
> this.

IIRC, the Intel Optimization Reference Manual discusses branch optimization for assembler, not C.

> 
> Let's take a different example.
> 
> int main(int argc, char *argv[])
> {
>         int *p;
> 
>         p = malloc(sizeof(*p));
>         if (!p)
>                 return 1;
>         *p = atoi(argv[1]);
>         if (*p < 0)
>                 return 2;
>         free(p);
> 
>         return 0;
> }
> 
> If compiled with -O3 and disassembled, I got the output below.
> 
> 00000000000010a0 <main>:
>     10a0:       f3 0f 1e fa             endbr64
>     10a4:       55                      push   %rbp
>     10a5:       bf 04 00 00 00          mov    $0x4,%edi
>     10aa:       53                      push   %rbx
>     10ab:       48 89 f3                mov    %rsi,%rbx
>     10ae:       48 83 ec 08             sub    $0x8,%rsp
>     10b2:       e8 d9 ff ff ff          call   1090 <malloc@plt>
>     10b7:       48 85 c0                test   %rax,%rax
>     10ba:       74 31                   je     10ed <main+0x4d>
>     10bc:       48 8b 7b 08             mov    0x8(%rbx),%rdi
>     10c0:       ba 0a 00 00 00          mov    $0xa,%edx
>     10c5:       31 f6                   xor    %esi,%esi
>     10c7:       48 89 c5                mov    %rax,%rbp
>     10ca:       e8 b1 ff ff ff          call   1080 <strtol@plt>
>     10cf:       49 89 c0                mov    %rax,%r8
>     10d2:       b8 02 00 00 00          mov    $0x2,%eax
>     10d7:       45 85 c0                test   %r8d,%r8d
>     10da:       78 0a                   js     10e6 <main+0x46>
>     10dc:       48 89 ef                mov    %rbp,%rdi
>     10df:       e8 8c ff ff ff          call   1070 <free@plt>
>     10e4:       31 c0                   xor    %eax,%eax
>     10e6:       48 83 c4 08             add    $0x8,%rsp
>     10ea:       5b                      pop    %rbx
>     10eb:       5d                      pop    %rbp
>     10ec:       c3                      ret
>     10ed:       b8 01 00 00 00          mov    $0x1,%eax
>     10f2:       eb f2                   jmp    10e6 <main+0x46>
> 
> Looking at both 10ba and 10da suggests that the code was laid out so the
> expected path falls through rather than jumps. Also, the potentially
> least frequently executed code (at 10ed) is pushed further down in
> memory, hence optimizing cache line usage.

In my notes, I have (ptr == NULL) marked as considered unlikely, but (int == 0) marked as considered likely. Since group->enabled is bool, I guessed the compiler would treat it like int and consider (!group->enabled) as likely.

Like in your example here, I also have (int < 0) marked as considered unlikely.
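
For completeness, the macros in rte_branch_prediction.h only forward a
static prediction to the compiler:

#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

So the hint is layout advice, not a correctness matter, which fits the
measurements above.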

> 
> That said, each and every scenario needs analysis on its own.

Agree. Theory is good, validation is better. ;-)

> 
> >You could add the unlikely() for readability purposes. ;-)
> >
> 
> Sure. That won't hurt performance.

I think we are both in agreement about the intentions here, so I won't hold you back with further academic discussions at this point. I might resume the discussion with your next patch version, though. ;-)
  
Ruifeng Wang Jan. 9, 2023, 7:37 a.m. UTC | #12
> -----Original Message-----
> From: Tomasz Duszynski <tduszynski@marvell.com>
> Sent: Tuesday, December 13, 2022 6:44 PM
> To: dev@dpdk.org
> Cc: thomas@monjalon.net; jerinj@marvell.com; mb@smartsharesystems.com; zhoumin@loongson.cn;
> Tomasz Duszynski <tduszynski@marvell.com>
> Subject: [PATCH v4 1/4] eal: add generic support for reading PMU events
> 
> Add support for programming PMU counters and reading their values in runtime bypassing
> kernel completely.
> 
> This is especially useful in cases where CPU cores are isolated
> (nohz_full) i.e run dedicated tasks. In such cases one cannot use standard perf utility
> without sacrificing latency and performance.
> 
> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
> ---
>  app/test/meson.build                  |   1 +
>  app/test/test_pmu.c                   |  41 +++
>  doc/guides/prog_guide/profile_app.rst |   8 +
>  lib/eal/common/meson.build            |   3 +
>  lib/eal/common/pmu_private.h          |  41 +++
>  lib/eal/common/rte_pmu.c              | 456 ++++++++++++++++++++++++++
>  lib/eal/include/meson.build           |   1 +
>  lib/eal/include/rte_pmu.h             | 204 ++++++++++++
>  lib/eal/linux/eal.c                   |   4 +
>  lib/eal/version.map                   |   6 +
>  10 files changed, 765 insertions(+)
>  create mode 100644 app/test/test_pmu.c
>  create mode 100644 lib/eal/common/pmu_private.h  create mode 100644
> lib/eal/common/rte_pmu.c  create mode 100644 lib/eal/include/rte_pmu.h
> 
<snip>
> diff --git a/lib/eal/common/rte_pmu.c b/lib/eal/common/rte_pmu.c new file mode 100644
> index 0000000000..049fe19fe3
> --- /dev/null
> +++ b/lib/eal/common/rte_pmu.c
> @@ -0,0 +1,456 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2022 Marvell International Ltd.
> + */
> +
> +#include <ctype.h>
> +#include <dirent.h>
> +#include <errno.h>
> +#include <regex.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/ioctl.h>
> +#include <sys/mman.h>
> +#include <sys/queue.h>
> +#include <sys/syscall.h>
> +#include <unistd.h>
> +
> +#include <rte_eal_paging.h>
> +#include <rte_pmu.h>
> +#include <rte_tailq.h>
> +
> +#include "pmu_private.h"
> +
> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
> +
> +#ifndef GENMASK_ULL
> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> ((64
> +- 1 - (h))))) #endif
> +
> +#ifndef FIELD_PREP
> +#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) &
> +(m)) #endif
> +
> +struct rte_pmu *rte_pmu;
> +
> +/*
> + * Following __rte_weak functions provide default no-op. Architectures
> +should override them if
> + * necessary.
> + */
> +
> +int
> +__rte_weak pmu_arch_init(void)
> +{
> +	return 0;
> +}
> +
> +void
> +__rte_weak pmu_arch_fini(void)
> +{
> +}
> +
> +void
> +__rte_weak pmu_arch_fixup_config(uint64_t config[3]) {
> +	RTE_SET_USED(config);
> +}
> +
> +static int
> +get_term_format(const char *name, int *num, uint64_t *mask) {
> +	char *config = NULL;
> +	char path[PATH_MAX];
> +	int high, low, ret;
> +	FILE *fp;
> +
> +	/* quiesce -Wmaybe-uninitialized warning */
> +	*num = 0;
> +	*mask = 0;
> +
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu->name,
> name);
> +	fp = fopen(path, "r");
> +	if (!fp)
> +		return -errno;
> +
> +	errno = 0;
> +	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
> +	if (ret < 2) {
> +		ret = -ENODATA;
> +		goto out;
> +	}
> +	if (errno) {
> +		ret = -errno;
> +		goto out;
> +	}
> +
> +	if (ret == 2)
> +		high = low;
> +
> +	*mask = GENMASK_ULL(high, low);
> +	/* Last digit should be [012]. If last digit is missing 0 is implied. */
> +	*num = config[strlen(config) - 1];
> +	*num = isdigit(*num) ? *num - '0' : 0;
> +
> +	ret = 0;
> +out:
> +	free(config);
> +	fclose(fp);
> +
> +	return ret;
> +}
> +
> +static int
> +parse_event(char *buf, uint64_t config[3]) {
> +	char *token, *term;
> +	int num, ret, val;
> +	uint64_t mask;
> +
> +	config[0] = config[1] = config[2] = 0;
> +
> +	token = strtok(buf, ",");
> +	while (token) {
> +		errno = 0;
> +		/* <term>=<value> */
> +		ret = sscanf(token, "%m[^=]=%i", &term, &val);
> +		if (ret < 1)
> +			return -ENODATA;
> +		if (errno)
> +			return -errno;
> +		if (ret == 1)
> +			val = 1;
> +
> +		ret = get_term_format(term, &num, &mask);
> +		free(term);
> +		if (ret)
> +			return ret;
> +
> +		config[num] |= FIELD_PREP(mask, val);
> +		token = strtok(NULL, ",");
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +get_event_config(const char *name, uint64_t config[3]) {
> +	char path[PATH_MAX], buf[BUFSIZ];
> +	FILE *fp;
> +	int ret;
> +
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu->name,
> name);
> +	fp = fopen(path, "r");
> +	if (!fp)
> +		return -errno;
> +
> +	ret = fread(buf, 1, sizeof(buf), fp);
> +	if (ret == 0) {
> +		fclose(fp);
> +
> +		return -EINVAL;
> +	}
> +	fclose(fp);
> +	buf[ret] = '\0';
> +
> +	return parse_event(buf, config);
> +}
> +
> +static int
> +do_perf_event_open(uint64_t config[3], int lcore_id, int group_fd) {
> +	struct perf_event_attr attr = {
> +		.size = sizeof(struct perf_event_attr),
> +		.type = PERF_TYPE_RAW,
> +		.exclude_kernel = 1,
> +		.exclude_hv = 1,
> +		.disabled = 1,
> +	};
> +
> +	pmu_arch_fixup_config(config);
> +
> +	attr.config = config[0];
> +	attr.config1 = config[1];
> +	attr.config2 = config[2];
> +
> +	return syscall(SYS_perf_event_open, &attr, rte_gettid(),

Looks like using '0' instead of rte_gettid() has the same effect. A small optimization.
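
Per perf_event_open(2), pid == 0 with cpu >= 0 measures the calling
thread while it runs on the given CPU, which is exactly what the
rte_gettid() lookup resolves to here. I.e. (sketch):

	return syscall(SYS_perf_event_open, &attr, 0 /* calling thread */,
		       rte_lcore_to_cpu_id(lcore_id), group_fd, 0);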

> rte_lcore_to_cpu_id(lcore_id),
> +		       group_fd, 0);
> +}
> +
> +static int
> +open_events(int lcore_id)
> +{
> +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> +	struct rte_pmu_event *event;
> +	uint64_t config[3];
> +	int num = 0, ret;
> +
> +	/* group leader gets created first, with fd = -1 */
> +	group->fds[0] = -1;
> +
> +	TAILQ_FOREACH(event, &rte_pmu->event_list, next) {
> +		ret = get_event_config(event->name, config);
> +		if (ret) {
> +			RTE_LOG(ERR, EAL, "failed to get %s event config\n", event->name);
> +			continue;
> +		}
> +
> +		ret = do_perf_event_open(config, lcore_id, group->fds[0]);
> +		if (ret == -1) {
> +			if (errno == EOPNOTSUPP)
> +				RTE_LOG(ERR, EAL, "64 bit counters not supported\n");
> +
> +			ret = -errno;
> +			goto out;
> +		}
> +
> +		group->fds[event->index] = ret;
> +		num++;
> +	}
> +
> +	return 0;
> +out:
> +	for (--num; num >= 0; num--) {
> +		close(group->fds[num]);
> +		group->fds[num] = -1;
> +	}
> +
> +
> +	return ret;
> +}
> +
> +static int
> +mmap_events(int lcore_id)
> +{
> +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> +	void *addr;
> +	int ret, i;
> +
> +	for (i = 0; i < rte_pmu->num_group_events; i++) {
> +		addr = mmap(0, rte_mem_page_size(), PROT_READ, MAP_SHARED, group->fds[i], 0);
> +		if (addr == MAP_FAILED) {
> +			ret = -errno;
> +			goto out;
> +		}
> +
> +		group->mmap_pages[i] = addr;
> +	}
> +
> +	return 0;
> +out:
> +	for (; i; i--) {
> +		munmap(group->mmap_pages[i - 1], rte_mem_page_size());
> +		group->mmap_pages[i - 1] = NULL;
> +	}
> +
> +	return ret;
> +}
> +
> +static void
> +cleanup_events(int lcore_id)
> +{
> +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> +	int i;
> +
> +	if (!group->fds)
> +		return;
> +
> +	if (group->fds[0] != -1)
> +		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
> +
> +	for (i = 0; i < rte_pmu->num_group_events; i++) {
> +		if (group->mmap_pages[i]) {
> +			munmap(group->mmap_pages[i], rte_mem_page_size());
> +			group->mmap_pages[i] = NULL;
> +		}
> +
> +		if (group->fds[i] != -1) {
> +			close(group->fds[i]);
> +			group->fds[i] = -1;
> +		}
> +	}
> +
> +	free(group->mmap_pages);
> +	free(group->fds);
> +
> +	group->mmap_pages = NULL;
> +	group->fds = NULL;
> +	group->enabled = false;
> +}
> +
> +int __rte_noinline
> +rte_pmu_enable_group(int lcore_id)
> +{
> +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
> +	int ret;
> +
> +	if (rte_pmu->num_group_events == 0) {
> +		RTE_LOG(DEBUG, EAL, "no matching PMU events\n");
> +
> +		return 0;
> +	}
> +
> +	group->fds = calloc(rte_pmu->num_group_events, sizeof(*group->fds));
> +	if (!group->fds) {
> +		RTE_LOG(ERR, EAL, "failed to alloc descriptor memory\n");
> +
> +		return -ENOMEM;
> +	}
> +
> +	group->mmap_pages = calloc(rte_pmu->num_group_events, sizeof(*group->mmap_pages));
> +	if (!group->mmap_pages) {
> +		RTE_LOG(ERR, EAL, "failed to alloc userpage memory\n");
> +
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +
> +	ret = open_events(lcore_id);
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "failed to open events on lcore-worker-%d\n", lcore_id);
> +		goto out;
> +	}
> +
> +	ret = mmap_events(lcore_id);
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "failed to map events on lcore-worker-%d\n", lcore_id);
> +		goto out;
> +	}
> +
> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
> +		RTE_LOG(ERR, EAL, "failed to enable events on lcore-worker-%d\n",
> +lcore_id);
> +
> +		ret = -errno;
> +		goto out;
> +	}
> +
> +	return 0;
> +
> +out:
> +	cleanup_events(lcore_id);
> +
> +	return ret;
> +}
> +
> +static int
> +scan_pmus(void)
> +{
> +	char path[PATH_MAX];
> +	struct dirent *dent;
> +	const char *name;
> +	DIR *dirp;
> +
> +	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
> +	if (!dirp)
> +		return -errno;
> +
> +	while ((dent = readdir(dirp))) {
> +		name = dent->d_name;
> +		if (name[0] == '.')
> +			continue;
> +
> +		/* sysfs entry should either contain cpus or be a cpu */
> +		if (!strcmp(name, "cpu"))
> +			break;
> +
> +		snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
> +		if (access(path, F_OK) == 0)
> +			break;
> +	}
> +
> +	closedir(dirp);
> +
> +	if (dent) {
> +		rte_pmu->name = strdup(name);
> +		if (!rte_pmu->name)
> +			return -ENOMEM;
> +	}
> +
> +	return rte_pmu->name ? 0 : -ENODEV;
> +}
> +
> +int
> +rte_pmu_add_event(const char *name)
> +{
> +	struct rte_pmu_event *event;
> +	char path[PATH_MAX];
> +
> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu->name,
> name);

Better to check if rte_pmu is available.
See below.
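
E.g. a guard at the top of the function (sketch):

	if (rte_pmu == NULL)
		return -ENODEV;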

> +	if (access(path, R_OK))
> +		return -ENODEV;
> +
> +	TAILQ_FOREACH(event, &rte_pmu->event_list, next) {
> +		if (!strcmp(event->name, name))
> +			return event->index;
> +		continue;
> +	}
> +
> +	event = calloc(1, sizeof(*event));
> +	if (!event)
> +		return -ENOMEM;
> +
> +	event->name = strdup(name);
> +	if (!event->name) {
> +		free(event);
> +
> +		return -ENOMEM;
> +	}
> +
> +	event->index = rte_pmu->num_group_events++;
> +	TAILQ_INSERT_TAIL(&rte_pmu->event_list, event, next);
> +
> +	RTE_LOG(DEBUG, EAL, "%s event added at index %d\n", name,
> +event->index);
> +
> +	return event->index;
> +}
> +
> +void
> +eal_pmu_init(void)
> +{
> +	int ret;
> +
> +	rte_pmu = calloc(1, sizeof(*rte_pmu));
> +	if (!rte_pmu) {
> +		RTE_LOG(ERR, EAL, "failed to alloc PMU\n");
> +
> +		return;
> +	}
> +
> +	TAILQ_INIT(&rte_pmu->event_list);
> +
> +	ret = scan_pmus();
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "failed to find core pmu\n");
> +		goto out;
> +	}
> +
> +	ret = pmu_arch_init();
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "failed to setup arch for PMU\n");
> +		goto out;
> +	}
> +
> +	return;
> +out:
> +	free(rte_pmu->name);
> +	free(rte_pmu);

Set rte_pmu to NULL to prevent unintentional use?
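
I.e. (sketch):

out:
	free(rte_pmu->name);
	free(rte_pmu);
	rte_pmu = NULL;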

> +}
> +
> +void
> +eal_pmu_fini(void)
> +{
> +	struct rte_pmu_event *event, *tmp;
> +	int lcore_id;
> +
> +	RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu->event_list, next, tmp) {

rte_pmu can be unavailable if init fails. Better to check before accessing.
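
E.g. an early return (sketch):

	if (rte_pmu == NULL)
		return;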

> +		TAILQ_REMOVE(&rte_pmu->event_list, event, next);
> +		free(event->name);
> +		free(event);
> +	}
> +
> +	RTE_LCORE_FOREACH_WORKER(lcore_id)
> +		cleanup_events(lcore_id);
> +
> +	pmu_arch_fini();
> +	free(rte_pmu->name);
> +	free(rte_pmu);
> +}
<snip>
  
Tomasz Duszynski Jan. 9, 2023, 3:40 p.m. UTC | #13
Hi Ruifeng, 

>-----Original Message-----
>From: Ruifeng Wang <Ruifeng.Wang@arm.com>
>Sent: Monday, January 9, 2023 8:37 AM
>To: Tomasz Duszynski <tduszynski@marvell.com>; dev@dpdk.org
>Cc: thomas@monjalon.net; Jerin Jacob Kollanukkaran <jerinj@marvell.com>; mb@smartsharesystems.com;
>zhoumin@loongson.cn; nd <nd@arm.com>
>Subject: [EXT] RE: [PATCH v4 1/4] eal: add generic support for reading PMU events
>
>External Email
>
>----------------------------------------------------------------------
>> -----Original Message-----
>> From: Tomasz Duszynski <tduszynski@marvell.com>
>> Sent: Tuesday, December 13, 2022 6:44 PM
>> To: dev@dpdk.org
>> Cc: thomas@monjalon.net; jerinj@marvell.com; mb@smartsharesystems.com;
>> zhoumin@loongson.cn; Tomasz Duszynski <tduszynski@marvell.com>
>> Subject: [PATCH v4 1/4] eal: add generic support for reading PMU
>> events
>>
>> Add support for programming PMU counters and reading their values in
>> runtime bypassing kernel completely.
>>
>> This is especially useful in cases where CPU cores are isolated
>> (nohz_full) i.e run dedicated tasks. In such cases one cannot use
>> standard perf utility without sacrificing latency and performance.
>>
>> Signed-off-by: Tomasz Duszynski <tduszynski@marvell.com>
>> ---
>>  app/test/meson.build                  |   1 +
>>  app/test/test_pmu.c                   |  41 +++
>>  doc/guides/prog_guide/profile_app.rst |   8 +
>>  lib/eal/common/meson.build            |   3 +
>>  lib/eal/common/pmu_private.h          |  41 +++
>>  lib/eal/common/rte_pmu.c              | 456 ++++++++++++++++++++++++++
>>  lib/eal/include/meson.build           |   1 +
>>  lib/eal/include/rte_pmu.h             | 204 ++++++++++++
>>  lib/eal/linux/eal.c                   |   4 +
>>  lib/eal/version.map                   |   6 +
>>  10 files changed, 765 insertions(+)
>>  create mode 100644 app/test/test_pmu.c  create mode 100644
>> lib/eal/common/pmu_private.h  create mode 100644
>> lib/eal/common/rte_pmu.c  create mode 100644 lib/eal/include/rte_pmu.h
>>
><snip>
>> diff --git a/lib/eal/common/rte_pmu.c b/lib/eal/common/rte_pmu.c new
>> file mode 100644 index 0000000000..049fe19fe3
>> --- /dev/null
>> +++ b/lib/eal/common/rte_pmu.c
>> @@ -0,0 +1,456 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(C) 2022 Marvell International Ltd.
>> + */
>> +
>> +#include <ctype.h>
>> +#include <dirent.h>
>> +#include <errno.h>
>> +#include <regex.h>
>> +#include <stdlib.h>
>> +#include <string.h>
>> +#include <sys/ioctl.h>
>> +#include <sys/mman.h>
>> +#include <sys/queue.h>
>> +#include <sys/syscall.h>
>> +#include <unistd.h>
>> +
>> +#include <rte_eal_paging.h>
>> +#include <rte_pmu.h>
>> +#include <rte_tailq.h>
>> +
>> +#include "pmu_private.h"
>> +
>> +#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
>> +
>> +#ifndef GENMASK_ULL
>> +#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >>
>> +((64
>> +- 1 - (h))))) #endif
>> +
>> +#ifndef FIELD_PREP
>> +#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1))
>> +&
>> +(m)) #endif
>> +
>> +struct rte_pmu *rte_pmu;
>> +
>> +/*
>> + * Following __rte_weak functions provide default no-op.
>> +Architectures should override them if
>> + * necessary.
>> + */
>> +
>> +int
>> +__rte_weak pmu_arch_init(void)
>> +{
>> +	return 0;
>> +}
>> +
>> +void
>> +__rte_weak pmu_arch_fini(void)
>> +{
>> +}
>> +
>> +void
>> +__rte_weak pmu_arch_fixup_config(uint64_t config[3]) {
>> +	RTE_SET_USED(config);
>> +}
>> +
>> +static int
>> +get_term_format(const char *name, int *num, uint64_t *mask) {
>> +	char *config = NULL;
>> +	char path[PATH_MAX];
>> +	int high, low, ret;
>> +	FILE *fp;
>> +
>> +	/* quiesce -Wmaybe-uninitialized warning */
>> +	*num = 0;
>> +	*mask = 0;
>> +
>> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH
>> +"/%s/format/%s", rte_pmu->name,
>> name);
>> +	fp = fopen(path, "r");
>> +	if (!fp)
>> +		return -errno;
>> +
>> +	errno = 0;
>> +	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
>> +	if (ret < 2) {
>> +		ret = -ENODATA;
>> +		goto out;
>> +	}
>> +	if (errno) {
>> +		ret = -errno;
>> +		goto out;
>> +	}
>> +
>> +	if (ret == 2)
>> +		high = low;
>> +
>> +	*mask = GENMASK_ULL(high, low);
>> +	/* Last digit should be [012]. If last digit is missing 0 is implied. */
>> +	*num = config[strlen(config) - 1];
>> +	*num = isdigit(*num) ? *num - '0' : 0;
>> +
>> +	ret = 0;
>> +out:
>> +	free(config);
>> +	fclose(fp);
>> +
>> +	return ret;
>> +}
>> +
>> +static int
>> +parse_event(char *buf, uint64_t config[3]) {
>> +	char *token, *term;
>> +	int num, ret, val;
>> +	uint64_t mask;
>> +
>> +	config[0] = config[1] = config[2] = 0;
>> +
>> +	token = strtok(buf, ",");
>> +	while (token) {
>> +		errno = 0;
>> +		/* <term>=<value> */
>> +		ret = sscanf(token, "%m[^=]=%i", &term, &val);
>> +		if (ret < 1)
>> +			return -ENODATA;
>> +		if (errno)
>> +			return -errno;
>> +		if (ret == 1)
>> +			val = 1;
>> +
>> +		ret = get_term_format(term, &num, &mask);
>> +		free(term);
>> +		if (ret)
>> +			return ret;
>> +
>> +		config[num] |= FIELD_PREP(mask, val);
>> +		token = strtok(NULL, ",");
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int
>> +get_event_config(const char *name, uint64_t config[3]) {
>> +	char path[PATH_MAX], buf[BUFSIZ];
>> +	FILE *fp;
>> +	int ret;
>> +
>> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH
>> +"/%s/events/%s", rte_pmu->name,
>> name);
>> +	fp = fopen(path, "r");
>> +	if (!fp)
>> +		return -errno;
>> +
>> +	ret = fread(buf, 1, sizeof(buf), fp);
>> +	if (ret == 0) {
>> +		fclose(fp);
>> +
>> +		return -EINVAL;
>> +	}
>> +	fclose(fp);
>> +	buf[ret] = '\0';
>> +
>> +	return parse_event(buf, config);
>> +}
>> +
>> +static int
>> +do_perf_event_open(uint64_t config[3], int lcore_id, int group_fd) {
>> +	struct perf_event_attr attr = {
>> +		.size = sizeof(struct perf_event_attr),
>> +		.type = PERF_TYPE_RAW,
>> +		.exclude_kernel = 1,
>> +		.exclude_hv = 1,
>> +		.disabled = 1,
>> +	};
>> +
>> +	pmu_arch_fixup_config(config);
>> +
>> +	attr.config = config[0];
>> +	attr.config1 = config[1];
>> +	attr.config2 = config[2];
>> +
>> +	return syscall(SYS_perf_event_open, &attr, rte_gettid(),
>
>Looks like using '0' instead of rte_gettid() has the same effect. A small optimization.
>
>> rte_lcore_to_cpu_id(lcore_id),
>> +		       group_fd, 0);
>> +}
>> +
>> +static int
>> +open_events(int lcore_id)
>> +{
>> +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
>> +	struct rte_pmu_event *event;
>> +	uint64_t config[3];
>> +	int num = 0, ret;
>> +
>> +	/* group leader gets created first, with fd = -1 */
>> +	group->fds[0] = -1;
>> +
>> +	TAILQ_FOREACH(event, &rte_pmu->event_list, next) {
>> +		ret = get_event_config(event->name, config);
>> +		if (ret) {
>> +			RTE_LOG(ERR, EAL, "failed to get %s event config\n", event->name);
>> +			continue;
>> +		}
>> +
>> +		ret = do_perf_event_open(config, lcore_id, group->fds[0]);
>> +		if (ret == -1) {
>> +			if (errno == EOPNOTSUPP)
>> +				RTE_LOG(ERR, EAL, "64 bit counters not supported\n");
>> +
>> +			ret = -errno;
>> +			goto out;
>> +		}
>> +
>> +		group->fds[event->index] = ret;
>> +		num++;
>> +	}
>> +
>> +	return 0;
>> +out:
>> +	for (--num; num >= 0; num--) {
>> +		close(group->fds[num]);
>> +		group->fds[num] = -1;
>> +	}
>> +
>> +
>> +	return ret;
>> +}
>> +
>> +static int
>> +mmap_events(int lcore_id)
>> +{
>> +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
>> +	void *addr;
>> +	int ret, i;
>> +
>> +	for (i = 0; i < rte_pmu->num_group_events; i++) {
>> +		addr = mmap(0, rte_mem_page_size(), PROT_READ, MAP_SHARED, group->fds[i], 0);
>> +		if (addr == MAP_FAILED) {
>> +			ret = -errno;
>> +			goto out;
>> +		}
>> +
>> +		group->mmap_pages[i] = addr;
>> +	}
>> +
>> +	return 0;
>> +out:
>> +	for (; i; i--) {
>> +		munmap(group->mmap_pages[i - 1], rte_mem_page_size());
>> +		group->mmap_pages[i - 1] = NULL;
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +static void
>> +cleanup_events(int lcore_id)
>> +{
>> +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
>> +	int i;
>> +
>> +	if (!group->fds)
>> +		return;
>> +
>> +	if (group->fds[0] != -1)
>> +		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
>> +
>> +	for (i = 0; i < rte_pmu->num_group_events; i++) {
>> +		if (group->mmap_pages[i]) {
>> +			munmap(group->mmap_pages[i], rte_mem_page_size());
>> +			group->mmap_pages[i] = NULL;
>> +		}
>> +
>> +		if (group->fds[i] != -1) {
>> +			close(group->fds[i]);
>> +			group->fds[i] = -1;
>> +		}
>> +	}
>> +
>> +	free(group->mmap_pages);
>> +	free(group->fds);
>> +
>> +	group->mmap_pages = NULL;
>> +	group->fds = NULL;
>> +	group->enabled = false;
>> +}
>> +
>> +int __rte_noinline
>> +rte_pmu_enable_group(int lcore_id)
>> +{
>> +	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
>> +	int ret;
>> +
>> +	if (rte_pmu->num_group_events == 0) {
>> +		RTE_LOG(DEBUG, EAL, "no matching PMU events\n");
>> +
>> +		return 0;
>> +	}
>> +
>> +	group->fds = calloc(rte_pmu->num_group_events, sizeof(*group->fds));
>> +	if (!group->fds) {
>> +		RTE_LOG(ERR, EAL, "failed to alloc descriptor memory\n");
>> +
>> +		return -ENOMEM;
>> +	}
>> +
>> +	group->mmap_pages = calloc(rte_pmu->num_group_events, sizeof(*group->mmap_pages));
>> +	if (!group->mmap_pages) {
>> +		RTE_LOG(ERR, EAL, "failed to alloc userpage memory\n");
>> +
>> +		ret = -ENOMEM;
>> +		goto out;
>> +	}
>> +
>> +	ret = open_events(lcore_id);
>> +	if (ret) {
>> +		RTE_LOG(ERR, EAL, "failed to open events on lcore-worker-%d\n", lcore_id);
>> +		goto out;
>> +	}
>> +
>> +	ret = mmap_events(lcore_id);
>> +	if (ret) {
>> +		RTE_LOG(ERR, EAL, "failed to map events on lcore-worker-%d\n", lcore_id);
>> +		goto out;
>> +	}
>> +
>> +	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
>> +		RTE_LOG(ERR, EAL, "failed to enable events on lcore-worker-%d\n",
>> +lcore_id);
>> +
>> +		ret = -errno;
>> +		goto out;
>> +	}
>> +
>> +	return 0;
>> +
>> +out:
>> +	cleanup_events(lcore_id);
>> +
>> +	return ret;
>> +}
>> +
>> +static int
>> +scan_pmus(void)
>> +{
>> +	char path[PATH_MAX];
>> +	struct dirent *dent;
>> +	const char *name;
>> +	DIR *dirp;
>> +
>> +	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
>> +	if (!dirp)
>> +		return -errno;
>> +
>> +	while ((dent = readdir(dirp))) {
>> +		name = dent->d_name;
>> +		if (name[0] == '.')
>> +			continue;
>> +
>> +		/* sysfs entry should either contain cpus or be a cpu */
>> +		if (!strcmp(name, "cpu"))
>> +			break;
>> +
>> +		snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
>> +		if (access(path, F_OK) == 0)
>> +			break;
>> +	}
>> +
>> +	closedir(dirp);
>> +
>> +	if (dent) {
>> +		rte_pmu->name = strdup(name);
>> +		if (!rte_pmu->name)
>> +			return -ENOMEM;
>> +	}
>> +
>> +	return rte_pmu->name ? 0 : -ENODEV;
>> +}
>> +
>> +int
>> +rte_pmu_add_event(const char *name)
>> +{
>> +	struct rte_pmu_event *event;
>> +	char path[PATH_MAX];
>> +
>> +	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH
>> +"/%s/events/%s", rte_pmu->name,
>> name);
>
>Better to check if rte_pmu is available.
>See below.
>
>> +	if (access(path, R_OK))
>> +		return -ENODEV;
>> +
>> +	TAILQ_FOREACH(event, &rte_pmu->event_list, next) {
>> +		if (!strcmp(event->name, name))
>> +			return event->index;
>> +		continue;
>> +	}
>> +
>> +	event = calloc(1, sizeof(*event));
>> +	if (!event)
>> +		return -ENOMEM;
>> +
>> +	event->name = strdup(name);
>> +	if (!event->name) {
>> +		free(event);
>> +
>> +		return -ENOMEM;
>> +	}
>> +
>> +	event->index = rte_pmu->num_group_events++;
>> +	TAILQ_INSERT_TAIL(&rte_pmu->event_list, event, next);
>> +
>> +	RTE_LOG(DEBUG, EAL, "%s event added at index %d\n", name,
>> +event->index);
>> +
>> +	return event->index;
>> +}
>> +
>> +void
>> +eal_pmu_init(void)
>> +{
>> +	int ret;
>> +
>> +	rte_pmu = calloc(1, sizeof(*rte_pmu));
>> +	if (!rte_pmu) {
>> +		RTE_LOG(ERR, EAL, "failed to alloc PMU\n");
>> +
>> +		return;
>> +	}
>> +
>> +	TAILQ_INIT(&rte_pmu->event_list);
>> +
>> +	ret = scan_pmus();
>> +	if (ret) {
>> +		RTE_LOG(ERR, EAL, "failed to find core pmu\n");
>> +		goto out;
>> +	}
>> +
>> +	ret = pmu_arch_init();
>> +	if (ret) {
>> +		RTE_LOG(ERR, EAL, "failed to setup arch for PMU\n");
>> +		goto out;
>> +	}
>> +
>> +	return;
>> +out:
>> +	free(rte_pmu->name);
>> +	free(rte_pmu);
>
>Set rte_pmu to NULL to prevent unintentional use?
>

The next series will make use of a global pmu instance, so this will no
longer be required, though your suggestion may be applied to other
pointers around.
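
Roughly, the statically allocated variant would look like below (a sketch
of the direction, not the final code):

struct rte_pmu rte_pmu;	/* instead of: struct rte_pmu *rte_pmu */

with fast-path checks such as "if (!rte_pmu)" becoming e.g.
"if (rte_pmu.name == NULL)".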

>> +}
>> +
>> +void
>> +eal_pmu_fini(void)
>> +{
>> +	struct rte_pmu_event *event, *tmp;
>> +	int lcore_id;
>> +
>> +	RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu->event_list, next, tmp) {
>
>rte_pmu can be unavailable if init fails. Better to check before accessing.
>

Yep. 

>> +		TAILQ_REMOVE(&rte_pmu->event_list, event, next);
>> +		free(event->name);
>> +		free(event);
>> +	}
>> +
>> +	RTE_LCORE_FOREACH_WORKER(lcore_id)
>> +		cleanup_events(lcore_id);
>> +
>> +	pmu_arch_fini();
>> +	free(rte_pmu->name);
>> +	free(rte_pmu);
>> +}
><snip>
  

Patch

diff --git a/app/test/meson.build b/app/test/meson.build
index f34d19e3c3..93b3300309 100644
--- a/app/test/meson.build
+++ b/app/test/meson.build
@@ -143,6 +143,7 @@  test_sources = files(
         'test_timer_racecond.c',
         'test_timer_secondary.c',
         'test_ticketlock.c',
+        'test_pmu.c',
         'test_trace.c',
         'test_trace_register.c',
         'test_trace_perf.c',
diff --git a/app/test/test_pmu.c b/app/test/test_pmu.c
new file mode 100644
index 0000000000..fd331af9ee
--- /dev/null
+++ b/app/test/test_pmu.c
@@ -0,0 +1,41 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell International Ltd.
+ */
+
+#include <rte_pmu.h>
+
+#include "test.h"
+
+static int
+test_pmu_read(void)
+{
+	uint64_t val = 0;
+	int tries = 10;
+	int event = -1;
+
+	while (tries--)
+		val += rte_pmu_read(event);
+
+	if (val == 0)
+		return TEST_FAILED;
+
+	return TEST_SUCCESS;
+}
+
+static struct unit_test_suite pmu_tests = {
+	.suite_name = "pmu autotest",
+	.setup = NULL,
+	.teardown = NULL,
+	.unit_test_cases = {
+		TEST_CASE(test_pmu_read),
+		TEST_CASES_END()
+	}
+};
+
+static int
+test_pmu(void)
+{
+	return unit_test_suite_runner(&pmu_tests);
+}
+
+REGISTER_TEST_COMMAND(pmu_autotest, test_pmu);
diff --git a/doc/guides/prog_guide/profile_app.rst b/doc/guides/prog_guide/profile_app.rst
index 14292d4c25..a8b501fe0c 100644
--- a/doc/guides/prog_guide/profile_app.rst
+++ b/doc/guides/prog_guide/profile_app.rst
@@ -7,6 +7,14 @@  Profile Your Application
 The following sections describe methods of profiling DPDK applications on
 different architectures.
 
+Performance counter based profiling
+-----------------------------------
+
+The majority of architectures support some sort of hardware measurement unit which provides a set
+of programmable counters that monitor specific events. There are different tools which can gather
+that information, perf being an example here. Though in some scenarios, e.g. when CPU cores are
+isolated (nohz_full) and run dedicated tasks, using perf is less than ideal. In such cases one can
+read specific events directly from the application via ``rte_pmu_read()``.
 
 Profiling on x86
 ----------------
diff --git a/lib/eal/common/meson.build b/lib/eal/common/meson.build
index 917758cc65..d6d05b56f3 100644
--- a/lib/eal/common/meson.build
+++ b/lib/eal/common/meson.build
@@ -38,6 +38,9 @@  sources += files(
         'rte_service.c',
         'rte_version.c',
 )
+if is_linux
+    sources += files('rte_pmu.c')
+endif
 if is_linux or is_windows
     sources += files('eal_common_dynmem.c')
 endif
diff --git a/lib/eal/common/pmu_private.h b/lib/eal/common/pmu_private.h
new file mode 100644
index 0000000000..cade4245e6
--- /dev/null
+++ b/lib/eal/common/pmu_private.h
@@ -0,0 +1,41 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Marvell
+ */
+
+#ifndef _PMU_PRIVATE_H_
+#define _PMU_PRIVATE_H_
+
+/**
+ * Architecture specific PMU init callback.
+ *
+ * @return
+ *   0 in case of success, negative value otherwise.
+ */
+int
+pmu_arch_init(void);
+
+/**
+ * Architecture specific PMU cleanup callback.
+ */
+void
+pmu_arch_fini(void);
+
+/**
+ * Apply architecture specific settings to config before passing it to syscall.
+ */
+void
+pmu_arch_fixup_config(uint64_t config[3]);
+
+/**
+ * Initialize PMU tracing internals.
+ */
+void
+eal_pmu_init(void);
+
+/**
+ * Cleanup PMU internals.
+ */
+void
+eal_pmu_fini(void);
+
+#endif /* _PMU_PRIVATE_H_ */
diff --git a/lib/eal/common/rte_pmu.c b/lib/eal/common/rte_pmu.c
new file mode 100644
index 0000000000..049fe19fe3
--- /dev/null
+++ b/lib/eal/common/rte_pmu.c
@@ -0,0 +1,456 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell International Ltd.
+ */
+
+#include <ctype.h>
+#include <dirent.h>
+#include <errno.h>
+#include <regex.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include <rte_eal_paging.h>
+#include <rte_pmu.h>
+#include <rte_tailq.h>
+
+#include "pmu_private.h"
+
+#define EVENT_SOURCE_DEVICES_PATH "/sys/bus/event_source/devices"
+
+#ifndef GENMASK_ULL
+#define GENMASK_ULL(h, l) ((~0ULL - (1ULL << (l)) + 1) & (~0ULL >> ((64 - 1 - (h)))))
+#endif
+
+#ifndef FIELD_PREP
+#define FIELD_PREP(m, v) (((uint64_t)(v) << (__builtin_ffsll(m) - 1)) & (m))
+#endif
+
+struct rte_pmu *rte_pmu;
+
+/*
+ * Following __rte_weak functions provide default no-op. Architectures should override them if
+ * necessary.
+ */
+
+int
+__rte_weak pmu_arch_init(void)
+{
+	return 0;
+}
+
+void
+__rte_weak pmu_arch_fini(void)
+{
+}
+
+void
+__rte_weak pmu_arch_fixup_config(uint64_t config[3])
+{
+	RTE_SET_USED(config);
+}
+
+static int
+get_term_format(const char *name, int *num, uint64_t *mask)
+{
+	char *config = NULL;
+	char path[PATH_MAX];
+	int high, low, ret;
+	FILE *fp;
+
+	/* quiesce -Wmaybe-uninitialized warning */
+	*num = 0;
+	*mask = 0;
+
+	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/format/%s", rte_pmu->name, name);
+	fp = fopen(path, "r");
+	if (!fp)
+		return -errno;
+
+	errno = 0;
+	ret = fscanf(fp, "%m[^:]:%d-%d", &config, &low, &high);
+	if (ret < 2) {
+		ret = -ENODATA;
+		goto out;
+	}
+	if (errno) {
+		ret = -errno;
+		goto out;
+	}
+
+	if (ret == 2)
+		high = low;
+
+	*mask = GENMASK_ULL(high, low);
+	/* Last digit should be [012]. If last digit is missing 0 is implied. */
+	*num = config[strlen(config) - 1];
+	*num = isdigit(*num) ? *num - '0' : 0;
+
+	ret = 0;
+out:
+	free(config);
+	fclose(fp);
+
+	return ret;
+}
+
+static int
+parse_event(char *buf, uint64_t config[3])
+{
+	char *token, *term;
+	int num, ret, val;
+	uint64_t mask;
+
+	config[0] = config[1] = config[2] = 0;
+
+	token = strtok(buf, ",");
+	while (token) {
+		errno = 0;
+		/* <term>=<value> */
+		ret = sscanf(token, "%m[^=]=%i", &term, &val);
+		if (ret < 1)
+			return -ENODATA;
+		if (errno)
+			return -errno;
+		if (ret == 1)
+			val = 1;
+
+		ret = get_term_format(term, &num, &mask);
+		free(term);
+		if (ret)
+			return ret;
+
+		config[num] |= FIELD_PREP(mask, val);
+		token = strtok(NULL, ",");
+	}
+
+	return 0;
+}
+
+static int
+get_event_config(const char *name, uint64_t config[3])
+{
+	char path[PATH_MAX], buf[BUFSIZ];
+	FILE *fp;
+	int ret;
+
+	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu->name, name);
+	fp = fopen(path, "r");
+	if (!fp)
+		return -errno;
+
+	ret = fread(buf, 1, sizeof(buf), fp);
+	if (ret == 0) {
+		fclose(fp);
+
+		return -EINVAL;
+	}
+	fclose(fp);
+	buf[ret] = '\0';
+
+	return parse_event(buf, config);
+}
+
+static int
+do_perf_event_open(uint64_t config[3], int lcore_id, int group_fd)
+{
+	struct perf_event_attr attr = {
+		.size = sizeof(struct perf_event_attr),
+		.type = PERF_TYPE_RAW,
+		.exclude_kernel = 1,
+		.exclude_hv = 1,
+		.disabled = 1,
+	};
+
+	pmu_arch_fixup_config(config);
+
+	attr.config = config[0];
+	attr.config1 = config[1];
+	attr.config2 = config[2];
+
+	return syscall(SYS_perf_event_open, &attr, rte_gettid(), rte_lcore_to_cpu_id(lcore_id),
+		       group_fd, 0);
+}
+
+static int
+open_events(int lcore_id)
+{
+	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
+	struct rte_pmu_event *event;
+	uint64_t config[3];
+	int num = 0, ret;
+
+	/* group leader gets created first, with fd = -1 */
+	group->fds[0] = -1;
+
+	TAILQ_FOREACH(event, &rte_pmu->event_list, next) {
+		ret = get_event_config(event->name, config);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "failed to get %s event config\n", event->name);
+			continue;
+		}
+
+		ret = do_perf_event_open(config, lcore_id, group->fds[0]);
+		if (ret == -1) {
+			if (errno == EOPNOTSUPP)
+				RTE_LOG(ERR, EAL, "64 bit counters not supported\n");
+
+			ret = -errno;
+			goto out;
+		}
+
+		group->fds[event->index] = ret;
+		num++;
+	}
+
+	return 0;
+out:
+	for (--num; num >= 0; num--) {
+		close(group->fds[num]);
+		group->fds[num] = -1;
+	}
+
+
+	return ret;
+}
+
+static int
+mmap_events(int lcore_id)
+{
+	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
+	void *addr;
+	int ret, i;
+
+	for (i = 0; i < rte_pmu->num_group_events; i++) {
+		addr = mmap(0, rte_mem_page_size(), PROT_READ, MAP_SHARED, group->fds[i], 0);
+		if (addr == MAP_FAILED) {
+			ret = -errno;
+			goto out;
+		}
+
+		group->mmap_pages[i] = addr;
+	}
+
+	return 0;
+out:
+	for (; i; i--) {
+		munmap(group->mmap_pages[i - 1], rte_mem_page_size());
+		group->mmap_pages[i - 1] = NULL;
+	}
+
+	return ret;
+}
+
+static void
+cleanup_events(int lcore_id)
+{
+	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
+	int i;
+
+	if (!group->fds)
+		return;
+
+	if (group->fds[0] != -1)
+		ioctl(group->fds[0], PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
+
+	for (i = 0; i < rte_pmu->num_group_events; i++) {
+		if (group->mmap_pages[i]) {
+			munmap(group->mmap_pages[i], rte_mem_page_size());
+			group->mmap_pages[i] = NULL;
+		}
+
+		if (group->fds[i] != -1) {
+			close(group->fds[i]);
+			group->fds[i] = -1;
+		}
+	}
+
+	free(group->mmap_pages);
+	free(group->fds);
+
+	group->mmap_pages = NULL;
+	group->fds = NULL;
+	group->enabled = false;
+}
+
+int __rte_noinline
+rte_pmu_enable_group(int lcore_id)
+{
+	struct rte_pmu_event_group *group = &rte_pmu->group[lcore_id];
+	int ret;
+
+	if (rte_pmu->num_group_events == 0) {
+		RTE_LOG(DEBUG, EAL, "no matching PMU events\n");
+
+		return 0;
+	}
+
+	group->fds = calloc(rte_pmu->num_group_events, sizeof(*group->fds));
+	if (!group->fds) {
+		RTE_LOG(ERR, EAL, "failed to alloc descriptor memory\n");
+
+		return -ENOMEM;
+	}
+
+	group->mmap_pages = calloc(rte_pmu->num_group_events, sizeof(*group->mmap_pages));
+	if (!group->mmap_pages) {
+		RTE_LOG(ERR, EAL, "failed to alloc userpage memory\n");
+
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = open_events(lcore_id);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "failed to open events on lcore-worker-%d\n", lcore_id);
+		goto out;
+	}
+
+	ret = mmap_events(lcore_id);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "failed to map events on lcore-worker-%d\n", lcore_id);
+		goto out;
+	}
+
+	if (ioctl(group->fds[0], PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
+		RTE_LOG(ERR, EAL, "failed to enable events on lcore-worker-%d\n", lcore_id);
+
+		ret = -errno;
+		goto out;
+	}
+
+	return 0;
+
+out:
+	cleanup_events(lcore_id);
+
+	return ret;
+}
+
+static int
+scan_pmus(void)
+{
+	char path[PATH_MAX];
+	struct dirent *dent;
+	const char *name;
+	DIR *dirp;
+
+	dirp = opendir(EVENT_SOURCE_DEVICES_PATH);
+	if (!dirp)
+		return -errno;
+
+	while ((dent = readdir(dirp))) {
+		name = dent->d_name;
+		if (name[0] == '.')
+			continue;
+
+		/* sysfs entry should either contain cpus or be a cpu */
+		if (!strcmp(name, "cpu"))
+			break;
+
+		snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/cpus", name);
+		if (access(path, F_OK) == 0)
+			break;
+	}
+
+	closedir(dirp);
+
+	if (dent) {
+		rte_pmu->name = strdup(name);
+		if (!rte_pmu->name)
+			return -ENOMEM;
+	}
+
+	return rte_pmu->name ? 0 : -ENODEV;
+}
+
+int
+rte_pmu_add_event(const char *name)
+{
+	struct rte_pmu_event *event;
+	char path[PATH_MAX];
+
+	snprintf(path, sizeof(path), EVENT_SOURCE_DEVICES_PATH "/%s/events/%s", rte_pmu->name, name);
+	if (access(path, R_OK))
+		return -ENODEV;
+
+	TAILQ_FOREACH(event, &rte_pmu->event_list, next) {
+		if (!strcmp(event->name, name))
+			return event->index;
+		continue;
+	}
+
+	event = calloc(1, sizeof(*event));
+	if (!event)
+		return -ENOMEM;
+
+	event->name = strdup(name);
+	if (!event->name) {
+		free(event);
+
+		return -ENOMEM;
+	}
+
+	event->index = rte_pmu->num_group_events++;
+	TAILQ_INSERT_TAIL(&rte_pmu->event_list, event, next);
+
+	RTE_LOG(DEBUG, EAL, "%s event added at index %d\n", name, event->index);
+
+	return event->index;
+}
+
+void
+eal_pmu_init(void)
+{
+	int ret;
+
+	rte_pmu = calloc(1, sizeof(*rte_pmu));
+	if (!rte_pmu) {
+		RTE_LOG(ERR, EAL, "failed to alloc PMU\n");
+
+		return;
+	}
+
+	TAILQ_INIT(&rte_pmu->event_list);
+
+	ret = scan_pmus();
+	if (ret) {
+		RTE_LOG(ERR, EAL, "failed to find core pmu\n");
+		goto out;
+	}
+
+	ret = pmu_arch_init();
+	if (ret) {
+		RTE_LOG(ERR, EAL, "failed to setup arch for PMU\n");
+		goto out;
+	}
+
+	return;
+out:
+	free(rte_pmu->name);
+	free(rte_pmu);
+}
+
+void
+eal_pmu_fini(void)
+{
+	struct rte_pmu_event *event, *tmp;
+	int lcore_id;
+
+	RTE_TAILQ_FOREACH_SAFE(event, &rte_pmu->event_list, next, tmp) {
+		TAILQ_REMOVE(&rte_pmu->event_list, event, next);
+		free(event->name);
+		free(event);
+	}
+
+	RTE_LCORE_FOREACH_WORKER(lcore_id)
+		cleanup_events(lcore_id);
+
+	pmu_arch_fini();
+	free(rte_pmu->name);
+	free(rte_pmu);
+}
diff --git a/lib/eal/include/meson.build b/lib/eal/include/meson.build
index cfcd40aaed..3bf830adee 100644
--- a/lib/eal/include/meson.build
+++ b/lib/eal/include/meson.build
@@ -36,6 +36,7 @@  headers += files(
         'rte_pci_dev_features.h',
         'rte_per_lcore.h',
         'rte_pflock.h',
+        'rte_pmu.h',
         'rte_random.h',
         'rte_reciprocal.h',
         'rte_seqcount.h',
diff --git a/lib/eal/include/rte_pmu.h b/lib/eal/include/rte_pmu.h
new file mode 100644
index 0000000000..e4b4f6b052
--- /dev/null
+++ b/lib/eal/include/rte_pmu.h
@@ -0,0 +1,204 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Marvell
+ */
+
+#ifndef _RTE_PMU_H_
+#define _RTE_PMU_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_common.h>
+#include <rte_compat.h>
+
+#ifdef RTE_EXEC_ENV_LINUX
+
+#include <linux/perf_event.h>
+
+#include <rte_atomic.h>
+#include <rte_branch_prediction.h>
+#include <rte_lcore.h>
+#include <rte_log.h>
+
+/**
+ * @file
+ *
+ * PMU event tracing operations
+ *
+ * This file defines generic API and types necessary to setup PMU and
+ * read selected counters in runtime.
+ */
+
+/**
+ * A structure describing a group of events.
+ */
+struct rte_pmu_event_group {
+	int *fds; /**< array of event descriptors */
+	void **mmap_pages; /**< array of pointers to mmapped perf_event_attr structures */
+	bool enabled; /**< true if group was enabled on particular lcore */
+};
+
+/**
+ * A structure describing an event.
+ */
+struct rte_pmu_event {
+	char *name; /** name of an event */
+	int index; /** event index into fds/mmap_pages */
+	TAILQ_ENTRY(rte_pmu_event) next; /** list entry */
+};
+
+/**
+ * A PMU state container.
+ */
+struct rte_pmu {
+	char *name; /** name of core PMU listed under /sys/bus/event_source/devices */
+	struct rte_pmu_event_group group[RTE_MAX_LCORE]; /**< per lcore event group data */
+	int num_group_events; /**< number of events in a group */
+	TAILQ_HEAD(, rte_pmu_event) event_list; /**< list of matching events */
+};
+
+/** Pointer to the PMU state container */
+extern struct rte_pmu *rte_pmu;
+
+/** Each architecture supporting PMU needs to provide its own version */
+#ifndef rte_pmu_pmc_read
+#define rte_pmu_pmc_read(index) ({ 0; })
+#endif
+
+/**
+ * @internal
+ *
+ * Read PMU counter.
+ *
+ * @param pc
+ *   Pointer to the mmapped user page.
+ * @return
+ *   Counter value read from hardware.
+ */
+__rte_internal
+static __rte_always_inline uint64_t
+rte_pmu_read_userpage(struct perf_event_mmap_page *pc)
+{
+	uint64_t offset, width, pmc = 0;
+	uint32_t seq, index;
+	int tries = 100;
+
+	for (;;) {
+		seq = pc->lock;
+		rte_compiler_barrier();
+		index = pc->index;
+		offset = pc->offset;
+		width = pc->pmc_width;
+
+		if (likely(pc->cap_user_rdpmc && index)) {
+			pmc = rte_pmu_pmc_read(index - 1);
+			pmc <<= 64 - width;
+			pmc >>= 64 - width;
+		}
+
+		rte_compiler_barrier();
+
+		if (likely(pc->lock == seq))
+			return pmc + offset;
+
+		if (--tries == 0) {
+			RTE_LOG(DEBUG, EAL, "failed to get perf_event_mmap_page lock\n");
+			break;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * @internal
+ *
+ * Enable group of events for a given lcore.
+ *
+ * @param lcore_id
+ *   The identifier of the lcore.
+ * @return
+ *   0 in case of success, negative value otherwise.
+ */
+__rte_internal
+int
+rte_pmu_enable_group(int lcore_id);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Add event to the group of enabled events.
+ *
+ * @param name
+ *   Name of an event listed under /sys/bus/event_source/devices/pmu/events.
+ * @return
+ *   Event index in case of success, negative value otherwise.
+ */
+__rte_experimental
+int
+rte_pmu_add_event(const char *name);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Read hardware counter configured to count occurrences of an event.
+ *
+ * @param index
+ *   Index of an event to be read.
+ * @return
+ *   Event value read from register. In case of errors or lack of support
+ *   0 is returned. In other words, stream of zeros in a trace file
+ *   indicates problem with reading particular PMU event register.
+ */
+__rte_experimental
+static __rte_always_inline uint64_t
+rte_pmu_read(int index)
+{
+	int lcore_id = rte_lcore_id();
+	struct rte_pmu_event_group *group;
+	int ret;
+
+	if (!rte_pmu)
+		return 0;
+
+	group = &rte_pmu->group[lcore_id];
+	if (!group->enabled) {
+		ret = rte_pmu_enable_group(lcore_id);
+		if (ret)
+			return 0;
+
+		group->enabled = true;
+	}
+
+	if (index < 0 || index >= rte_pmu->num_group_events)
+		return 0;
+
+	return rte_pmu_read_userpage((struct perf_event_mmap_page *)group->mmap_pages[index]);
+}
+
+#else /* !RTE_EXEC_ENV_LINUX */
+
+__rte_experimental
+static int __rte_unused
+rte_pmu_add_event(__rte_unused const char *name)
+{
+	return -1;
+}
+
+__rte_experimental
+static __rte_always_inline uint64_t
+rte_pmu_read(__rte_unused int index)
+{
+	return 0;
+}
+
+#endif /* RTE_EXEC_ENV_LINUX */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_PMU_H_ */
diff --git a/lib/eal/linux/eal.c b/lib/eal/linux/eal.c
index 8c118d0d9f..751a13b597 100644
--- a/lib/eal/linux/eal.c
+++ b/lib/eal/linux/eal.c
@@ -53,6 +53,7 @@ 
 #include "eal_options.h"
 #include "eal_vfio.h"
 #include "hotplug_mp.h"
+#include "pmu_private.h"
 
 #define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL)
 
@@ -1206,6 +1207,8 @@  rte_eal_init(int argc, char **argv)
 		return -1;
 	}
 
+	eal_pmu_init();
+
 	if (rte_eal_tailqs_init() < 0) {
 		rte_eal_init_alert("Cannot init tail queues for objects");
 		rte_errno = EFAULT;
@@ -1372,6 +1375,7 @@  rte_eal_cleanup(void)
 	eal_bus_cleanup();
 	rte_trace_save();
 	eal_trace_fini();
+	eal_pmu_fini();
 	/* after this point, any DPDK pointers will become dangling */
 	rte_eal_memory_detach();
 	eal_mp_dev_hotplug_cleanup();
diff --git a/lib/eal/version.map b/lib/eal/version.map
index 7ad12a7dc9..9225f46f67 100644
--- a/lib/eal/version.map
+++ b/lib/eal/version.map
@@ -440,6 +440,11 @@  EXPERIMENTAL {
 	rte_thread_detach;
 	rte_thread_equal;
 	rte_thread_join;
+
+	# added in 23.03
+	rte_pmu; # WINDOWS_NO_EXPORT
+	rte_pmu_add_event; # WINDOWS_NO_EXPORT
+	rte_pmu_read; # WINDOWS_NO_EXPORT
 };
 
 INTERNAL {
@@ -483,4 +488,5 @@  INTERNAL {
 	rte_mem_map;
 	rte_mem_page_size;
 	rte_mem_unmap;
+	rte_pmu_enable_group;
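
A minimal usage sketch of the API added by this patch (the event name is
illustrative and depends on what the core PMU exposes under
/sys/bus/event_source/devices; error handling elided):

#include <rte_pmu.h>

/* slow path, main lcore: register an event before workers start polling */
int idx = rte_pmu_add_event("cpu-cycles");	/* hypothetical event name */

/* fast path, any lcore: read the counter around a measured section */
uint64_t t0 = rte_pmu_read(idx);
/* code under measurement */
uint64_t delta = rte_pmu_read(idx) - t0;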
 };