[v4,04/10] power: add simple power management API and callback

Message ID 1601647919-25312-4-git-send-email-liang.j.ma@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series [v4,01/10] eal: add new x86 cpuid support for WAITPKG |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Liang, Ma Oct. 2, 2020, 2:11 p.m. UTC
  Add a simple on/off switch that will enable saving power when no
packets are arriving. It is based on counting the number of empty
polls and, when the number reaches a certain threshold, entering an
architecture-defined optimized power state that will either wait
until a TSC timestamp expires, or when packets arrive.

This API supports the use case of one port polled by multiple cores.

This design leverages the RX Callback mechanism, which allows three
different power management methodologies to coexist.

1. umwait/umonitor:

   The TSC timestamp is automatically calculated using current
   link speed and RX descriptor ring size, such that the sleep
   time is not longer than it would take for a NIC to fill its
   entire RX descriptor ring.

2. Pause instruction

   Instead of moving the core into a deeper C state, this lightweight
   method uses the Pause instruction to relieve the processor from
   busy polling.

3. Frequency Scaling
   Reuse the existing rte_power library to scale the core frequency
   up/down depending on traffic volume.

Signed-off-by: Liang Ma <liang.j.ma@intel.com>
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 lib/librte_power/meson.build           |   5 +-
 lib/librte_power/pmd_mgmt.h            |  49 ++++++
 lib/librte_power/rte_power_pmd_mgmt.c  | 208 +++++++++++++++++++++++++
 lib/librte_power/rte_power_pmd_mgmt.h  |  88 +++++++++++
 lib/librte_power/rte_power_version.map |   4 +
 5 files changed, 352 insertions(+), 2 deletions(-)
 create mode 100644 lib/librte_power/pmd_mgmt.h
 create mode 100644 lib/librte_power/rte_power_pmd_mgmt.c
 create mode 100644 lib/librte_power/rte_power_pmd_mgmt.h
  

Comments

Ananyev, Konstantin Oct. 9, 2020, 4:38 p.m. UTC | #1
> Add a simple on/off switch that will enable saving power when no
> packets are arriving. It is based on counting the number of empty
> polls and, when the number reaches a certain threshold, entering an
> architecture-defined optimized power state that will either wait
> until a TSC timestamp expires, or when packets arrive.
> 
> This API support 1 port to multiple core use case.
> 
> This design leverage RX Callback mechnaism which allow three
> different power management methodology co exist.
> 
> 1. umwait/umonitor:
> 
>    The TSC timestamp is automatically calculated using current
>    link speed and RX descriptor ring size, such that the sleep
>    time is not longer than it would take for a NIC to fill its
>    entire RX descriptor ring.
> 
> 2. Pause instruction
> 
>    Instead of move the core into deeper C state, this lightweight
>    method use Pause instruction to relief the processor from
>    busy polling.
> 
> 3. Frequency Scaling
>    Reuse exist rte power library to scale up/down core frequency
>    depend on traffic volume.
> 
> Signed-off-by: Liang Ma <liang.j.ma@intel.com>
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> ---
>  lib/librte_power/meson.build           |   5 +-
>  lib/librte_power/pmd_mgmt.h            |  49 ++++++
>  lib/librte_power/rte_power_pmd_mgmt.c  | 208 +++++++++++++++++++++++++
>  lib/librte_power/rte_power_pmd_mgmt.h  |  88 +++++++++++
>  lib/librte_power/rte_power_version.map |   4 +
>  5 files changed, 352 insertions(+), 2 deletions(-)
>  create mode 100644 lib/librte_power/pmd_mgmt.h
>  create mode 100644 lib/librte_power/rte_power_pmd_mgmt.c
>  create mode 100644 lib/librte_power/rte_power_pmd_mgmt.h
> 
> diff --git a/lib/librte_power/meson.build b/lib/librte_power/meson.build
> index 78c031c943..cc3c7a8646 100644
> --- a/lib/librte_power/meson.build
> +++ b/lib/librte_power/meson.build
> @@ -9,6 +9,7 @@ sources = files('rte_power.c', 'power_acpi_cpufreq.c',
>  		'power_kvm_vm.c', 'guest_channel.c',
>  		'rte_power_empty_poll.c',
>  		'power_pstate_cpufreq.c',
> +		'rte_power_pmd_mgmt.c',
>  		'power_common.c')
> -headers = files('rte_power.h','rte_power_empty_poll.h')
> -deps += ['timer']
> +headers = files('rte_power.h','rte_power_empty_poll.h','rte_power_pmd_mgmt.h')
> +deps += ['timer' ,'ethdev']
> diff --git a/lib/librte_power/pmd_mgmt.h b/lib/librte_power/pmd_mgmt.h
> new file mode 100644
> index 0000000000..756fbe20f7
> --- /dev/null
> +++ b/lib/librte_power/pmd_mgmt.h
> @@ -0,0 +1,49 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2020 Intel Corporation
> + */
> +
> +#ifndef _PMD_MGMT_H
> +#define _PMD_MGMT_H
> +
> +/**
> + * @file
> + * Power Management
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +/**
> + * Possible power management states of an ethdev port.
> + */
> +enum pmd_mgmt_state {
> +	/** Device power management is disabled. */
> +	PMD_MGMT_DISABLED = 0,
> +	/** Device power management is enabled. */
> +	PMD_MGMT_ENABLED,
> +};
> +
> +struct pmd_queue_cfg {
> +	enum pmd_mgmt_state pwr_mgmt_state;
> +	/**< Power mgmt Callback mode */
> +	enum rte_power_pmd_mgmt_type cb_mode;
> +	/**< Empty poll number */
> +	uint16_t empty_poll_stats;
> +	/**< Callback instance  */
> +	const struct rte_eth_rxtx_callback *cur_cb;
> +} __rte_cache_aligned;
> +
> +struct pmd_port_cfg {
> +	int  ref_cnt;
> +	struct pmd_queue_cfg *queue_cfg;
> +} __rte_cache_aligned;
> +
> +
> +
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif
> diff --git a/lib/librte_power/rte_power_pmd_mgmt.c b/lib/librte_power/rte_power_pmd_mgmt.c
> new file mode 100644
> index 0000000000..35d2af46a4
> --- /dev/null
> +++ b/lib/librte_power/rte_power_pmd_mgmt.c
> @@ -0,0 +1,208 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2020 Intel Corporation
> + */
> +
> +#include <rte_lcore.h>
> +#include <rte_cycles.h>
> +#include <rte_malloc.h>
> +#include <rte_ethdev.h>
> +#include <rte_power_intrinsics.h>
> +
> +#include "rte_power_pmd_mgmt.h"
> +#include "pmd_mgmt.h"
> +
> +
> +#define EMPTYPOLL_MAX  512
> +#define PAUSE_NUM  64
> +
> +static struct pmd_port_cfg port_cfg[RTE_MAX_ETHPORTS];
> +
> +static uint16_t
> +rte_power_mgmt_umwait(uint16_t port_id, uint16_t qidx,
> +		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> +		uint16_t max_pkts __rte_unused, void *_  __rte_unused)
> +{
> +
> +	struct pmd_queue_cfg *q_conf;
> +	q_conf = &port_cfg[port_id].queue_cfg[qidx];
> +
> +	if (unlikely(nb_rx == 0)) {
> +		q_conf->empty_poll_stats++;
> +		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {


Here and in other places - wouldn't it be better to make empty_poll_max a configurable
parameter, instead of a constant value? 

> +			volatile void *target_addr;
> +			uint64_t expected, mask;
> +			uint16_t ret;
> +
> +			/*
> +			 * get address of next descriptor in the RX
> +			 * ring for this queue, as well as expected
> +			 * value and a mask.
> +			 */
> +			ret = rte_eth_get_wake_addr(port_id, qidx,
> +						    &target_addr, &expected,
> +						    &mask);
> +			if (ret == 0)
> +				/* -1ULL is maximum value for TSC */
> +				rte_power_monitor(target_addr,
> +						  expected, mask,
> +						  0, -1ULL);


Why not make timeout a user specified parameter?

> +		}
> +	} else
> +		q_conf->empty_poll_stats = 0;
> +
> +	return nb_rx;
> +}
> +
> +static uint16_t
> +rte_power_mgmt_pause(uint16_t port_id, uint16_t qidx,
> +		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> +		uint16_t max_pkts __rte_unused, void *_  __rte_unused)
> +{
> +	struct pmd_queue_cfg *q_conf;
> +	int i;
> +	q_conf = &port_cfg[port_id].queue_cfg[qidx];
> +
> +	if (unlikely(nb_rx == 0)) {
> +		q_conf->empty_poll_stats++;
> +		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
> +			for (i = 0; i < PAUSE_NUM; i++)
> +				rte_pause();

Just rte_delay_us(timeout) instead of this loop?

> +		}
> +	} else
> +		q_conf->empty_poll_stats = 0;
> +
> +	return nb_rx;
> +}
> +
> +static uint16_t
> +rte_power_mgmt_scalefreq(uint16_t port_id, uint16_t qidx,
> +		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> +		uint16_t max_pkts __rte_unused, void *_  __rte_unused)
> +{
> +	struct pmd_queue_cfg *q_conf;
> +	q_conf = &port_cfg[port_id].queue_cfg[qidx];
> +
> +	if (unlikely(nb_rx == 0)) {
> +		q_conf->empty_poll_stats++;
> +		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
> +			/*scale down freq */
> +			rte_power_freq_min(rte_lcore_id());
> +
> +		}
> +	} else {
> +		q_conf->empty_poll_stats = 0;
> +		/* scal up freq */
> +		rte_power_freq_max(rte_lcore_id());
> +	}
> +
> +	return nb_rx;
> +}
> +

Probably worth mentioning in the comments that these enable/disable
functions are not MT safe.

> +int
> +rte_power_pmd_mgmt_queue_enable(unsigned int lcore_id,
> +				uint16_t port_id,
> +				uint16_t queue_id,
> +				enum rte_power_pmd_mgmt_type mode)
> +{
> +	struct rte_eth_dev *dev;
> +	struct pmd_queue_cfg *queue_cfg;
> +	int ret = 0;
> +
> +	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> +	dev = &rte_eth_devices[port_id];
> +
> +	if (port_cfg[port_id].queue_cfg == NULL) {
> +		port_cfg[port_id].ref_cnt = 0;
> +		/* allocate memory for empty poll stats */
> +		port_cfg[port_id].queue_cfg  = rte_malloc_socket(NULL,
> +					sizeof(struct pmd_queue_cfg)
> +					* RTE_MAX_QUEUES_PER_PORT,
> +					0, dev->data->numa_node);
> +		if (port_cfg[port_id].queue_cfg == NULL)
> +			return -ENOMEM;
> +	}
> +
> +	queue_cfg = &port_cfg[port_id].queue_cfg[queue_id];
> +
> +	if (queue_cfg->pwr_mgmt_state == PMD_MGMT_ENABLED) {
> +		ret = -EINVAL;
> +		goto failure_handler;
> +	}
> +
> +	switch (mode) {
> +	case RTE_POWER_MGMT_TYPE_WAIT:
> +		if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_WAITPKG)) {
> +			ret = -ENOTSUP;
> +			goto failure_handler;
> +		}
> +		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> +						rte_power_mgmt_umwait, NULL);
> +		break;
> +	case RTE_POWER_MGMT_TYPE_SCALE:
> +		/* init scale freq */
> +		if (rte_power_init(lcore_id)) {
> +			ret = -EINVAL;
> +			goto failure_handler;
> +		}
> +		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> +					rte_power_mgmt_scalefreq, NULL);
> +		break;
> +	case RTE_POWER_MGMT_TYPE_PAUSE:
> +		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> +						rte_power_mgmt_pause, NULL);
> +		break;

	default:
		....

> +	}
> +	queue_cfg->cb_mode = mode;
> +	port_cfg[port_id].ref_cnt++;
> +	queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
> +	return ret;
> +
> +failure_handler:
> +	if (port_cfg[port_id].ref_cnt == 0) {
> +		rte_free(port_cfg[port_id].queue_cfg);
> +		port_cfg[port_id].queue_cfg = NULL;
> +	}
> +	return ret;
> +}
> +
> +int
> +rte_power_pmd_mgmt_queue_disable(unsigned int lcore_id,
> +				uint16_t port_id,
> +				uint16_t queue_id)
> +{
> +	struct pmd_queue_cfg *queue_cfg;
> +
> +	if (port_cfg[port_id].ref_cnt <= 0)
> +		return -EINVAL;
> +
> +	queue_cfg = &port_cfg[port_id].queue_cfg[queue_id];
> +
> +	if (queue_cfg->pwr_mgmt_state == PMD_MGMT_DISABLED)
> +		return -EINVAL;
> +
> +	switch (queue_cfg->cb_mode) {
> +	case RTE_POWER_MGMT_TYPE_WAIT:

Think we need wakeup(lcore_id) here.

> +	case RTE_POWER_MGMT_TYPE_PAUSE:
> +		rte_eth_remove_rx_callback(port_id, queue_id,
> +					   queue_cfg->cur_cb);
> +		break;
> +	case RTE_POWER_MGMT_TYPE_SCALE:
> +		rte_power_freq_max(lcore_id);
> +		rte_eth_remove_rx_callback(port_id, queue_id,
> +					   queue_cfg->cur_cb);
> +		rte_power_exit(lcore_id);
> +		break;
> +	}
> +	/* it's not recommend to free callback instance here.
> +	 * it cause memory leak which is a known issue.
> +	 */
> +	queue_cfg->cur_cb = NULL;
> +	queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
> +	port_cfg[port_id].ref_cnt--;
> +
> +	if (port_cfg[port_id].ref_cnt == 0) {
> +		rte_free(port_cfg[port_id].queue_cfg);

It is not safe to do so, unless device is already stopped.
Otherwise you need some sync mechanism here (hand-made as bpf lib, or rcu online/offline, or...)

> +		port_cfg[port_id].queue_cfg = NULL;
> +	}
> +	return 0;
> +}
  
Burakov, Anatoly Oct. 9, 2020, 4:47 p.m. UTC | #2
On 09-Oct-20 5:38 PM, Ananyev, Konstantin wrote:
>> Add a simple on/off switch that will enable saving power when no
>> packets are arriving. It is based on counting the number of empty
>> polls and, when the number reaches a certain threshold, entering an
>> architecture-defined optimized power state that will either wait
>> until a TSC timestamp expires, or when packets arrive.
>>
>> This API support 1 port to multiple core use case.
>>
>> This design leverage RX Callback mechnaism which allow three
>> different power management methodology co exist.
>>
>> 1. umwait/umonitor:
>>
>>     The TSC timestamp is automatically calculated using current
>>     link speed and RX descriptor ring size, such that the sleep
>>     time is not longer than it would take for a NIC to fill its
>>     entire RX descriptor ring.
>>
>> 2. Pause instruction
>>
>>     Instead of move the core into deeper C state, this lightweight
>>     method use Pause instruction to relief the processor from
>>     busy polling.
>>
>> 3. Frequency Scaling
>>     Reuse exist rte power library to scale up/down core frequency
>>     depend on traffic volume.
>>
>> Signed-off-by: Liang Ma <liang.j.ma@intel.com>
>> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
>> ---
>>   lib/librte_power/meson.build           |   5 +-
>>   lib/librte_power/pmd_mgmt.h            |  49 ++++++
>>   lib/librte_power/rte_power_pmd_mgmt.c  | 208 +++++++++++++++++++++++++
>>   lib/librte_power/rte_power_pmd_mgmt.h  |  88 +++++++++++
>>   lib/librte_power/rte_power_version.map |   4 +
>>   5 files changed, 352 insertions(+), 2 deletions(-)
>>   create mode 100644 lib/librte_power/pmd_mgmt.h
>>   create mode 100644 lib/librte_power/rte_power_pmd_mgmt.c
>>   create mode 100644 lib/librte_power/rte_power_pmd_mgmt.h
>>
>> diff --git a/lib/librte_power/meson.build b/lib/librte_power/meson.build
>> index 78c031c943..cc3c7a8646 100644
>> --- a/lib/librte_power/meson.build
>> +++ b/lib/librte_power/meson.build
>> @@ -9,6 +9,7 @@ sources = files('rte_power.c', 'power_acpi_cpufreq.c',
>>   'power_kvm_vm.c', 'guest_channel.c',
>>   'rte_power_empty_poll.c',
>>   'power_pstate_cpufreq.c',
>> +'rte_power_pmd_mgmt.c',
>>   'power_common.c')
>> -headers = files('rte_power.h','rte_power_empty_poll.h')
>> -deps += ['timer']
>> +headers = files('rte_power.h','rte_power_empty_poll.h','rte_power_pmd_mgmt.h')
>> +deps += ['timer' ,'ethdev']
>> diff --git a/lib/librte_power/pmd_mgmt.h b/lib/librte_power/pmd_mgmt.h
>> new file mode 100644
>> index 0000000000..756fbe20f7
>> --- /dev/null
>> +++ b/lib/librte_power/pmd_mgmt.h
>> @@ -0,0 +1,49 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2020 Intel Corporation
>> + */
>> +
>> +#ifndef _PMD_MGMT_H
>> +#define _PMD_MGMT_H
>> +
>> +/**
>> + * @file
>> + * Power Management
>> + */
>> +
>> +#ifdef __cplusplus
>> +extern "C" {
>> +#endif
>> +
>> +/**
>> + * Possible power management states of an ethdev port.
>> + */
>> +enum pmd_mgmt_state {
>> +/** Device power management is disabled. */
>> +PMD_MGMT_DISABLED = 0,
>> +/** Device power management is enabled. */
>> +PMD_MGMT_ENABLED,
>> +};
>> +
>> +struct pmd_queue_cfg {
>> +enum pmd_mgmt_state pwr_mgmt_state;
>> +/**< Power mgmt Callback mode */
>> +enum rte_power_pmd_mgmt_type cb_mode;
>> +/**< Empty poll number */
>> +uint16_t empty_poll_stats;
>> +/**< Callback instance  */
>> +const struct rte_eth_rxtx_callback *cur_cb;
>> +} __rte_cache_aligned;
>> +
>> +struct pmd_port_cfg {
>> +int  ref_cnt;
>> +struct pmd_queue_cfg *queue_cfg;
>> +} __rte_cache_aligned;
>> +
>> +
>> +
>> +
>> +#ifdef __cplusplus
>> +}
>> +#endif
>> +
>> +#endif
>> diff --git a/lib/librte_power/rte_power_pmd_mgmt.c b/lib/librte_power/rte_power_pmd_mgmt.c
>> new file mode 100644
>> index 0000000000..35d2af46a4
>> --- /dev/null
>> +++ b/lib/librte_power/rte_power_pmd_mgmt.c
>> @@ -0,0 +1,208 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2010-2020 Intel Corporation
>> + */
>> +
>> +#include <rte_lcore.h>
>> +#include <rte_cycles.h>
>> +#include <rte_malloc.h>
>> +#include <rte_ethdev.h>
>> +#include <rte_power_intrinsics.h>
>> +
>> +#include "rte_power_pmd_mgmt.h"
>> +#include "pmd_mgmt.h"
>> +
>> +
>> +#define EMPTYPOLL_MAX  512
>> +#define PAUSE_NUM  64
>> +
>> +static struct pmd_port_cfg port_cfg[RTE_MAX_ETHPORTS];
>> +
>> +static uint16_t
>> +rte_power_mgmt_umwait(uint16_t port_id, uint16_t qidx,
>> +struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
>> +uint16_t max_pkts __rte_unused, void *_  __rte_unused)
>> +{
>> +
>> +struct pmd_queue_cfg *q_conf;
>> +q_conf = &port_cfg[port_id].queue_cfg[qidx];
>> +
>> +if (unlikely(nb_rx == 0)) {
>> +q_conf->empty_poll_stats++;
>> +if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
> 
> 
> Here and in other places - wouldn't it be better to empty_poll_max as configurable
> parameter, instead of constant value?

It would be more flexible, but i don't think it's "better" in the sense 
that providing additional options will only make using this (already 
under-utilized!) API harder than it needs to be.

> 
>> +volatile void *target_addr;
>> +uint64_t expected, mask;
>> +uint16_t ret;
>> +
>> +/*
>> + * get address of next descriptor in the RX
>> + * ring for this queue, as well as expected
>> + * value and a mask.
>> + */
>> +ret = rte_eth_get_wake_addr(port_id, qidx,
>> +    &target_addr, &expected,
>> +    &mask);
>> +if (ret == 0)
>> +/* -1ULL is maximum value for TSC */
>> +rte_power_monitor(target_addr,
>> +  expected, mask,
>> +  0, -1ULL);
> 
> 
> Why not make timeout a user specified parameter?

This is meant to be an "easy to use" API, we were trying to keep the 
amount of configuration to an absolute minimum. If the user wants to use 
custom timeouts, they can do so with using rte_power_monitor API explicitly.

> 
>> +}
>> +} else
>> +q_conf->empty_poll_stats = 0;
>> +
>> +return nb_rx;
>> +}
>> +
>> +static uint16_t
>> +rte_power_mgmt_pause(uint16_t port_id, uint16_t qidx,
>> +struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
>> +uint16_t max_pkts __rte_unused, void *_  __rte_unused)
>> +{
>> +struct pmd_queue_cfg *q_conf;
>> +int i;
>> +q_conf = &port_cfg[port_id].queue_cfg[qidx];
>> +
>> +if (unlikely(nb_rx == 0)) {
>> +q_conf->empty_poll_stats++;
>> +if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
>> +for (i = 0; i < PAUSE_NUM; i++)
>> +rte_pause();
> 
> Just rte_delay_us(timeout) instead of this loop?

Yes, seems better, thanks.

> 
>> +}
>> +} else
>> +q_conf->empty_poll_stats = 0;
>> +
>> +return nb_rx;
>> +}
>> +
>> +static uint16_t
>> +rte_power_mgmt_scalefreq(uint16_t port_id, uint16_t qidx,
>> +struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
>> +uint16_t max_pkts __rte_unused, void *_  __rte_unused)
>> +{
>> +struct pmd_queue_cfg *q_conf;
>> +q_conf = &port_cfg[port_id].queue_cfg[qidx];
>> +
>> +if (unlikely(nb_rx == 0)) {
>> +q_conf->empty_poll_stats++;
>> +if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
>> +/*scale down freq */
>> +rte_power_freq_min(rte_lcore_id());
>> +
>> +}
>> +} else {
>> +q_conf->empty_poll_stats = 0;
>> +/* scal up freq */
>> +rte_power_freq_max(rte_lcore_id());
>> +}
>> +
>> +return nb_rx;
>> +}
>> +
> 
> Probably worth to mention in comments that these functions enable/disable
> are not MT safe.

Will do in v6.

> 
>> +int
>> +rte_power_pmd_mgmt_queue_enable(unsigned int lcore_id,
>> +uint16_t port_id,
>> +uint16_t queue_id,
>> +enum rte_power_pmd_mgmt_type mode)
>> +{
>> +struct rte_eth_dev *dev;
>> +struct pmd_queue_cfg *queue_cfg;
>> +int ret = 0;
>> +
>> +RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
>> +dev = &rte_eth_devices[port_id];
>> +
>> +if (port_cfg[port_id].queue_cfg == NULL) {
>> +port_cfg[port_id].ref_cnt = 0;
>> +/* allocate memory for empty poll stats */
>> +port_cfg[port_id].queue_cfg  = rte_malloc_socket(NULL,
>> +sizeof(struct pmd_queue_cfg)
>> +* RTE_MAX_QUEUES_PER_PORT,
>> +0, dev->data->numa_node);
>> +if (port_cfg[port_id].queue_cfg == NULL)
>> +return -ENOMEM;
>> +}
>> +
>> +queue_cfg = &port_cfg[port_id].queue_cfg[queue_id];
>> +
>> +if (queue_cfg->pwr_mgmt_state == PMD_MGMT_ENABLED) {
>> +ret = -EINVAL;
>> +goto failure_handler;
>> +}
>> +
>> +switch (mode) {
>> +case RTE_POWER_MGMT_TYPE_WAIT:
>> +if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_WAITPKG)) {
>> +ret = -ENOTSUP;
>> +goto failure_handler;
>> +}
>> +queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
>> +rte_power_mgmt_umwait, NULL);
>> +break;
>> +case RTE_POWER_MGMT_TYPE_SCALE:
>> +/* init scale freq */
>> +if (rte_power_init(lcore_id)) {
>> +ret = -EINVAL;
>> +goto failure_handler;
>> +}
>> +queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
>> +rte_power_mgmt_scalefreq, NULL);
>> +break;
>> +case RTE_POWER_MGMT_TYPE_PAUSE:
>> +queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
>> +rte_power_mgmt_pause, NULL);
>> +break;
> 
> default:
> ....

Will add in v6.

> 
>> +}
>> +queue_cfg->cb_mode = mode;
>> +port_cfg[port_id].ref_cnt++;
>> +queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
>> +return ret;
>> +
>> +failure_handler:
>> +if (port_cfg[port_id].ref_cnt == 0) {
>> +rte_free(port_cfg[port_id].queue_cfg);
>> +port_cfg[port_id].queue_cfg = NULL;
>> +}
>> +return ret;
>> +}
>> +
>> +int
>> +rte_power_pmd_mgmt_queue_disable(unsigned int lcore_id,
>> +uint16_t port_id,
>> +uint16_t queue_id)
>> +{
>> +struct pmd_queue_cfg *queue_cfg;
>> +
>> +if (port_cfg[port_id].ref_cnt <= 0)
>> +return -EINVAL;
>> +
>> +queue_cfg = &port_cfg[port_id].queue_cfg[queue_id];
>> +
>> +if (queue_cfg->pwr_mgmt_state == PMD_MGMT_DISABLED)
>> +return -EINVAL;
>> +
>> +switch (queue_cfg->cb_mode) {
>> +case RTE_POWER_MGMT_TYPE_WAIT:
> 
> Think we need wakeup(lcore_id) here.

Not sure what you mean? Could you please elaborate?

> 
>> +case RTE_POWER_MGMT_TYPE_PAUSE:
>> +rte_eth_remove_rx_callback(port_id, queue_id,
>> +   queue_cfg->cur_cb);
>> +break;
>> +case RTE_POWER_MGMT_TYPE_SCALE:
>> +rte_power_freq_max(lcore_id);
>> +rte_eth_remove_rx_callback(port_id, queue_id,
>> +   queue_cfg->cur_cb);
>> +rte_power_exit(lcore_id);
>> +break;
>> +}
>> +/* it's not recommend to free callback instance here.
>> + * it cause memory leak which is a known issue.
>> + */
>> +queue_cfg->cur_cb = NULL;
>> +queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
>> +port_cfg[port_id].ref_cnt--;
>> +
>> +if (port_cfg[port_id].ref_cnt == 0) {
>> +rte_free(port_cfg[port_id].queue_cfg);
> 
> It is not safe to do so, unless device is already stopped.
> Otherwise you need some sync mechanism here (hand-made as bpf lib, or rcu online/offline, or...)

Not sure what you mean. We're not freeing the callback structure, we're 
freeing the local data structure holding the per-port status.

> 
>> +port_cfg[port_id].queue_cfg = NULL;
>> +}
>> +return 0;
>> +}
  
Ananyev, Konstantin Oct. 9, 2020, 4:51 p.m. UTC | #3
> On 09-Oct-20 5:38 PM, Ananyev, Konstantin wrote:
> >> Add a simple on/off switch that will enable saving power when no
> >> packets are arriving. It is based on counting the number of empty
> >> polls and, when the number reaches a certain threshold, entering an
> >> architecture-defined optimized power state that will either wait
> >> until a TSC timestamp expires, or when packets arrive.
> >>
> >> This API support 1 port to multiple core use case.
> >>
> >> This design leverage RX Callback mechnaism which allow three
> >> different power management methodology co exist.
> >>
> >> 1. umwait/umonitor:
> >>
> >>     The TSC timestamp is automatically calculated using current
> >>     link speed and RX descriptor ring size, such that the sleep
> >>     time is not longer than it would take for a NIC to fill its
> >>     entire RX descriptor ring.
> >>
> >> 2. Pause instruction
> >>
> >>     Instead of move the core into deeper C state, this lightweight
> >>     method use Pause instruction to relief the processor from
> >>     busy polling.
> >>
> >> 3. Frequency Scaling
> >>     Reuse exist rte power library to scale up/down core frequency
> >>     depend on traffic volume.
> >>
> >> Signed-off-by: Liang Ma <liang.j.ma@intel.com>
> >> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> >> ---
> >>   lib/librte_power/meson.build           |   5 +-
> >>   lib/librte_power/pmd_mgmt.h            |  49 ++++++
> >>   lib/librte_power/rte_power_pmd_mgmt.c  | 208 +++++++++++++++++++++++++
> >>   lib/librte_power/rte_power_pmd_mgmt.h  |  88 +++++++++++
> >>   lib/librte_power/rte_power_version.map |   4 +
> >>   5 files changed, 352 insertions(+), 2 deletions(-)
> >>   create mode 100644 lib/librte_power/pmd_mgmt.h
> >>   create mode 100644 lib/librte_power/rte_power_pmd_mgmt.c
> >>   create mode 100644 lib/librte_power/rte_power_pmd_mgmt.h
> >>
> >> diff --git a/lib/librte_power/meson.build b/lib/librte_power/meson.build
> >> index 78c031c943..cc3c7a8646 100644
> >> --- a/lib/librte_power/meson.build
> >> +++ b/lib/librte_power/meson.build
> >> @@ -9,6 +9,7 @@ sources = files('rte_power.c', 'power_acpi_cpufreq.c',
> >>   'power_kvm_vm.c', 'guest_channel.c',
> >>   'rte_power_empty_poll.c',
> >>   'power_pstate_cpufreq.c',
> >> +'rte_power_pmd_mgmt.c',
> >>   'power_common.c')
> >> -headers = files('rte_power.h','rte_power_empty_poll.h')
> >> -deps += ['timer']
> >> +headers = files('rte_power.h','rte_power_empty_poll.h','rte_power_pmd_mgmt.h')
> >> +deps += ['timer' ,'ethdev']
> >> diff --git a/lib/librte_power/pmd_mgmt.h b/lib/librte_power/pmd_mgmt.h
> >> new file mode 100644
> >> index 0000000000..756fbe20f7
> >> --- /dev/null
> >> +++ b/lib/librte_power/pmd_mgmt.h
> >> @@ -0,0 +1,49 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2010-2020 Intel Corporation
> >> + */
> >> +
> >> +#ifndef _PMD_MGMT_H
> >> +#define _PMD_MGMT_H
> >> +
> >> +/**
> >> + * @file
> >> + * Power Management
> >> + */
> >> +
> >> +#ifdef __cplusplus
> >> +extern "C" {
> >> +#endif
> >> +
> >> +/**
> >> + * Possible power management states of an ethdev port.
> >> + */
> >> +enum pmd_mgmt_state {
> >> +/** Device power management is disabled. */
> >> +PMD_MGMT_DISABLED = 0,
> >> +/** Device power management is enabled. */
> >> +PMD_MGMT_ENABLED,
> >> +};
> >> +
> >> +struct pmd_queue_cfg {
> >> +enum pmd_mgmt_state pwr_mgmt_state;
> >> +/**< Power mgmt Callback mode */
> >> +enum rte_power_pmd_mgmt_type cb_mode;
> >> +/**< Empty poll number */
> >> +uint16_t empty_poll_stats;
> >> +/**< Callback instance  */
> >> +const struct rte_eth_rxtx_callback *cur_cb;
> >> +} __rte_cache_aligned;
> >> +
> >> +struct pmd_port_cfg {
> >> +int  ref_cnt;
> >> +struct pmd_queue_cfg *queue_cfg;
> >> +} __rte_cache_aligned;
> >> +
> >> +
> >> +
> >> +
> >> +#ifdef __cplusplus
> >> +}
> >> +#endif
> >> +
> >> +#endif
> >> diff --git a/lib/librte_power/rte_power_pmd_mgmt.c b/lib/librte_power/rte_power_pmd_mgmt.c
> >> new file mode 100644
> >> index 0000000000..35d2af46a4
> >> --- /dev/null
> >> +++ b/lib/librte_power/rte_power_pmd_mgmt.c
> >> @@ -0,0 +1,208 @@
> >> +/* SPDX-License-Identifier: BSD-3-Clause
> >> + * Copyright(c) 2010-2020 Intel Corporation
> >> + */
> >> +
> >> +#include <rte_lcore.h>
> >> +#include <rte_cycles.h>
> >> +#include <rte_malloc.h>
> >> +#include <rte_ethdev.h>
> >> +#include <rte_power_intrinsics.h>
> >> +
> >> +#include "rte_power_pmd_mgmt.h"
> >> +#include "pmd_mgmt.h"
> >> +
> >> +
> >> +#define EMPTYPOLL_MAX  512
> >> +#define PAUSE_NUM  64
> >> +
> >> +static struct pmd_port_cfg port_cfg[RTE_MAX_ETHPORTS];
> >> +
> >> +static uint16_t
> >> +rte_power_mgmt_umwait(uint16_t port_id, uint16_t qidx,
> >> +struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> >> +uint16_t max_pkts __rte_unused, void *_  __rte_unused)
> >> +{
> >> +
> >> +struct pmd_queue_cfg *q_conf;
> >> +q_conf = &port_cfg[port_id].queue_cfg[qidx];
> >> +
> >> +if (unlikely(nb_rx == 0)) {
> >> +q_conf->empty_poll_stats++;
> >> +if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
> >
> >
> > Here and in other places - wouldn't it be better to empty_poll_max as configurable
> > parameter, instead of constant value?
> 
> It would be more flexible, but i don't think it's "better" in the sense
> that providing additional options will only make using this (already
> under-utilized!) API harder than it needs to be.
> 
> >
> >> +volatile void *target_addr;
> >> +uint64_t expected, mask;
> >> +uint16_t ret;
> >> +
> >> +/*
> >> + * get address of next descriptor in the RX
> >> + * ring for this queue, as well as expected
> >> + * value and a mask.
> >> + */
> >> +ret = rte_eth_get_wake_addr(port_id, qidx,
> >> +    &target_addr, &expected,
> >> +    &mask);
> >> +if (ret == 0)
> >> +/* -1ULL is maximum value for TSC */
> >> +rte_power_monitor(target_addr,
> >> +  expected, mask,
> >> +  0, -1ULL);
> >
> >
> > Why not make timeout a user specified parameter?
> 
> This is meant to be an "easy to use" API, we were trying to keep the
> amount of configuration to an absolute minimum. If the user wants to use
> custom timeouts, they can do so with using rte_power_monitor API explicitly.
> 
> >
> >> +}
> >> +} else
> >> +q_conf->empty_poll_stats = 0;
> >> +
> >> +return nb_rx;
> >> +}
> >> +
> >> +static uint16_t
> >> +rte_power_mgmt_pause(uint16_t port_id, uint16_t qidx,
> >> +struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> >> +uint16_t max_pkts __rte_unused, void *_  __rte_unused)
> >> +{
> >> +struct pmd_queue_cfg *q_conf;
> >> +int i;
> >> +q_conf = &port_cfg[port_id].queue_cfg[qidx];
> >> +
> >> +if (unlikely(nb_rx == 0)) {
> >> +q_conf->empty_poll_stats++;
> >> +if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
> >> +for (i = 0; i < PAUSE_NUM; i++)
> >> +rte_pause();
> >
> > Just rte_delay_us(timeout) instead of this loop?
> 
> Yes, seems better, thanks.
> 
> >
> >> +}
> >> +} else
> >> +q_conf->empty_poll_stats = 0;
> >> +
> >> +return nb_rx;
> >> +}
> >> +
> >> +static uint16_t
> >> +rte_power_mgmt_scalefreq(uint16_t port_id, uint16_t qidx,
> >> +struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> >> +uint16_t max_pkts __rte_unused, void *_  __rte_unused)
> >> +{
> >> +struct pmd_queue_cfg *q_conf;
> >> +q_conf = &port_cfg[port_id].queue_cfg[qidx];
> >> +
> >> +if (unlikely(nb_rx == 0)) {
> >> +q_conf->empty_poll_stats++;
> >> +if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
> >> +/*scale down freq */
> >> +rte_power_freq_min(rte_lcore_id());
> >> +
> >> +}
> >> +} else {
> >> +q_conf->empty_poll_stats = 0;
> >> +/* scal up freq */
> >> +rte_power_freq_max(rte_lcore_id());
> >> +}
> >> +
> >> +return nb_rx;
> >> +}
> >> +
> >
> > Probably worth to mention in comments that these functions enable/disable
> > are not MT safe.
> 
> Will do in v6.
> 
> >
> >> +int
> >> +rte_power_pmd_mgmt_queue_enable(unsigned int lcore_id,
> >> +uint16_t port_id,
> >> +uint16_t queue_id,
> >> +enum rte_power_pmd_mgmt_type mode)
> >> +{
> >> +struct rte_eth_dev *dev;
> >> +struct pmd_queue_cfg *queue_cfg;
> >> +int ret = 0;
> >> +
> >> +RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
> >> +dev = &rte_eth_devices[port_id];
> >> +
> >> +if (port_cfg[port_id].queue_cfg == NULL) {
> >> +port_cfg[port_id].ref_cnt = 0;
> >> +/* allocate memory for empty poll stats */
> >> +port_cfg[port_id].queue_cfg  = rte_malloc_socket(NULL,
> >> +sizeof(struct pmd_queue_cfg)
> >> +* RTE_MAX_QUEUES_PER_PORT,
> >> +0, dev->data->numa_node);
> >> +if (port_cfg[port_id].queue_cfg == NULL)
> >> +return -ENOMEM;
> >> +}
> >> +
> >> +queue_cfg = &port_cfg[port_id].queue_cfg[queue_id];
> >> +
> >> +if (queue_cfg->pwr_mgmt_state == PMD_MGMT_ENABLED) {
> >> +ret = -EINVAL;
> >> +goto failure_handler;
> >> +}
> >> +
> >> +switch (mode) {
> >> +case RTE_POWER_MGMT_TYPE_WAIT:
> >> +if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_WAITPKG)) {
> >> +ret = -ENOTSUP;
> >> +goto failure_handler;
> >> +}
> >> +queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> >> +rte_power_mgmt_umwait, NULL);
> >> +break;
> >> +case RTE_POWER_MGMT_TYPE_SCALE:
> >> +/* init scale freq */
> >> +if (rte_power_init(lcore_id)) {
> >> +ret = -EINVAL;
> >> +goto failure_handler;
> >> +}
> >> +queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> >> +rte_power_mgmt_scalefreq, NULL);
> >> +break;
> >> +case RTE_POWER_MGMT_TYPE_PAUSE:
> >> +queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
> >> +rte_power_mgmt_pause, NULL);
> >> +break;
> >
> > default:
> > ....
> 
> Will add in v6.
> 
> >
> >> +}
> >> +queue_cfg->cb_mode = mode;
> >> +port_cfg[port_id].ref_cnt++;
> >> +queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
> >> +return ret;
> >> +
> >> +failure_handler:
> >> +if (port_cfg[port_id].ref_cnt == 0) {
> >> +rte_free(port_cfg[port_id].queue_cfg);
> >> +port_cfg[port_id].queue_cfg = NULL;
> >> +}
> >> +return ret;
> >> +}
> >> +
> >> +int
> >> +rte_power_pmd_mgmt_queue_disable(unsigned int lcore_id,
> >> +uint16_t port_id,
> >> +uint16_t queue_id)
> >> +{
> >> +struct pmd_queue_cfg *queue_cfg;
> >> +
> >> +if (port_cfg[port_id].ref_cnt <= 0)
> >> +return -EINVAL;
> >> +
> >> +queue_cfg = &port_cfg[port_id].queue_cfg[queue_id];
> >> +
> >> +if (queue_cfg->pwr_mgmt_state == PMD_MGMT_DISABLED)
> >> +return -EINVAL;
> >> +
> >> +switch (queue_cfg->cb_mode) {
> >> +case RTE_POWER_MGMT_TYPE_WAIT:
> >
> > Think we need wakeup(lcore_id) here.
> 
> Not sure what you mean? Could you please elaborate?
> 
> >
> >> +case RTE_POWER_MGMT_TYPE_PAUSE:
> >> +rte_eth_remove_rx_callback(port_id, queue_id,
> >> +   queue_cfg->cur_cb);
> >> +break;
> >> +case RTE_POWER_MGMT_TYPE_SCALE:
> >> +rte_power_freq_max(lcore_id);
> >> +rte_eth_remove_rx_callback(port_id, queue_id,
> >> +   queue_cfg->cur_cb);
> >> +rte_power_exit(lcore_id);
> >> +break;
> >> +}
> >> +/* it's not recommend to free callback instance here.
> >> + * it cause memory leak which is a known issue.
> >> + */
> >> +queue_cfg->cur_cb = NULL;
> >> +queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
> >> +port_cfg[port_id].ref_cnt--;
> >> +
> >> +if (port_cfg[port_id].ref_cnt == 0) {
> >> +rte_free(port_cfg[port_id].queue_cfg);
> >
> > It is not safe to do so, unless device is already stopped.
> > Otherwise you need some sync mechanism here (hand-made as bpf lib, or rcu online/offline, or...)
> 
> Not sure what you mean. We're not freeing the callback structure, we're
> freeing the local data structure holding the per-port status.

What is the difference?
You still trying to free memory that might be used by your DP thread
that still executes the callback.

> 
> >
> >> +port_cfg[port_id].queue_cfg = NULL;
> >> +}
> >> +return 0;
> >> +}
> 
> 
> --
> Thanks,
> Anatoly
  
Burakov, Anatoly Oct. 9, 2020, 4:56 p.m. UTC | #4
On 09-Oct-20 5:51 PM, Ananyev, Konstantin wrote:

>>>
>>>> +case RTE_POWER_MGMT_TYPE_PAUSE:
>>>> +rte_eth_remove_rx_callback(port_id, queue_id,
>>>> +   queue_cfg->cur_cb);
>>>> +break;
>>>> +case RTE_POWER_MGMT_TYPE_SCALE:
>>>> +rte_power_freq_max(lcore_id);
>>>> +rte_eth_remove_rx_callback(port_id, queue_id,
>>>> +   queue_cfg->cur_cb);
>>>> +rte_power_exit(lcore_id);
>>>> +break;
>>>> +}
>>>> +/* it's not recommend to free callback instance here.
>>>> + * it cause memory leak which is a known issue.
>>>> + */
>>>> +queue_cfg->cur_cb = NULL;
>>>> +queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
>>>> +port_cfg[port_id].ref_cnt--;
>>>> +
>>>> +if (port_cfg[port_id].ref_cnt == 0) {
>>>> +rte_free(port_cfg[port_id].queue_cfg);
>>>
>>> It is not safe to do so, unless device is already stopped.
>>> Otherwise you need some sync mechanism here (hand-made as bpf lib, or rcu online/offline, or...)
>>
>> Not sure what you mean. We're not freeing the callback structure, we're
>> freeing the local data structure holding the per-port status.
> 
> What is the difference?
> You still trying to free memory that might be used by your DP thread
> that still executes the callback.

Welp, you're right :/ I'll see what i can do to fix it.
  

Patch

diff --git a/lib/librte_power/meson.build b/lib/librte_power/meson.build
index 78c031c943..cc3c7a8646 100644
--- a/lib/librte_power/meson.build
+++ b/lib/librte_power/meson.build
@@ -9,6 +9,7 @@  sources = files('rte_power.c', 'power_acpi_cpufreq.c',
 		'power_kvm_vm.c', 'guest_channel.c',
 		'rte_power_empty_poll.c',
 		'power_pstate_cpufreq.c',
+		'rte_power_pmd_mgmt.c',
 		'power_common.c')
-headers = files('rte_power.h','rte_power_empty_poll.h')
-deps += ['timer']
+headers = files('rte_power.h','rte_power_empty_poll.h','rte_power_pmd_mgmt.h')
+deps += ['timer' ,'ethdev']
diff --git a/lib/librte_power/pmd_mgmt.h b/lib/librte_power/pmd_mgmt.h
new file mode 100644
index 0000000000..756fbe20f7
--- /dev/null
+++ b/lib/librte_power/pmd_mgmt.h
@@ -0,0 +1,49 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2020 Intel Corporation
+ */
+
+#ifndef _PMD_MGMT_H
+#define _PMD_MGMT_H
+
+/**
+ * @file
+ * Power Management
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Possible power management states of an ethdev port.
+ */
+enum pmd_mgmt_state {
+	/** Device power management is disabled. */
+	PMD_MGMT_DISABLED = 0,
+	/** Device power management is enabled. */
+	PMD_MGMT_ENABLED,
+};
+
+struct pmd_queue_cfg {
+	enum pmd_mgmt_state pwr_mgmt_state; /**< Power mgmt state */
+	/** Power mgmt callback mode */
+	enum rte_power_pmd_mgmt_type cb_mode;
+	/** Consecutive empty poll counter */
+	uint16_t empty_poll_stats;
+	/** Rx callback instance */
+	const struct rte_eth_rxtx_callback *cur_cb;
+} __rte_cache_aligned;
+
+struct pmd_port_cfg {
+	int  ref_cnt;
+	struct pmd_queue_cfg *queue_cfg;
+} __rte_cache_aligned;
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/lib/librte_power/rte_power_pmd_mgmt.c b/lib/librte_power/rte_power_pmd_mgmt.c
new file mode 100644
index 0000000000..35d2af46a4
--- /dev/null
+++ b/lib/librte_power/rte_power_pmd_mgmt.c
@@ -0,0 +1,208 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2020 Intel Corporation
+ */
+
+#include <rte_lcore.h>
+#include <rte_cycles.h>
+#include <rte_malloc.h>
+#include <rte_ethdev.h>
+#include <rte_power_intrinsics.h>
+
+#include "rte_power_pmd_mgmt.h"
+#include "pmd_mgmt.h"
+
+
+#define EMPTYPOLL_MAX  512
+#define PAUSE_NUM  64
+
+static struct pmd_port_cfg port_cfg[RTE_MAX_ETHPORTS];
+
+static uint16_t
+rte_power_mgmt_umwait(uint16_t port_id, uint16_t qidx,
+		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+		uint16_t max_pkts __rte_unused, void *_  __rte_unused)
+{
+
+	struct pmd_queue_cfg *q_conf;
+	q_conf = &port_cfg[port_id].queue_cfg[qidx];
+
+	if (unlikely(nb_rx == 0)) {
+		q_conf->empty_poll_stats++;
+		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
+			volatile void *target_addr;
+			uint64_t expected, mask;
+			uint16_t ret;
+
+			/*
+			 * get address of next descriptor in the RX
+			 * ring for this queue, as well as expected
+			 * value and a mask.
+			 */
+			ret = rte_eth_get_wake_addr(port_id, qidx,
+						    &target_addr, &expected,
+						    &mask);
+			if (ret == 0)
+				/* -1ULL is maximum value for TSC */
+				rte_power_monitor(target_addr,
+						  expected, mask,
+						  0, -1ULL);
+		}
+	} else
+		q_conf->empty_poll_stats = 0;
+
+	return nb_rx;
+}
+
+static uint16_t
+rte_power_mgmt_pause(uint16_t port_id, uint16_t qidx,
+		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+		uint16_t max_pkts __rte_unused, void *_  __rte_unused)
+{
+	struct pmd_queue_cfg *q_conf;
+	int i;
+	q_conf = &port_cfg[port_id].queue_cfg[qidx];
+
+	if (unlikely(nb_rx == 0)) {
+		q_conf->empty_poll_stats++;
+		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
+			for (i = 0; i < PAUSE_NUM; i++)
+				rte_pause();
+		}
+	} else
+		q_conf->empty_poll_stats = 0;
+
+	return nb_rx;
+}
+
+static uint16_t
+rte_power_mgmt_scalefreq(uint16_t port_id, uint16_t qidx,
+		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+		uint16_t max_pkts __rte_unused, void *_  __rte_unused)
+{
+	struct pmd_queue_cfg *q_conf;
+	q_conf = &port_cfg[port_id].queue_cfg[qidx];
+
+	if (unlikely(nb_rx == 0)) {
+		q_conf->empty_poll_stats++;
+		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
+			/* scale down the core frequency */
+			rte_power_freq_min(rte_lcore_id());
+
+		}
+	} else {
+		q_conf->empty_poll_stats = 0;
+		/* scale up the core frequency */
+		rte_power_freq_max(rte_lcore_id());
+	}
+
+	return nb_rx;
+}
+
+int
+rte_power_pmd_mgmt_queue_enable(unsigned int lcore_id,
+				uint16_t port_id,
+				uint16_t queue_id,
+				enum rte_power_pmd_mgmt_type mode)
+{
+	struct rte_eth_dev *dev;
+	struct pmd_queue_cfg *queue_cfg;
+	int ret = 0;
+
+	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
+	dev = &rte_eth_devices[port_id];
+
+	if (port_cfg[port_id].queue_cfg == NULL) {
+		port_cfg[port_id].ref_cnt = 0;
+		/* allocate memory for empty poll stats */
+		port_cfg[port_id].queue_cfg  = rte_malloc_socket(NULL,
+					sizeof(struct pmd_queue_cfg)
+					* RTE_MAX_QUEUES_PER_PORT,
+					0, dev->data->numa_node);
+		if (port_cfg[port_id].queue_cfg == NULL)
+			return -ENOMEM;
+	}
+
+	queue_cfg = &port_cfg[port_id].queue_cfg[queue_id];
+
+	if (queue_cfg->pwr_mgmt_state == PMD_MGMT_ENABLED) {
+		ret = -EINVAL;
+		goto failure_handler;
+	}
+
+	switch (mode) {
+	case RTE_POWER_MGMT_TYPE_WAIT:
+		if (!rte_cpu_get_flag_enabled(RTE_CPUFLAG_WAITPKG)) {
+			ret = -ENOTSUP;
+			goto failure_handler;
+		}
+		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
+						rte_power_mgmt_umwait, NULL);
+		break;
+	case RTE_POWER_MGMT_TYPE_SCALE:
+		/* init scale freq */
+		if (rte_power_init(lcore_id)) {
+			ret = -EINVAL;
+			goto failure_handler;
+		}
+		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
+					rte_power_mgmt_scalefreq, NULL);
+		break;
+	case RTE_POWER_MGMT_TYPE_PAUSE:
+		queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
+						rte_power_mgmt_pause, NULL);
+		break;
+	}
+	queue_cfg->cb_mode = mode;
+	port_cfg[port_id].ref_cnt++;
+	queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
+	return ret;
+
+failure_handler:
+	if (port_cfg[port_id].ref_cnt == 0) {
+		rte_free(port_cfg[port_id].queue_cfg);
+		port_cfg[port_id].queue_cfg = NULL;
+	}
+	return ret;
+}
+
+int
+rte_power_pmd_mgmt_queue_disable(unsigned int lcore_id,
+				uint16_t port_id,
+				uint16_t queue_id)
+{
+	struct pmd_queue_cfg *queue_cfg;
+
+	if (port_cfg[port_id].ref_cnt <= 0)
+		return -EINVAL;
+
+	queue_cfg = &port_cfg[port_id].queue_cfg[queue_id];
+
+	if (queue_cfg->pwr_mgmt_state == PMD_MGMT_DISABLED)
+		return -EINVAL;
+
+	switch (queue_cfg->cb_mode) {
+	case RTE_POWER_MGMT_TYPE_WAIT:
+	case RTE_POWER_MGMT_TYPE_PAUSE:
+		rte_eth_remove_rx_callback(port_id, queue_id,
+					   queue_cfg->cur_cb);
+		break;
+	case RTE_POWER_MGMT_TYPE_SCALE:
+		rte_power_freq_max(lcore_id);
+		rte_eth_remove_rx_callback(port_id, queue_id,
+					   queue_cfg->cur_cb);
+		rte_power_exit(lcore_id);
+		break;
+	}
+	/* It is not recommended to free the callback instance here;
+	 * doing so causes a memory leak, which is a known issue.
+	 */
+	queue_cfg->cur_cb = NULL;
+	queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
+	port_cfg[port_id].ref_cnt--;
+
+	if (port_cfg[port_id].ref_cnt == 0) {
+		rte_free(port_cfg[port_id].queue_cfg);
+		port_cfg[port_id].queue_cfg = NULL;
+	}
+	return 0;
+}
diff --git a/lib/librte_power/rte_power_pmd_mgmt.h b/lib/librte_power/rte_power_pmd_mgmt.h
new file mode 100644
index 0000000000..8b110f1148
--- /dev/null
+++ b/lib/librte_power/rte_power_pmd_mgmt.h
@@ -0,0 +1,88 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2020 Intel Corporation
+ */
+
+#ifndef _RTE_POWER_PMD_MGMT_H
+#define _RTE_POWER_PMD_MGMT_H
+
+/**
+ * @file
+ * RTE PMD Power Management
+ */
+#include <stdint.h>
+#include <stdbool.h>
+
+#include <rte_common.h>
+#include <rte_byteorder.h>
+#include <rte_log.h>
+#include <rte_power.h>
+#include <rte_atomic.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * PMD Power Management Type
+ */
+enum rte_power_pmd_mgmt_type {
+	/** WAIT callback mode. */
+	RTE_POWER_MGMT_TYPE_WAIT = 1,
+	/** PAUSE callback mode. */
+	RTE_POWER_MGMT_TYPE_PAUSE,
+	/** Freq Scaling callback mode. */
+	RTE_POWER_MGMT_TYPE_SCALE,
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Setup per-queue power management callback.
+ * @param lcore_id
+ *   lcore_id.
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The queue identifier of the Ethernet device.
+ * @param mode
+ *   The power management callback function type.
+ *
+ * @return
+ *   0 on success
+ *   <0 on error
+ */
+
+__rte_experimental
+int
+rte_power_pmd_mgmt_queue_enable(unsigned int lcore_id,
+				uint16_t port_id,
+				uint16_t queue_id,
+				enum rte_power_pmd_mgmt_type mode);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Remove per-queue power management callback.
+ * @param lcore_id
+ *   lcore_id.
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The queue identifier of the Ethernet device.
+ * @return
+ *   0 on success
+ *   <0 on error
+ */
+
+__rte_experimental
+int
+rte_power_pmd_mgmt_queue_disable(unsigned int lcore_id,
+				uint16_t port_id,
+				uint16_t queue_id);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/lib/librte_power/rte_power_version.map b/lib/librte_power/rte_power_version.map
index 69ca9af616..3f2f6cd6f6 100644
--- a/lib/librte_power/rte_power_version.map
+++ b/lib/librte_power/rte_power_version.map
@@ -34,4 +34,8 @@  EXPERIMENTAL {
 	rte_power_guest_channel_receive_msg;
 	rte_power_poll_stat_fetch;
 	rte_power_poll_stat_update;
+	# added in 20.11
+	rte_power_pmd_mgmt_queue_enable;
+	rte_power_pmd_mgmt_queue_disable;
+
 };