@@ -192,6 +192,47 @@ User Cases
----------
The mechanism can be applied to any device that is based on polling, e.g. NIC, FPGA.
+PMD Power Management API
+------------------------
+
+Abstract
+~~~~~~~~
+Existing power management mechanisms require developers to change the
+application design or modify code to make use of them. The PMD power
+management API provides a convenient alternative: it utilizes Ethernet PMD RX
+callbacks, and triggers power saving whenever the number of consecutive empty
+polls reaches a threshold.
+
+ * Monitor
+
+  This power saving scheme will put the CPU into an optimized power state and
+  use the ``rte_power_monitor()`` function to monitor the Ethernet PMD RX
+  descriptor address, waking the CPU up whenever there is new traffic.
+
+ * Pause
+
+  This power saving scheme will avoid busy polling by either entering a
+  power-optimized sleep state with the ``rte_power_pause()`` function, or, if
+  it is not available, using ``rte_pause()``.
+
+ * Frequency scaling
+
+  This power saving scheme will use the existing ``librte_power`` library
+  functionality to scale the core frequency up or down depending on traffic
+  volume.
+
+
+.. note::
+
+  Currently, this power management API is limited to a mandatory mapping of 1
+  queue to 1 core (multiple queues are supported, but each must be polled from
+  a different core).
+
+API Overview for PMD Power Management
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+* **Queue Enable**: Enable the specified power scheme for a certain
+  queue/port/core combination (see the usage sketch below)
+
+* **Queue Disable**: Disable the power scheme for a certain queue/port/core
+  combination
+
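+A minimal usage sketch, assuming port 0, RX queue 0, polled from the current
+lcore (error handling and device setup abbreviated):
+
+.. code-block:: c
+
+   #include <rte_lcore.h>
+   #include <rte_debug.h>
+   #include <rte_power_pmd_mgmt.h>
+
+   /* enable monitor-based power management for port 0, RX queue 0 */
+   int ret = rte_power_pmd_mgmt_queue_enable(rte_lcore_id(), 0, 0,
+           RTE_POWER_MGMT_TYPE_MONITOR);
+   if (ret < 0)
+       rte_exit(EXIT_FAILURE, "Cannot enable PMD power management\n");
+
+   /* ... poll the RX queue as usual; the callback does the power saving ... */
+
+   /* disable power management before stopping the port */
+   rte_power_pmd_mgmt_queue_disable(rte_lcore_id(), 0, 0);
+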
References
----------
@@ -200,3 +241,6 @@ References
* The :doc:`../sample_app_ug/vm_power_management`
chapter in the :doc:`../sample_app_ug/index` section.
+
+* The :doc:`../sample_app_ug/rxtx_callbacks`
+ chapter in the :doc:`../sample_app_ug/index` section.
@@ -60,6 +60,16 @@ New Features
* ``rte_eth_get_monitor_addr()``, to be used in conjunction with
``rte_power_monitor()`` to enable automatic power management for PMD's.
+* **Added PMD power management helper API.**
+
+  A new helper API has been added to make using Ethernet PMD power management
+  easier for the user: ``rte_power_pmd_mgmt_queue_enable()`` and
+  ``rte_power_pmd_mgmt_queue_disable()``. Three power management schemes are
+  supported initially:
+
+ * Power saving based on UMWAIT instruction (x86 only)
+ * Power saving based on ``rte_pause()`` (generic) or TPAUSE instruction (x86 only)
+ * Power saving based on frequency scaling through the ``librte_power`` library
+
Removed Items
-------------
@@ -9,6 +9,7 @@ sources = files('rte_power.c', 'power_acpi_cpufreq.c',
'power_kvm_vm.c', 'guest_channel.c',
'rte_power_empty_poll.c',
'power_pstate_cpufreq.c',
+ 'rte_power_pmd_mgmt.c',
'power_common.c')
-headers = files('rte_power.h','rte_power_empty_poll.h')
-deps += ['timer']
+headers = files('rte_power.h','rte_power_empty_poll.h','rte_power_pmd_mgmt.h')
+deps += ['timer', 'ethdev']
new file mode 100644
@@ -0,0 +1,359 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2020 Intel Corporation
+ */
+
+#include <rte_lcore.h>
+#include <rte_cycles.h>
+#include <rte_cpuflags.h>
+#include <rte_malloc.h>
+#include <rte_ethdev.h>
+#include <rte_power_intrinsics.h>
+
+#include "rte_power_pmd_mgmt.h"
+
+#define EMPTYPOLL_MAX 512
+
+static struct pmd_conf_data {
+ struct rte_cpu_intrinsics intrinsics_support;
+ /**< what do we support? */
+ uint64_t tsc_per_us;
+ /**< pre-calculated tsc diff for 1us */
+ uint64_t pause_per_us;
+	/**< how many rte_pause() calls can we fit in a microsecond? */
+} global_data;
+
+/**
+ * Possible power management states of an ethdev port.
+ */
+enum pmd_mgmt_state {
+ /** Device power management is disabled. */
+ PMD_MGMT_DISABLED = 0,
+ /** Device power management is enabled. */
+ PMD_MGMT_ENABLED,
+	/** Device power management status is about to change. */
+ PMD_MGMT_BUSY
+};
+
+struct pmd_queue_cfg {
+ volatile enum pmd_mgmt_state pwr_mgmt_state;
+ /**< State of power management for this queue */
+ enum rte_power_pmd_mgmt_type cb_mode;
+ /**< Callback mode for this queue */
+ const struct rte_eth_rxtx_callback *cur_cb;
+ /**< Callback instance */
+ volatile bool umwait_in_progress;
+ /**< are we currently sleeping? */
+ uint64_t empty_poll_stats;
+ /**< Number of empty polls */
+} __rte_cache_aligned;
+
+static struct pmd_queue_cfg port_cfg[RTE_MAX_ETHPORTS][RTE_MAX_QUEUES_PER_PORT];
+
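+/*
+ * Pre-compute TSC timing data: the number of TSC ticks per microsecond and,
+ * when the TPAUSE instruction is not available, an estimate of how many
+ * rte_pause() calls fit into one microsecond.
+ */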
+static void
+calc_tsc(void)
+{
+ const uint64_t hz = rte_get_timer_hz();
+ const uint64_t tsc_per_us = hz / US_PER_S; /* 1us */
+
+ global_data.tsc_per_us = tsc_per_us;
+
+ /* only do this if we don't have tpause */
+ if (!global_data.intrinsics_support.power_pause) {
+ const uint64_t start = rte_rdtsc_precise();
+ const uint32_t n_pauses = 10000;
+ double us, us_per_pause;
+ uint64_t end;
+ unsigned int i;
+
+		/* estimate the number of rte_pause() calls per microsecond */
+ for (i = 0; i < n_pauses; i++)
+ rte_pause();
+
+ end = rte_rdtsc_precise();
+ us = (end - start) / (double)tsc_per_us;
+ us_per_pause = us / n_pauses;
+
+ global_data.pause_per_us = (uint64_t)(1.0 / us_per_pause);
+ }
+}
+
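+/*
+ * RX callback for the MONITOR scheme: after EMPTYPOLL_MAX consecutive empty
+ * polls, arm a monitoring condition on the RX descriptor address and put the
+ * core to sleep until traffic arrives or a wakeup is requested.
+ */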
+static uint16_t
+clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
+ uint16_t nb_rx, uint16_t max_pkts __rte_unused,
+ void *addr __rte_unused)
+{
+	struct pmd_queue_cfg *q_conf;
+
+ q_conf = &port_cfg[port_id][qidx];
+
+ if (unlikely(nb_rx == 0)) {
+ q_conf->empty_poll_stats++;
+ if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
+ struct rte_power_monitor_cond pmc;
+ uint16_t ret;
+			int ret;
+ /*
+ * we might get a cancellation request while being
+ * inside the callback, in which case the wakeup
+ * wouldn't work because it would've arrived too early.
+ *
+ * to get around this, we notify the other thread that
+ * we're sleeping, so that it can spin until we're done.
+ * unsolicited wakeups are perfectly safe.
+ */
+ q_conf->umwait_in_progress = true;
+
+			/* only sleep if a cancellation wasn't requested */
+ if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) {
+ /* use monitoring condition to sleep */
+ ret = rte_eth_get_monitor_addr(port_id, qidx,
+ &pmc);
+ if (ret == 0)
+ rte_power_monitor(&pmc, -1ULL);
+ }
+ q_conf->umwait_in_progress = false;
+ }
+ } else
+ q_conf->empty_poll_stats = 0;
+
+ return nb_rx;
+}
+
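+/*
+ * RX callback for the PAUSE scheme: after EMPTYPOLL_MAX consecutive empty
+ * polls, sleep for roughly one microsecond, either via the TPAUSE-backed
+ * rte_power_pause() or via a calibrated rte_pause() loop.
+ */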
+static uint16_t
+clb_pause(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
+ uint16_t nb_rx, uint16_t max_pkts __rte_unused,
+ void *addr __rte_unused)
+{
+ struct pmd_queue_cfg *q_conf;
+
+ q_conf = &port_cfg[port_id][qidx];
+
+ if (unlikely(nb_rx == 0)) {
+ q_conf->empty_poll_stats++;
+		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
+			/* sleep for ~1 microsecond, using TPAUSE if available */
+			if (global_data.intrinsics_support.power_pause) {
+ const uint64_t cur = rte_rdtsc();
+ const uint64_t wait_tsc =
+ cur + global_data.tsc_per_us;
+ rte_power_pause(wait_tsc);
+ } else {
+ uint64_t i;
+ for (i = 0; i < global_data.pause_per_us; i++)
+ rte_pause();
+ }
+ }
+ } else
+ q_conf->empty_poll_stats = 0;
+
+ return nb_rx;
+}
+
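+/*
+ * RX callback for the SCALE scheme: scale the core frequency down to its
+ * minimum after a run of empty polls, and back up to its maximum as soon as
+ * traffic reappears.
+ */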
+static uint16_t
+clb_scale_freq(uint16_t port_id, uint16_t qidx,
+ struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+ uint16_t max_pkts __rte_unused, void *_ __rte_unused)
+{
+ struct pmd_queue_cfg *q_conf;
+
+ q_conf = &port_cfg[port_id][qidx];
+
+ if (unlikely(nb_rx == 0)) {
+ q_conf->empty_poll_stats++;
+ if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX))
+ /* scale down freq */
+ rte_power_freq_min(rte_lcore_id());
+ } else {
+ q_conf->empty_poll_stats = 0;
+ /* scale up freq */
+ rte_power_freq_max(rte_lcore_id());
+ }
+
+ return nb_rx;
+}
+
+int
+rte_power_pmd_mgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
+ uint16_t queue_id, enum rte_power_pmd_mgmt_type mode)
+{
+ struct pmd_queue_cfg *queue_cfg;
+ struct rte_eth_dev_info info;
+ int ret;
+
+ RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
+
+ if (queue_id >= RTE_MAX_QUEUES_PER_PORT || lcore_id >= RTE_MAX_LCORE) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ if (rte_eth_dev_info_get(port_id, &info) < 0) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ /* check if queue id is valid */
+ if (queue_id >= info.nb_rx_queues) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ queue_cfg = &port_cfg[port_id][queue_id];
+
+ if (queue_cfg->pwr_mgmt_state != PMD_MGMT_DISABLED) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ /* we're about to change our state */
+ queue_cfg->pwr_mgmt_state = PMD_MGMT_BUSY;
+
+	/* query intrinsics support; the result is used by the schemes below */
+ rte_cpu_get_intrinsics_support(&global_data.intrinsics_support);
+
+ switch (mode) {
+ case RTE_POWER_MGMT_TYPE_MONITOR:
+ {
+ struct rte_power_monitor_cond dummy;
+
+ /* check if rte_power_monitor is supported */
+ if (!global_data.intrinsics_support.power_monitor) {
+ RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n");
+ ret = -ENOTSUP;
+ goto rollback;
+ }
+
+ /* check if the device supports the necessary PMD API */
+ if (rte_eth_get_monitor_addr(port_id, queue_id,
+ &dummy) == -ENOTSUP) {
+ RTE_LOG(DEBUG, POWER, "The device does not support rte_eth_get_monitor_addr\n");
+ ret = -ENOTSUP;
+ goto rollback;
+ }
+ /* initialize data before enabling the callback */
+ queue_cfg->empty_poll_stats = 0;
+ queue_cfg->cb_mode = mode;
+ queue_cfg->umwait_in_progress = false;
+ queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
+
+ queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
+ clb_umwait, NULL);
+ break;
+ }
+ case RTE_POWER_MGMT_TYPE_SCALE:
+ {
+ enum power_management_env env;
+ /* only PSTATE and ACPI modes are supported */
+ if (!rte_power_check_env_supported(PM_ENV_ACPI_CPUFREQ) &&
+ !rte_power_check_env_supported(
+ PM_ENV_PSTATE_CPUFREQ)) {
+ RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes are supported\n");
+ ret = -ENOTSUP;
+ goto rollback;
+ }
+		/* ensure the power library can be initialized */
+ if (rte_power_init(lcore_id)) {
+ ret = -EINVAL;
+ goto rollback;
+ }
+ /* ensure we initialized the correct env */
+ env = rte_power_get_env();
+ if (env != PM_ENV_ACPI_CPUFREQ &&
+ env != PM_ENV_PSTATE_CPUFREQ) {
+ RTE_LOG(DEBUG, POWER, "Neither ACPI nor PSTATE modes were initialized\n");
+ ret = -ENOTSUP;
+ goto rollback;
+ }
+ /* initialize data before enabling the callback */
+ queue_cfg->empty_poll_stats = 0;
+ queue_cfg->cb_mode = mode;
+ queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
+
+ queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id,
+ queue_id, clb_scale_freq, NULL);
+ break;
+ }
+ case RTE_POWER_MGMT_TYPE_PAUSE:
+ /* figure out various time-to-tsc conversions */
+ if (global_data.tsc_per_us == 0)
+ calc_tsc();
+
+ /* initialize data before enabling the callback */
+ queue_cfg->empty_poll_stats = 0;
+ queue_cfg->cb_mode = mode;
+ queue_cfg->pwr_mgmt_state = PMD_MGMT_ENABLED;
+
+ queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id,
+ clb_pause, NULL);
+		break;
+	default:
+		/* a defensive guard (assumption: unknown mode values are
+		 * rejected here) so that an invalid mode does not leave the
+		 * queue stuck in the BUSY state.
+		 */
+		ret = -EINVAL;
+		goto rollback;
+	}
+ ret = 0;
+
+ return ret;
+
+rollback:
+ queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
+end:
+ return ret;
+}
+
+int
+rte_power_pmd_mgmt_queue_disable(unsigned int lcore_id,
+ uint16_t port_id, uint16_t queue_id)
+{
+ struct pmd_queue_cfg *queue_cfg;
+
+ RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -EINVAL);
+
+ if (lcore_id >= RTE_MAX_LCORE || queue_id >= RTE_MAX_QUEUES_PER_PORT)
+ return -EINVAL;
+
+ /* no need to check queue id as wrong queue id would not be enabled */
+ queue_cfg = &port_cfg[port_id][queue_id];
+
+ if (queue_cfg->pwr_mgmt_state != PMD_MGMT_ENABLED)
+ return -EINVAL;
+
+ /* let the callback know we're shutting down */
+ queue_cfg->pwr_mgmt_state = PMD_MGMT_BUSY;
+
+ switch (queue_cfg->cb_mode) {
+ case RTE_POWER_MGMT_TYPE_MONITOR:
+ {
+ bool exit = false;
+ do {
+ /*
+ * we may request cancellation while the other thread
+ * has just entered the callback but hasn't started
+ * sleeping yet, so keep waking it up until we know it's
+ * done sleeping.
+ */
+ if (queue_cfg->umwait_in_progress)
+ rte_power_monitor_wakeup(lcore_id);
+ else
+ exit = true;
+ } while (!exit);
+ }
+ /* fall-through */
+ case RTE_POWER_MGMT_TYPE_PAUSE:
+ rte_eth_remove_rx_callback(port_id, queue_id,
+ queue_cfg->cur_cb);
+ break;
+ case RTE_POWER_MGMT_TYPE_SCALE:
+ rte_power_freq_max(lcore_id);
+ rte_eth_remove_rx_callback(port_id, queue_id,
+ queue_cfg->cur_cb);
+ rte_power_exit(lcore_id);
+ break;
+ }
+ /*
+ * we don't free the RX callback here because it is unsafe to do so
+ * unless we know for a fact that all data plane threads have stopped.
+ */
+ queue_cfg->cur_cb = NULL;
+ queue_cfg->pwr_mgmt_state = PMD_MGMT_DISABLED;
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2020 Intel Corporation
+ */
+
+#ifndef _RTE_POWER_PMD_MGMT_H
+#define _RTE_POWER_PMD_MGMT_H
+
+/**
+ * @file
+ * RTE PMD Power Management
+ */
+#include <stdint.h>
+#include <stdbool.h>
+
+#include <rte_common.h>
+#include <rte_byteorder.h>
+#include <rte_log.h>
+#include <rte_power.h>
+#include <rte_atomic.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * PMD Power Management Type
+ */
+enum rte_power_pmd_mgmt_type {
+ /** Use power-optimized monitoring to wait for incoming traffic */
+ RTE_POWER_MGMT_TYPE_MONITOR = 1,
+ /** Use power-optimized sleep to avoid busy polling */
+ RTE_POWER_MGMT_TYPE_PAUSE,
+ /** Use frequency scaling when traffic is low */
+ RTE_POWER_MGMT_TYPE_SCALE,
+};
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Enable power management on a specified RX queue and lcore.
+ *
+ * @note This function is not thread-safe.
+ *
+ * @param lcore_id
+ *   The lcore the RX queue will be polled from.
+ * @param port_id
+ * The port identifier of the Ethernet device.
+ * @param queue_id
+ * The queue identifier of the Ethernet device.
+ * @param mode
+ * The power management callback function type.
+ *
+ * @return
+ * 0 on success
+ * <0 on error
+ */
+__rte_experimental
+int
+rte_power_pmd_mgmt_queue_enable(unsigned int lcore_id,
+ uint16_t port_id, uint16_t queue_id,
+ enum rte_power_pmd_mgmt_type mode);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Disable power management on a specified RX queue and lcore.
+ *
+ * @note This function is not thread-safe.
+ *
+ * @param lcore_id
+ *   The lcore the RX queue is polled from.
+ * @param port_id
+ * The port identifier of the Ethernet device.
+ * @param queue_id
+ * The queue identifier of the Ethernet device.
+ * @return
+ * 0 on success
+ * <0 on error
+ */
+__rte_experimental
+int
+rte_power_pmd_mgmt_queue_disable(unsigned int lcore_id,
+ uint16_t port_id, uint16_t queue_id);
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_POWER_PMD_MGMT_H */
@@ -34,4 +34,9 @@ EXPERIMENTAL {
rte_power_guest_channel_receive_msg;
rte_power_poll_stat_fetch;
rte_power_poll_stat_update;
+
+ # added in 21.02
+	rte_power_pmd_mgmt_queue_disable;
+	rte_power_pmd_mgmt_queue_enable;
};