[v3,6/7] power: support monitoring multiple Rx queues

Message ID 676eab0e1eb6c63acb170893675daa5a39eac29d.1624884053.git.anatoly.burakov@intel.com (mailing list archive)
State Superseded, archived
Headers
Series Enhancements for PMD power management |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Burakov, Anatoly June 28, 2021, 12:41 p.m. UTC
  Use the new multi-monitor intrinsic to allow monitoring multiple ethdev
Rx queues while entering the energy efficient power state. The multi
version will be used unconditionally if supported, and the UMWAIT one
will only be used when multi-monitor is not supported by the hardware.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 doc/guides/prog_guide/power_man.rst |  9 ++--
 lib/power/rte_power_pmd_mgmt.c      | 76 ++++++++++++++++++++++++++++-
 2 files changed, 80 insertions(+), 5 deletions(-)
  

Comments

Ananyev, Konstantin June 28, 2021, 1:29 p.m. UTC | #1
> Use the new multi-monitor intrinsic to allow monitoring multiple ethdev
> Rx queues while entering the energy efficient power state. The multi
> version will be used unconditionally if supported, and the UMWAIT one
> will only be used when multi-monitor is not supported by the hardware.
> 
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> ---
>  doc/guides/prog_guide/power_man.rst |  9 ++--
>  lib/power/rte_power_pmd_mgmt.c      | 76 ++++++++++++++++++++++++++++-
>  2 files changed, 80 insertions(+), 5 deletions(-)
> 
> diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst
> index fac2c19516..3245a5ebed 100644
> --- a/doc/guides/prog_guide/power_man.rst
> +++ b/doc/guides/prog_guide/power_man.rst
> @@ -221,13 +221,16 @@ power saving whenever empty poll count reaches a certain number.
>  The "monitor" mode is only supported in the following configurations and scenarios:
> 
>  * If ``rte_cpu_get_intrinsics_support()`` function indicates that
> +  ``rte_power_monitor_multi()`` function is supported by the platform, then
> +  monitoring multiple Ethernet Rx queues for traffic will be supported.
> +
> +* If ``rte_cpu_get_intrinsics_support()`` function indicates that only
>    ``rte_power_monitor()`` is supported by the platform, then monitoring will be
>    limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be
>    monitored from a different lcore).
> 
> -* If ``rte_cpu_get_intrinsics_support()`` function indicates that the
> -  ``rte_power_monitor()`` function is not supported, then monitor mode will not
> -  be supported.
> +* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of the
> +  two monitoring functions are supported, then monitor mode will not be supported.
> 
>  * Not all Ethernet devices support monitoring, even if the underlying
>    platform may support the necessary CPU instructions. Please refer to
> diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
> index 7762cd39b8..aab2d4f1ee 100644
> --- a/lib/power/rte_power_pmd_mgmt.c
> +++ b/lib/power/rte_power_pmd_mgmt.c
> @@ -155,6 +155,24 @@ queue_list_remove(struct pmd_core_cfg *cfg, const union queue *q)
>  	return 0;
>  }
> 
> +static inline int
> +get_monitor_addresses(struct pmd_core_cfg *cfg,
> +		struct rte_power_monitor_cond *pmc)
> +{
> +	const struct queue_list_entry *qle;
> +	size_t i = 0;
> +	int ret;
> +
> +	TAILQ_FOREACH(qle, &cfg->head, next) {
> +		struct rte_power_monitor_cond *cur = &pmc[i];

Looks like you never increment 'i' value inside that function.
Also it probably will be safer to add 'num' parameter to check that
we will never over-run pmc[] boundaries.

> +		const union queue *q = &qle->queue;
> +		ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur);
> +		if (ret < 0)
> +			return ret;
> +	}
> +	return 0;
> +}
> +
>  static void
>  calc_tsc(void)
>  {
> @@ -183,6 +201,48 @@ calc_tsc(void)
>  	}
>  }
> 
> +static uint16_t
> +clb_multiwait(uint16_t port_id, uint16_t qidx,
> +		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> +		uint16_t max_pkts __rte_unused, void *addr __rte_unused)
> +{
> +	const unsigned int lcore = rte_lcore_id();
> +	const union queue q = {.portid = port_id, .qid = qidx};
> +	const bool empty = nb_rx == 0;
> +	struct pmd_core_cfg *q_conf;
> +
> +	q_conf = &lcore_cfg[lcore];
> +
> +	/* early exit */
> +	if (likely(!empty)) {
> +		q_conf->empty_poll_stats = 0;
> +	} else {
> +		/* do we care about this particular queue? */
> +		if (!queue_is_power_save(q_conf, &q))
> +			return nb_rx;

I still don't understand the need of 'special' power_save queue here...
Why we can't just have a function:

get_number_of_queues_whose_sequential_empty_polls_less_then_threshold(struct pmd_core_cfg *lcore_cfg),
and then just:

/* all queues have at least EMPTYPOLL_MAX sequential empty polls */
if (get_number_of_queues_whose_sequential_empty_polls_less_then_threshold(q_conf) == 0) {
    /* go into power-save mode here */
}

> +
> +		/*
> +		 * we can increment unconditionally here because if there were
> +		 * non-empty polls in other queues assigned to this core, we
> +		 * dropped the counter to zero anyway.
> +		 */
> +		q_conf->empty_poll_stats++;
> +		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
> +			struct rte_power_monitor_cond pmc[RTE_MAX_ETHPORTS];

I think you need here:
struct rte_power_monitor_cond pmc[q_conf->n_queues];


> +			uint16_t ret;
> +
> +			/* gather all monitoring conditions */
> +			ret = get_monitor_addresses(q_conf, pmc);
> +
> +			if (ret == 0)
> +				rte_power_monitor_multi(pmc,
> +					q_conf->n_queues, UINT64_MAX);
> +		}
> +	}
> +
> +	return nb_rx;
> +}
> +
>  static uint16_t
>  clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
>  		uint16_t nb_rx, uint16_t max_pkts __rte_unused,
> @@ -348,14 +408,19 @@ static int
>  check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata)
>  {
>  	struct rte_power_monitor_cond dummy;
> +	bool multimonitor_supported;
> 
>  	/* check if rte_power_monitor is supported */
>  	if (!global_data.intrinsics_support.power_monitor) {
>  		RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n");
>  		return -ENOTSUP;
>  	}
> +	/* check if multi-monitor is supported */
> +	multimonitor_supported =
> +			global_data.intrinsics_support.power_monitor_multi;
> 
> -	if (cfg->n_queues > 0) {
> +	/* if we're adding a new queue, do we support multiple queues? */
> +	if (cfg->n_queues > 0 && !multimonitor_supported) {
>  		RTE_LOG(DEBUG, POWER, "Monitoring multiple queues is not supported\n");
>  		return -ENOTSUP;
>  	}
> @@ -371,6 +436,13 @@ check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata)
>  	return 0;
>  }
> 
> +static inline rte_rx_callback_fn
> +get_monitor_callback(void)
> +{
> +	return global_data.intrinsics_support.power_monitor_multi ?
> +		clb_multiwait : clb_umwait;
> +}
> +
>  int
>  rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
>  		uint16_t queue_id, enum rte_power_pmd_mgmt_type mode)
> @@ -434,7 +506,7 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
>  		if (ret < 0)
>  			goto end;
> 
> -		clb = clb_umwait;
> +		clb = get_monitor_callback();
>  		break;
>  	case RTE_POWER_MGMT_TYPE_SCALE:
>  		/* check if we can add a new queue */
> --
> 2.25.1
  
Burakov, Anatoly June 28, 2021, 2:09 p.m. UTC | #2
On 28-Jun-21 2:29 PM, Ananyev, Konstantin wrote:
> 
> 
>> Use the new multi-monitor intrinsic to allow monitoring multiple ethdev
>> Rx queues while entering the energy efficient power state. The multi
>> version will be used unconditionally if supported, and the UMWAIT one
>> will only be used when multi-monitor is not supported by the hardware.
>>
>> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
>> ---
>>   doc/guides/prog_guide/power_man.rst |  9 ++--
>>   lib/power/rte_power_pmd_mgmt.c      | 76 ++++++++++++++++++++++++++++-
>>   2 files changed, 80 insertions(+), 5 deletions(-)
>>
>> diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst
>> index fac2c19516..3245a5ebed 100644
>> --- a/doc/guides/prog_guide/power_man.rst
>> +++ b/doc/guides/prog_guide/power_man.rst
>> @@ -221,13 +221,16 @@ power saving whenever empty poll count reaches a certain number.
>>   The "monitor" mode is only supported in the following configurations and scenarios:
>>
>>   * If ``rte_cpu_get_intrinsics_support()`` function indicates that
>> +  ``rte_power_monitor_multi()`` function is supported by the platform, then
>> +  monitoring multiple Ethernet Rx queues for traffic will be supported.
>> +
>> +* If ``rte_cpu_get_intrinsics_support()`` function indicates that only
>>     ``rte_power_monitor()`` is supported by the platform, then monitoring will be
>>     limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be
>>     monitored from a different lcore).
>>
>> -* If ``rte_cpu_get_intrinsics_support()`` function indicates that the
>> -  ``rte_power_monitor()`` function is not supported, then monitor mode will not
>> -  be supported.
>> +* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of the
>> +  two monitoring functions are supported, then monitor mode will not be supported.
>>
>>   * Not all Ethernet devices support monitoring, even if the underlying
>>     platform may support the necessary CPU instructions. Please refer to
>> diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
>> index 7762cd39b8..aab2d4f1ee 100644
>> --- a/lib/power/rte_power_pmd_mgmt.c
>> +++ b/lib/power/rte_power_pmd_mgmt.c
>> @@ -155,6 +155,24 @@ queue_list_remove(struct pmd_core_cfg *cfg, const union queue *q)
>>        return 0;
>>   }
>>
>> +static inline int
>> +get_monitor_addresses(struct pmd_core_cfg *cfg,
>> +             struct rte_power_monitor_cond *pmc)
>> +{
>> +     const struct queue_list_entry *qle;
>> +     size_t i = 0;
>> +     int ret;
>> +
>> +     TAILQ_FOREACH(qle, &cfg->head, next) {
>> +             struct rte_power_monitor_cond *cur = &pmc[i];
> 
> Looks like you never increment 'i' value inside that function.
> Also it probably will be safer to add 'num' parameter to check that
> we will never over-run pmc[] boundaries.

Will fix in v4, good catch!

> 
>> +             const union queue *q = &qle->queue;
>> +             ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur);
>> +             if (ret < 0)
>> +                     return ret;
>> +     }
>> +     return 0;
>> +}
>> +
>>   static void
>>   calc_tsc(void)
>>   {
>> @@ -183,6 +201,48 @@ calc_tsc(void)
>>        }
>>   }
>>
>> +static uint16_t
>> +clb_multiwait(uint16_t port_id, uint16_t qidx,
>> +             struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
>> +             uint16_t max_pkts __rte_unused, void *addr __rte_unused)
>> +{
>> +     const unsigned int lcore = rte_lcore_id();
>> +     const union queue q = {.portid = port_id, .qid = qidx};
>> +     const bool empty = nb_rx == 0;
>> +     struct pmd_core_cfg *q_conf;
>> +
>> +     q_conf = &lcore_cfg[lcore];
>> +
>> +     /* early exit */
>> +     if (likely(!empty)) {
>> +             q_conf->empty_poll_stats = 0;
>> +     } else {
>> +             /* do we care about this particular queue? */
>> +             if (!queue_is_power_save(q_conf, &q))
>> +                     return nb_rx;
> 
> I still don't understand the need of 'special' power_save queue here...
> Why we can't just have a function:
> 
> get_number_of_queues_whose_sequential_empty_polls_less_then_threshold(struct pmd_core_cfg *lcore_cfg),
> and then just:
> 
> /* all queues have at least EMPTYPOLL_MAX sequential empty polls */
> if (get_number_of_queues_whose_sequential_empty_polls_less_then_threshold(q_conf) == 0) {
>      /* go into power-save mode here */
> }

Okay, let's go through this step by step :)

Let's suppose we have three queues - q0, q1 and q2. We want to sleep 
whenever there's no traffic on *all of them*, however we cannot know 
that until we have checked all of them.

So, let's suppose that q0, q1 and q2 were empty all this time, but now 
some traffic arrived at q2 while we're still checking q0. We see that q0 
is empty, and all of the queues were empty for the last N polls, so we 
think we will be safe to sleep at q0 despite the fact that traffic has 
just arrived at q2.

This is not an issue with MONITOR mode because we will be able to see if 
current Rx ring descriptor is busy or not via the NIC callback, *but 
this is not possible* with PAUSE and SCALE modes, because they don't 
have the sneaky lookahead function of MONITOR! So, with PAUSE and SCALE 
modes, it is possible to end up in a situation where you *think* you 
don't have any traffic, but you actually do, you just haven't checked 
the relevant queue yet.

In order to prevent this from happening, we do not sleep on every queue, 
instead we sleep *once* per loop. That is, we check q0, check q1, check 
q2, and only then we decide whether we want to sleep or not.

Of course, with such scheme it is still possible to e.g. sleep in q2 
while there's traffic waiting in q0, but worst case is less bad with 
this scheme, because we'll be doing at worst 1 extra sleep.

Whereas with what you're suggesting, if we had e.g. 10 queues to poll, 
and we checked q1 but traffic has just arrived at q0, we'll be sleeping 
at q1, then we'll be sleeping at q2, then we'll be sleeping at q3, then 
we'll be sleeping at q4, then we'll be sleeping at q5.... and 9 sleeps 
later we finally reach q0 and find out after all this time that we 
shouldn't have slept in the first place. Hopefully you get the point now :)

So, the idea here is, for any N queues, sleep only once, not N times.

> 
>> +
>> +             /*
>> +              * we can increment unconditionally here because if there were
>> +              * non-empty polls in other queues assigned to this core, we
>> +              * dropped the counter to zero anyway.
>> +              */
>> +             q_conf->empty_poll_stats++;
>> +             if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
>> +                     struct rte_power_monitor_cond pmc[RTE_MAX_ETHPORTS];
> 
> I think you need here:
> struct rte_power_monitor_cond pmc[q_conf->n_queues];

I think VLA's are generally agreed upon to be unsafe, so i'm avoiding 
them here.

> 
> 
>> +                     uint16_t ret;
>> +
>> +                     /* gather all monitoring conditions */
>> +                     ret = get_monitor_addresses(q_conf, pmc);
>> +
>> +                     if (ret == 0)
>> +                             rte_power_monitor_multi(pmc,
>> +                                     q_conf->n_queues, UINT64_MAX);
>> +             }
>> +     }
>> +
>> +     return nb_rx;
>> +}
>> +
>>   static uint16_t
>>   clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
>>                uint16_t nb_rx, uint16_t max_pkts __rte_unused,
>> @@ -348,14 +408,19 @@ static int
>>   check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata)
>>   {
>>        struct rte_power_monitor_cond dummy;
>> +     bool multimonitor_supported;
>>
>>        /* check if rte_power_monitor is supported */
>>        if (!global_data.intrinsics_support.power_monitor) {
>>                RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n");
>>                return -ENOTSUP;
>>        }
>> +     /* check if multi-monitor is supported */
>> +     multimonitor_supported =
>> +                     global_data.intrinsics_support.power_monitor_multi;
>>
>> -     if (cfg->n_queues > 0) {
>> +     /* if we're adding a new queue, do we support multiple queues? */
>> +     if (cfg->n_queues > 0 && !multimonitor_supported) {
>>                RTE_LOG(DEBUG, POWER, "Monitoring multiple queues is not supported\n");
>>                return -ENOTSUP;
>>        }
>> @@ -371,6 +436,13 @@ check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata)
>>        return 0;
>>   }
>>
>> +static inline rte_rx_callback_fn
>> +get_monitor_callback(void)
>> +{
>> +     return global_data.intrinsics_support.power_monitor_multi ?
>> +             clb_multiwait : clb_umwait;
>> +}
>> +
>>   int
>>   rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
>>                uint16_t queue_id, enum rte_power_pmd_mgmt_type mode)
>> @@ -434,7 +506,7 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
>>                if (ret < 0)
>>                        goto end;
>>
>> -             clb = clb_umwait;
>> +             clb = get_monitor_callback();
>>                break;
>>        case RTE_POWER_MGMT_TYPE_SCALE:
>>                /* check if we can add a new queue */
>> --
>> 2.25.1
>
  
Ananyev, Konstantin June 29, 2021, 12:07 a.m. UTC | #3
> >> Use the new multi-monitor intrinsic to allow monitoring multiple ethdev
> >> Rx queues while entering the energy efficient power state. The multi
> >> version will be used unconditionally if supported, and the UMWAIT one
> >> will only be used when multi-monitor is not supported by the hardware.
> >>
> >> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> >> ---
> >>   doc/guides/prog_guide/power_man.rst |  9 ++--
> >>   lib/power/rte_power_pmd_mgmt.c      | 76 ++++++++++++++++++++++++++++-
> >>   2 files changed, 80 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst
> >> index fac2c19516..3245a5ebed 100644
> >> --- a/doc/guides/prog_guide/power_man.rst
> >> +++ b/doc/guides/prog_guide/power_man.rst
> >> @@ -221,13 +221,16 @@ power saving whenever empty poll count reaches a certain number.
> >>   The "monitor" mode is only supported in the following configurations and scenarios:
> >>
> >>   * If ``rte_cpu_get_intrinsics_support()`` function indicates that
> >> +  ``rte_power_monitor_multi()`` function is supported by the platform, then
> >> +  monitoring multiple Ethernet Rx queues for traffic will be supported.
> >> +
> >> +* If ``rte_cpu_get_intrinsics_support()`` function indicates that only
> >>     ``rte_power_monitor()`` is supported by the platform, then monitoring will be
> >>     limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be
> >>     monitored from a different lcore).
> >>
> >> -* If ``rte_cpu_get_intrinsics_support()`` function indicates that the
> >> -  ``rte_power_monitor()`` function is not supported, then monitor mode will not
> >> -  be supported.
> >> +* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of the
> >> +  two monitoring functions are supported, then monitor mode will not be supported.
> >>
> >>   * Not all Ethernet devices support monitoring, even if the underlying
> >>     platform may support the necessary CPU instructions. Please refer to
> >> diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
> >> index 7762cd39b8..aab2d4f1ee 100644
> >> --- a/lib/power/rte_power_pmd_mgmt.c
> >> +++ b/lib/power/rte_power_pmd_mgmt.c
> >> @@ -155,6 +155,24 @@ queue_list_remove(struct pmd_core_cfg *cfg, const union queue *q)
> >>        return 0;
> >>   }
> >>
> >> +static inline int
> >> +get_monitor_addresses(struct pmd_core_cfg *cfg,
> >> +             struct rte_power_monitor_cond *pmc)
> >> +{
> >> +     const struct queue_list_entry *qle;
> >> +     size_t i = 0;
> >> +     int ret;
> >> +
> >> +     TAILQ_FOREACH(qle, &cfg->head, next) {
> >> +             struct rte_power_monitor_cond *cur = &pmc[i];
> >
> > Looks like you never increment 'i' value inside that function.
> > Also it probably will be safer to add 'num' parameter to check that
> > we will never over-run pmc[] boundaries.
> 
> Will fix in v4, good catch!
> 
> >
> >> +             const union queue *q = &qle->queue;
> >> +             ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur);
> >> +             if (ret < 0)
> >> +                     return ret;
> >> +     }
> >> +     return 0;
> >> +}
> >> +
> >>   static void
> >>   calc_tsc(void)
> >>   {
> >> @@ -183,6 +201,48 @@ calc_tsc(void)
> >>        }
> >>   }
> >>
> >> +static uint16_t
> >> +clb_multiwait(uint16_t port_id, uint16_t qidx,
> >> +             struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> >> +             uint16_t max_pkts __rte_unused, void *addr __rte_unused)
> >> +{
> >> +     const unsigned int lcore = rte_lcore_id();
> >> +     const union queue q = {.portid = port_id, .qid = qidx};
> >> +     const bool empty = nb_rx == 0;
> >> +     struct pmd_core_cfg *q_conf;
> >> +
> >> +     q_conf = &lcore_cfg[lcore];
> >> +
> >> +     /* early exit */
> >> +     if (likely(!empty)) {
> >> +             q_conf->empty_poll_stats = 0;
> >> +     } else {
> >> +             /* do we care about this particular queue? */
> >> +             if (!queue_is_power_save(q_conf, &q))
> >> +                     return nb_rx;
> >
> > I still don't understand the need of 'special' power_save queue here...
> > Why we can't just have a function:
> >
> > get_number_of_queues_whose_sequential_empty_polls_less_then_threshold(struct pmd_core_cfg *lcore_cfg),
> > and then just:
> >
> > /* all queues have at least EMPTYPOLL_MAX sequential empty polls */
> > if (get_number_of_queues_whose_sequential_empty_polls_less_then_threshold(q_conf) == 0) {
> >      /* go into power-save mode here */
> > }
> 
> Okay, let's go through this step by step :)
> 
> Let's suppose we have three queues - q0, q1 and q2. We want to sleep
> whenever there's no traffic on *all of them*, however we cannot know
> that until we have checked all of them.
> 
> So, let's suppose that q0, q1 and q2 were empty all this time, but now
> some traffic arrived at q2 while we're still checking q0. We see that q0
> is empty, and all of the queues were empty for the last N polls, so we
> think we will be safe to sleep at q0 despite the fact that traffic has
> just arrived at q2.
> This is not an issue with MONITOR mode because we will be able to see if
> current Rx ring descriptor is busy or not via the NIC callback, *but
> this is not possible* with PAUSE and SCALE modes, because they don't
> have the sneaky lookahead function of MONITOR! So, with PAUSE and SCALE
> modes, it is possible to end up in a situation where you *think* you
> don't have any traffic, but you actually do, you just haven't checked
> the relevant queue yet.

I think such situation is unavoidable.
Yes, traffic can arrive to *any* queue at *any* time.
With your example above - user choose q2 as 'special' queue, but
traffic actually arrives on q0 or q1. 
And yes, if user choose PAUSE or SCALE methods he *can* miss the traffic,   
because as you said for these methods there is no notification mechanisms.
I think there are just unavoidable limitations with these power-save methods. 
 
> In order to prevent this from happening, we do not sleep on every queue,
> instead we sleep *once* per loop. 

Yes, totally agree we shouldn't sleep on *every* queue.
We need to go to sleep when there is no traffic on *any* of queues we monitor. 

> That is, we check q0, check q1, check
> q2, and only then we decide whether we want to sleep or not.

> Of course, with such scheme it is still possible to e.g. sleep in q2
> while there's traffic waiting in q0,

Yes, exactly.

> but worst case is less bad with
> this scheme, because we'll be doing at worst 1 extra sleep.

Hmm, I think it would be one extra sleep anyway.

> Whereas with what you're suggesting, if we had e.g. 10 queues to poll,
> and we checked q1 but traffic has just arrived at q0, we'll be sleeping
> at q1, then we'll be sleeping at q2, then we'll be sleeping at q3, then
> we'll be sleeping at q4, then we'll be sleeping at q5.... and 9 sleeps
> later we finally reach q0 and find out after all this time that we
> shouldn't have slept in the first place.

Ah ok, I think I understand now what you are saying.
Sure, to avoid such situation, we'll need to maintain extra counters and
update them properly when we go to sleep.   
I should state it clearly at the beginning.
It might be easier to explain what I meant by code snippet:

lcore_conf needs 2 counters:
uint64_t   nb_queues_ready_to_sleep;
uint64_t   nb_sleeps;
 
Plus each queue needs 2 counters:
uint64_t nb_empty_polls;
uint64_t nb_sleeps;

Now, at rx_callback():

/* check did sleep happen since previous call,
     if yes, then reset queue counters */
if (queue->nb_sleeps != lcore_conf->nb_sleeps) {
    queue->nb_sleeps = lcore_conf->nb_sleeps;
    queue->nb_empty_polls = 0;
}

 /* packet arrived, reset counters */
 if (nb_rx != 0) {
   /* queue is not 'ready_to_sleep' any more */
   if (queue->nb_empty_polls > EMPTYPOLL_MAX)
       lcore_conf-> nb_queues_ready_to_sleep--;
   queue->nb_empty_polls = 0;

/* empty poll */
} else {
    /* queue reaches EMPTYPOLL_MAX threshold, mark it as 'ready_to_sleep' */ 
    if (queue->nb_empty_polls == EMPTYPOLL_MAX)
       lcore_conf-> nb_queues_ready_to_sleep++;
    queue->nb_empty_polls++;
}

   /* no traffic on any queue for at least EMPTYPOLL_MAX iterations */
   if (lcore_conf-> nb_queues_ready_to_sleep == lcore_conf->n_queues) {
      /* update counters and sleep */
      lcore_conf->nb_sleeps++;
      lcore_conf-> nb_queues_ready_to_sleep = 0;
      goto_sleep();
   }
}

> Hopefully you get the point now :)
> 
> So, the idea here is, for any N queues, sleep only once, not N times.
> 
> >
> >> +
> >> +             /*
> >> +              * we can increment unconditionally here because if there were
> >> +              * non-empty polls in other queues assigned to this core, we
> >> +              * dropped the counter to zero anyway.
> >> +              */
> >> +             q_conf->empty_poll_stats++;
> >> +             if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
> >> +                     struct rte_power_monitor_cond pmc[RTE_MAX_ETHPORTS];
> >
> > I think you need here:
> > struct rte_power_monitor_cond pmc[q_conf->n_queues];
> 
> I think VLA's are generally agreed upon to be unsafe, so i'm avoiding
> them here.

Wonder why?
These days DPDK uses VLA in dozens of places...
But if you'd like to avoid VLA - you can use alloca(),
or have lcore_conf->pmc[] and realloc() it when new queue is
added/removed from the list.

> 
> >
> >
> >> +                     uint16_t ret;
> >> +
> >> +                     /* gather all monitoring conditions */
> >> +                     ret = get_monitor_addresses(q_conf, pmc);
> >> +
> >> +                     if (ret == 0)
> >> +                             rte_power_monitor_multi(pmc,
> >> +                                     q_conf->n_queues, UINT64_MAX);
> >> +             }
> >> +     }
> >> +
> >> +     return nb_rx;
> >> +}
> >> +
> >>   static uint16_t
> >>   clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
> >>                uint16_t nb_rx, uint16_t max_pkts __rte_unused,
> >> @@ -348,14 +408,19 @@ static int
> >>   check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata)
> >>   {
> >>        struct rte_power_monitor_cond dummy;
> >> +     bool multimonitor_supported;
> >>
> >>        /* check if rte_power_monitor is supported */
> >>        if (!global_data.intrinsics_support.power_monitor) {
> >>                RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n");
> >>                return -ENOTSUP;
> >>        }
> >> +     /* check if multi-monitor is supported */
> >> +     multimonitor_supported =
> >> +                     global_data.intrinsics_support.power_monitor_multi;
> >>
> >> -     if (cfg->n_queues > 0) {
> >> +     /* if we're adding a new queue, do we support multiple queues? */
> >> +     if (cfg->n_queues > 0 && !multimonitor_supported) {
> >>                RTE_LOG(DEBUG, POWER, "Monitoring multiple queues is not supported\n");
> >>                return -ENOTSUP;
> >>        }
> >> @@ -371,6 +436,13 @@ check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata)
> >>        return 0;
> >>   }
> >>
> >> +static inline rte_rx_callback_fn
> >> +get_monitor_callback(void)
> >> +{
> >> +     return global_data.intrinsics_support.power_monitor_multi ?
> >> +             clb_multiwait : clb_umwait;
> >> +}
> >> +
> >>   int
> >>   rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
> >>                uint16_t queue_id, enum rte_power_pmd_mgmt_type mode)
> >> @@ -434,7 +506,7 @@ rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
> >>                if (ret < 0)
> >>                        goto end;
> >>
> >> -             clb = clb_umwait;
> >> +             clb = get_monitor_callback();
> >>                break;
> >>        case RTE_POWER_MGMT_TYPE_SCALE:
> >>                /* check if we can add a new queue */
> >> --
> >> 2.25.1
> >
> 
> 
> --
> Thanks,
> Anatoly
  
Burakov, Anatoly June 29, 2021, 11:05 a.m. UTC | #4
On 29-Jun-21 1:07 AM, Ananyev, Konstantin wrote:

>>>> +static uint16_t
>>>> +clb_multiwait(uint16_t port_id, uint16_t qidx,
>>>> +             struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
>>>> +             uint16_t max_pkts __rte_unused, void *addr __rte_unused)
>>>> +{
>>>> +     const unsigned int lcore = rte_lcore_id();
>>>> +     const union queue q = {.portid = port_id, .qid = qidx};
>>>> +     const bool empty = nb_rx == 0;
>>>> +     struct pmd_core_cfg *q_conf;
>>>> +
>>>> +     q_conf = &lcore_cfg[lcore];
>>>> +
>>>> +     /* early exit */
>>>> +     if (likely(!empty)) {
>>>> +             q_conf->empty_poll_stats = 0;
>>>> +     } else {
>>>> +             /* do we care about this particular queue? */
>>>> +             if (!queue_is_power_save(q_conf, &q))
>>>> +                     return nb_rx;
>>>
>>> I still don't understand the need of 'special' power_save queue here...
>>> Why we can't just have a function:
>>>
>>> get_number_of_queues_whose_sequential_empty_polls_less_then_threshold(struct pmd_core_cfg *lcore_cfg),
>>> and then just:
>>>
>>> /* all queues have at least EMPTYPOLL_MAX sequential empty polls */
>>> if (get_number_of_queues_whose_sequential_empty_polls_less_then_threshold(q_conf) == 0) {
>>>       /* go into power-save mode here */
>>> }
>>
>> Okay, let's go through this step by step :)
>>
>> Let's suppose we have three queues - q0, q1 and q2. We want to sleep
>> whenever there's no traffic on *all of them*, however we cannot know
>> that until we have checked all of them.
>>
>> So, let's suppose that q0, q1 and q2 were empty all this time, but now
>> some traffic arrived at q2 while we're still checking q0. We see that q0
>> is empty, and all of the queues were empty for the last N polls, so we
>> think we will be safe to sleep at q0 despite the fact that traffic has
>> just arrived at q2.
>> This is not an issue with MONITOR mode because we will be able to see if
>> current Rx ring descriptor is busy or not via the NIC callback, *but
>> this is not possible* with PAUSE and SCALE modes, because they don't
>> have the sneaky lookahead function of MONITOR! So, with PAUSE and SCALE
>> modes, it is possible to end up in a situation where you *think* you
>> don't have any traffic, but you actually do, you just haven't checked
>> the relevant queue yet.
> 
> I think such situation is unavoidable.
> Yes, traffic can arrive to *any* queue at *any* time.
> With your example above - user choose q2 as 'special' queue, but
> traffic actually arrives on q0 or q1.
> And yes, if user choose PAUSE or SCALE methods he *can* miss the traffic,
> because as you said for these methods there is no notification mechanisms.
> I think there are just unavoidable limitations with these power-save methods.
> 
>> In order to prevent this from happening, we do not sleep on every queue,
>> instead we sleep *once* per loop.
> 
> Yes, totally agree we shouldn't sleep on *every* queue.
> We need to go to sleep when there is no traffic on *any* of queues we monitor.
> 
>> That is, we check q0, check q1, check
>> q2, and only then we decide whether we want to sleep or not.
> 
>> Of course, with such scheme it is still possible to e.g. sleep in q2
>> while there's traffic waiting in q0,
> 
> Yes, exactly.
> 
>> but worst case is less bad with
>> this scheme, because we'll be doing at worst 1 extra sleep.
> 
> Hmm, I think it would be one extra sleep anyway.
> 
>> Whereas with what you're suggesting, if we had e.g. 10 queues to poll,
>> and we checked q1 but traffic has just arrived at q0, we'll be sleeping
>> at q1, then we'll be sleeping at q2, then we'll be sleeping at q3, then
>> we'll be sleeping at q4, then we'll be sleeping at q5.... and 9 sleeps
>> later we finally reach q0 and find out after all this time that we
>> shouldn't have slept in the first place.
> 
> Ah ok, I think I understand now what you are saying.
> Sure, to avoid such situation, we'll need to maintain extra counters and
> update them properly when we go to sleep.
> I should state it clearly at the beginning.
> It might be easier to explain what I meant by code snippet:
> 
> lcore_conf needs 2 counters:
> uint64_t   nb_queues_ready_to_sleep;
> uint64_t   nb_sleeps;
> 
> Plus each queue needs 2 counters:
> uint64_t nb_empty_polls;
> uint64_t nb_sleeps;
> 
> Now, at rx_callback():
> 
> /* check did sleep happen since previous call,
>       if yes, then reset queue counters */
> if (queue->nb_sleeps != lcore_conf->nb_sleeps) {
>      queue->nb_sleeps = lcore_conf->nb_sleeps;
>      queue->nb_empty_polls = 0;
> }
> 
>   /* packet arrived, reset counters */
>   if (nb_rx != 0) {
>     /* queue is not 'ready_to_sleep' any more */
>     if (queue->nb_empty_polls > EMPTYPOLL_MAX)
>         lcore_conf-> nb_queues_ready_to_sleep--;
>     queue->nb_empty_polls = 0;
> 
> /* empty poll */
> } else {
>      /* queue reaches EMPTYPOLL_MAX threshold, mark it as 'ready_to_sleep' */
>      if (queue->nb_empty_polls == EMPTYPOLL_MAX)
>         lcore_conf-> nb_queues_ready_to_sleep++;
>      queue->nb_empty_polls++;
> }
> 
>     /* no traffic on any queue for at least EMPTYPOLL_MAX iterations */
>     if (lcore_conf-> nb_queues_ready_to_sleep == lcore_conf->n_queues) {
>        /* update counters and sleep */
>        lcore_conf->nb_sleeps++;
>        lcore_conf-> nb_queues_ready_to_sleep = 0;
>        goto_sleep();
>     }
> }

OK, this sounds like it is actually doable :) I'll prototype and see if 
it works.

> 
>> Hopefully you get the point now :)
>>
>> So, the idea here is, for any N queues, sleep only once, not N times.
>>
>>>
>>>> +
>>>> +             /*
>>>> +              * we can increment unconditionally here because if there were
>>>> +              * non-empty polls in other queues assigned to this core, we
>>>> +              * dropped the counter to zero anyway.
>>>> +              */
>>>> +             q_conf->empty_poll_stats++;
>>>> +             if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
>>>> +                     struct rte_power_monitor_cond pmc[RTE_MAX_ETHPORTS];
>>>
>>> I think you need here:
>>> struct rte_power_monitor_cond pmc[q_conf->n_queues];
>>
>> I think VLA's are generally agreed upon to be unsafe, so i'm avoiding
>> them here.
> 
> Wonder why?
> These days DPDK uses VLA in dozens of places...

Well, if that's the case, i can use it here also :) realistically the 
n_queues value will be very small, so it shouldn't be a big issue.
  
Burakov, Anatoly June 29, 2021, 11:39 a.m. UTC | #5
On 29-Jun-21 1:07 AM, Ananyev, Konstantin wrote:
> 
> 
>>>> Use the new multi-monitor intrinsic to allow monitoring multiple ethdev
>>>> Rx queues while entering the energy efficient power state. The multi
>>>> version will be used unconditionally if supported, and the UMWAIT one
>>>> will only be used when multi-monitor is not supported by the hardware.
>>>>
>>>> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
>>>> ---
>>>>    doc/guides/prog_guide/power_man.rst |  9 ++--
>>>>    lib/power/rte_power_pmd_mgmt.c      | 76 ++++++++++++++++++++++++++++-
>>>>    2 files changed, 80 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst
>>>> index fac2c19516..3245a5ebed 100644
>>>> --- a/doc/guides/prog_guide/power_man.rst
>>>> +++ b/doc/guides/prog_guide/power_man.rst
>>>> @@ -221,13 +221,16 @@ power saving whenever empty poll count reaches a certain number.
>>>>    The "monitor" mode is only supported in the following configurations and scenarios:
>>>>
>>>>    * If ``rte_cpu_get_intrinsics_support()`` function indicates that
>>>> +  ``rte_power_monitor_multi()`` function is supported by the platform, then
>>>> +  monitoring multiple Ethernet Rx queues for traffic will be supported.
>>>> +
>>>> +* If ``rte_cpu_get_intrinsics_support()`` function indicates that only
>>>>      ``rte_power_monitor()`` is supported by the platform, then monitoring will be
>>>>      limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be
>>>>      monitored from a different lcore).
>>>>
>>>> -* If ``rte_cpu_get_intrinsics_support()`` function indicates that the
>>>> -  ``rte_power_monitor()`` function is not supported, then monitor mode will not
>>>> -  be supported.
>>>> +* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of the
>>>> +  two monitoring functions are supported, then monitor mode will not be supported.
>>>>
>>>>    * Not all Ethernet devices support monitoring, even if the underlying
>>>>      platform may support the necessary CPU instructions. Please refer to
>>>> diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
>>>> index 7762cd39b8..aab2d4f1ee 100644
>>>> --- a/lib/power/rte_power_pmd_mgmt.c
>>>> +++ b/lib/power/rte_power_pmd_mgmt.c
>>>> @@ -155,6 +155,24 @@ queue_list_remove(struct pmd_core_cfg *cfg, const union queue *q)
>>>>         return 0;
>>>>    }
>>>>
>>>> +static inline int
>>>> +get_monitor_addresses(struct pmd_core_cfg *cfg,
>>>> +             struct rte_power_monitor_cond *pmc)
>>>> +{
>>>> +     const struct queue_list_entry *qle;
>>>> +     size_t i = 0;
>>>> +     int ret;
>>>> +
>>>> +     TAILQ_FOREACH(qle, &cfg->head, next) {
>>>> +             struct rte_power_monitor_cond *cur = &pmc[i];
>>>
>>> Looks like you never increment 'i' value inside that function.
>>> Also it probably will be safer to add 'num' parameter to check that
>>> we will never over-run pmc[] boundaries.
>>
>> Will fix in v4, good catch!
>>
>>>
>>>> +             const union queue *q = &qle->queue;
>>>> +             ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur);
>>>> +             if (ret < 0)
>>>> +                     return ret;
>>>> +     }
>>>> +     return 0;
>>>> +}
>>>> +
>>>>    static void
>>>>    calc_tsc(void)
>>>>    {
>>>> @@ -183,6 +201,48 @@ calc_tsc(void)
>>>>         }
>>>>    }
>>>>
>>>> +static uint16_t
>>>> +clb_multiwait(uint16_t port_id, uint16_t qidx,
>>>> +             struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
>>>> +             uint16_t max_pkts __rte_unused, void *addr __rte_unused)
>>>> +{
>>>> +     const unsigned int lcore = rte_lcore_id();
>>>> +     const union queue q = {.portid = port_id, .qid = qidx};
>>>> +     const bool empty = nb_rx == 0;
>>>> +     struct pmd_core_cfg *q_conf;
>>>> +
>>>> +     q_conf = &lcore_cfg[lcore];
>>>> +
>>>> +     /* early exit */
>>>> +     if (likely(!empty)) {
>>>> +             q_conf->empty_poll_stats = 0;
>>>> +     } else {
>>>> +             /* do we care about this particular queue? */
>>>> +             if (!queue_is_power_save(q_conf, &q))
>>>> +                     return nb_rx;
>>>
>>> I still don't understand the need of 'special' power_save queue here...
>>> Why we can't just have a function:
>>>
>>> get_number_of_queues_whose_sequential_empty_polls_less_then_threshold(struct pmd_core_cfg *lcore_cfg),
>>> and then just:
>>>
>>> /* all queues have at least EMPTYPOLL_MAX sequential empty polls */
>>> if (get_number_of_queues_whose_sequential_empty_polls_less_then_threshold(q_conf) == 0) {
>>>       /* go into power-save mode here */
>>> }
>>
>> Okay, let's go through this step by step :)
>>
>> Let's suppose we have three queues - q0, q1 and q2. We want to sleep
>> whenever there's no traffic on *all of them*, however we cannot know
>> that until we have checked all of them.
>>
>> So, let's suppose that q0, q1 and q2 were empty all this time, but now
>> some traffic arrived at q2 while we're still checking q0. We see that q0
>> is empty, and all of the queues were empty for the last N polls, so we
>> think we will be safe to sleep at q0 despite the fact that traffic has
>> just arrived at q2.
>> This is not an issue with MONITOR mode because we will be able to see if
>> current Rx ring descriptor is busy or not via the NIC callback, *but
>> this is not possible* with PAUSE and SCALE modes, because they don't
>> have the sneaky lookahead function of MONITOR! So, with PAUSE and SCALE
>> modes, it is possible to end up in a situation where you *think* you
>> don't have any traffic, but you actually do, you just haven't checked
>> the relevant queue yet.
> 
> I think such situation is unavoidable.
> Yes, traffic can arrive to *any* queue at *any* time.
> With your example above - user choose q2 as 'special' queue, but
> traffic actually arrives on q0 or q1.
> And yes, if user choose PAUSE or SCALE methods he *can* miss the traffic,
> because as you said for these methods there is no notification mechanisms.
> I think there are just unavoidable limitations with these power-save methods.
> 
>> In order to prevent this from happening, we do not sleep on every queue,
>> instead we sleep *once* per loop.
> 
> Yes, totally agree we shouldn't sleep on *every* queue.
> We need to go to sleep when there is no traffic on *any* of queues we monitor.
> 
>> That is, we check q0, check q1, check
>> q2, and only then we decide whether we want to sleep or not.
> 
>> Of course, with such scheme it is still possible to e.g. sleep in q2
>> while there's traffic waiting in q0,
> 
> Yes, exactly.
> 
>> but worst case is less bad with
>> this scheme, because we'll be doing at worst 1 extra sleep.
> 
> Hmm, I think it would be one extra sleep anyway.
> 
>> Whereas with what you're suggesting, if we had e.g. 10 queues to poll,
>> and we checked q1 but traffic has just arrived at q0, we'll be sleeping
>> at q1, then we'll be sleeping at q2, then we'll be sleeping at q3, then
>> we'll be sleeping at q4, then we'll be sleeping at q5.... and 9 sleeps
>> later we finally reach q0 and find out after all this time that we
>> shouldn't have slept in the first place.
> 
> Ah ok, I think I understand now what you are saying.
> Sure, to avoid such situation, we'll need to maintain extra counters and
> update them properly when we go to sleep.
> I should state it clearly at the beginning.
> It might be easier to explain what I meant by code snippet:
> 
> lcore_conf needs 2 counters:
> uint64_t   nb_queues_ready_to_sleep;
> uint64_t   nb_sleeps;
> 
> Plus each queue needs 2 counters:
> uint64_t nb_empty_polls;
> uint64_t nb_sleeps;
> 
> Now, at rx_callback():
> 
> /* check did sleep happen since previous call,
>       if yes, then reset queue counters */
> if (queue->nb_sleeps != lcore_conf->nb_sleeps) {
>      queue->nb_sleeps = lcore_conf->nb_sleeps;
>      queue->nb_empty_polls = 0;
> }
> 
>   /* packet arrived, reset counters */
>   if (nb_rx != 0) {
>     /* queue is not 'ready_to_sleep' any more */
>     if (queue->nb_empty_polls > EMPTYPOLL_MAX)
>         lcore_conf-> nb_queues_ready_to_sleep--;
>     queue->nb_empty_polls = 0;
> 
> /* empty poll */
> } else {
>      /* queue reaches EMPTYPOLL_MAX threshold, mark it as 'ready_to_sleep' */
>      if (queue->nb_empty_polls == EMPTYPOLL_MAX)
>         lcore_conf-> nb_queues_ready_to_sleep++;
>      queue->nb_empty_polls++;
> }
> 
>     /* no traffic on any queue for at least EMPTYPOLL_MAX iterations */
>     if (lcore_conf-> nb_queues_ready_to_sleep == lcore_conf->n_queues) {
>        /* update counters and sleep */
>        lcore_conf->nb_sleeps++;
>        lcore_conf-> nb_queues_ready_to_sleep = 0;
>        goto_sleep();
>     }
> }
> 
Actually, i don't think this is going to work, because i can see no 
(easy) way to get from lcore to specific queue. I mean, you could have 
an O(N) for loop that will loop over the list of queues every time we 
enter the callback, but i don't think that's such a good idea.
  
Ananyev, Konstantin June 29, 2021, 12:14 p.m. UTC | #6
> -----Original Message-----
> From: Burakov, Anatoly <anatoly.burakov@intel.com>
> Sent: Tuesday, June 29, 2021 12:40 PM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org; Hunt, David <david.hunt@intel.com>
> Cc: Loftus, Ciara <ciara.loftus@intel.com>
> Subject: Re: [dpdk-dev] [PATCH v3 6/7] power: support monitoring multiple Rx queues
> 
> On 29-Jun-21 1:07 AM, Ananyev, Konstantin wrote:
> >
> >
> >>>> Use the new multi-monitor intrinsic to allow monitoring multiple ethdev
> >>>> Rx queues while entering the energy efficient power state. The multi
> >>>> version will be used unconditionally if supported, and the UMWAIT one
> >>>> will only be used when multi-monitor is not supported by the hardware.
> >>>>
> >>>> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> >>>> ---
> >>>>    doc/guides/prog_guide/power_man.rst |  9 ++--
> >>>>    lib/power/rte_power_pmd_mgmt.c      | 76 ++++++++++++++++++++++++++++-
> >>>>    2 files changed, 80 insertions(+), 5 deletions(-)
> >>>>
> >>>> diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst
> >>>> index fac2c19516..3245a5ebed 100644
> >>>> --- a/doc/guides/prog_guide/power_man.rst
> >>>> +++ b/doc/guides/prog_guide/power_man.rst
> >>>> @@ -221,13 +221,16 @@ power saving whenever empty poll count reaches a certain number.
> >>>>    The "monitor" mode is only supported in the following configurations and scenarios:
> >>>>
> >>>>    * If ``rte_cpu_get_intrinsics_support()`` function indicates that
> >>>> +  ``rte_power_monitor_multi()`` function is supported by the platform, then
> >>>> +  monitoring multiple Ethernet Rx queues for traffic will be supported.
> >>>> +
> >>>> +* If ``rte_cpu_get_intrinsics_support()`` function indicates that only
> >>>>      ``rte_power_monitor()`` is supported by the platform, then monitoring will be
> >>>>      limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be
> >>>>      monitored from a different lcore).
> >>>>
> >>>> -* If ``rte_cpu_get_intrinsics_support()`` function indicates that the
> >>>> -  ``rte_power_monitor()`` function is not supported, then monitor mode will not
> >>>> -  be supported.
> >>>> +* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of the
> >>>> +  two monitoring functions are supported, then monitor mode will not be supported.
> >>>>
> >>>>    * Not all Ethernet devices support monitoring, even if the underlying
> >>>>      platform may support the necessary CPU instructions. Please refer to
> >>>> diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
> >>>> index 7762cd39b8..aab2d4f1ee 100644
> >>>> --- a/lib/power/rte_power_pmd_mgmt.c
> >>>> +++ b/lib/power/rte_power_pmd_mgmt.c
> >>>> @@ -155,6 +155,24 @@ queue_list_remove(struct pmd_core_cfg *cfg, const union queue *q)
> >>>>         return 0;
> >>>>    }
> >>>>
> >>>> +static inline int
> >>>> +get_monitor_addresses(struct pmd_core_cfg *cfg,
> >>>> +             struct rte_power_monitor_cond *pmc)
> >>>> +{
> >>>> +     const struct queue_list_entry *qle;
> >>>> +     size_t i = 0;
> >>>> +     int ret;
> >>>> +
> >>>> +     TAILQ_FOREACH(qle, &cfg->head, next) {
> >>>> +             struct rte_power_monitor_cond *cur = &pmc[i];
> >>>
> >>> Looks like you never increment 'i' value inside that function.
> >>> Also it probably will be safer to add 'num' parameter to check that
> >>> we will never over-run pmc[] boundaries.
> >>
> >> Will fix in v4, good catch!
> >>
> >>>
> >>>> +             const union queue *q = &qle->queue;
> >>>> +             ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur);
> >>>> +             if (ret < 0)
> >>>> +                     return ret;
> >>>> +     }
> >>>> +     return 0;
> >>>> +}
> >>>> +
> >>>>    static void
> >>>>    calc_tsc(void)
> >>>>    {
> >>>> @@ -183,6 +201,48 @@ calc_tsc(void)
> >>>>         }
> >>>>    }
> >>>>
> >>>> +static uint16_t
> >>>> +clb_multiwait(uint16_t port_id, uint16_t qidx,
> >>>> +             struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
> >>>> +             uint16_t max_pkts __rte_unused, void *addr __rte_unused)
> >>>> +{
> >>>> +     const unsigned int lcore = rte_lcore_id();
> >>>> +     const union queue q = {.portid = port_id, .qid = qidx};
> >>>> +     const bool empty = nb_rx == 0;
> >>>> +     struct pmd_core_cfg *q_conf;
> >>>> +
> >>>> +     q_conf = &lcore_cfg[lcore];
> >>>> +
> >>>> +     /* early exit */
> >>>> +     if (likely(!empty)) {
> >>>> +             q_conf->empty_poll_stats = 0;
> >>>> +     } else {
> >>>> +             /* do we care about this particular queue? */
> >>>> +             if (!queue_is_power_save(q_conf, &q))
> >>>> +                     return nb_rx;
> >>>
> >>> I still don't understand the need of 'special' power_save queue here...
> >>> Why we can't just have a function:
> >>>
> >>> get_number_of_queues_whose_sequential_empty_polls_less_then_threshold(struct pmd_core_cfg *lcore_cfg),
> >>> and then just:
> >>>
> >>> /* all queues have at least EMPTYPOLL_MAX sequential empty polls */
> >>> if (get_number_of_queues_whose_sequential_empty_polls_less_then_threshold(q_conf) == 0) {
> >>>       /* go into power-save mode here */
> >>> }
> >>
> >> Okay, let's go through this step by step :)
> >>
> >> Let's suppose we have three queues - q0, q1 and q2. We want to sleep
> >> whenever there's no traffic on *all of them*, however we cannot know
> >> that until we have checked all of them.
> >>
> >> So, let's suppose that q0, q1 and q2 were empty all this time, but now
> >> some traffic arrived at q2 while we're still checking q0. We see that q0
> >> is empty, and all of the queues were empty for the last N polls, so we
> >> think we will be safe to sleep at q0 despite the fact that traffic has
> >> just arrived at q2.
> >> This is not an issue with MONITOR mode because we will be able to see if
> >> current Rx ring descriptor is busy or not via the NIC callback, *but
> >> this is not possible* with PAUSE and SCALE modes, because they don't
> >> have the sneaky lookahead function of MONITOR! So, with PAUSE and SCALE
> >> modes, it is possible to end up in a situation where you *think* you
> >> don't have any traffic, but you actually do, you just haven't checked
> >> the relevant queue yet.
> >
> > I think such situation is unavoidable.
> > Yes, traffic can arrive to *any* queue at *any* time.
> > With your example above - user choose q2 as 'special' queue, but
> > traffic actually arrives on q0 or q1.
> > And yes, if user choose PAUSE or SCALE methods he *can* miss the traffic,
> > because as you said for these methods there is no notification mechanisms.
> > I think there are just unavoidable limitations with these power-save methods.
> >
> >> In order to prevent this from happening, we do not sleep on every queue,
> >> instead we sleep *once* per loop.
> >
> > Yes, totally agree we shouldn't sleep on *every* queue.
> > We need to go to sleep when there is no traffic on *any* of queues we monitor.
> >
> >> That is, we check q0, check q1, check
> >> q2, and only then we decide whether we want to sleep or not.
> >
> >> Of course, with such scheme it is still possible to e.g. sleep in q2
> >> while there's traffic waiting in q0,
> >
> > Yes, exactly.
> >
> >> but worst case is less bad with
> >> this scheme, because we'll be doing at worst 1 extra sleep.
> >
> > Hmm, I think it would be one extra sleep anyway.
> >
> >> Whereas with what you're suggesting, if we had e.g. 10 queues to poll,
> >> and we checked q1 but traffic has just arrived at q0, we'll be sleeping
> >> at q1, then we'll be sleeping at q2, then we'll be sleeping at q3, then
> >> we'll be sleeping at q4, then we'll be sleeping at q5.... and 9 sleeps
> >> later we finally reach q0 and find out after all this time that we
> >> shouldn't have slept in the first place.
> >
> > Ah ok, I think I understand now what you are saying.
> > Sure, to avoid such situation, we'll need to maintain extra counters and
> > update them properly when we go to sleep.
> > I should state it clearly at the beginning.
> > It might be easier to explain what I meant by code snippet:
> >
> > lcore_conf needs 2 counters:
> > uint64_t   nb_queues_ready_to_sleep;
> > uint64_t   nb_sleeps;
> >
> > Plus each queue needs 2 counters:
> > uint64_t nb_empty_polls;
> > uint64_t nb_sleeps;
> >
> > Now, at rx_callback():
> >
> > /* check did sleep happen since previous call,
> >       if yes, then reset queue counters */
> > if (queue->nb_sleeps != lcore_conf->nb_sleeps) {
> >      queue->nb_sleeps = lcore_conf->nb_sleeps;
> >      queue->nb_empty_polls = 0;
> > }
> >
> >   /* packet arrived, reset counters */
> >   if (nb_rx != 0) {
> >     /* queue is not 'ready_to_sleep' any more */
> >     if (queue->nb_empty_polls > EMPTYPOLL_MAX)
> >         lcore_conf-> nb_queues_ready_to_sleep--;
> >     queue->nb_empty_polls = 0;
> >
> > /* empty poll */
> > } else {
> >      /* queue reaches EMPTYPOLL_MAX threshold, mark it as 'ready_to_sleep' */
> >      if (queue->nb_empty_polls == EMPTYPOLL_MAX)
> >         lcore_conf-> nb_queues_ready_to_sleep++;
> >      queue->nb_empty_polls++;
> > }
> >
> >     /* no traffic on any queue for at least EMPTYPOLL_MAX iterations */
> >     if (lcore_conf-> nb_queues_ready_to_sleep == lcore_conf->n_queues) {
> >        /* update counters and sleep */
> >        lcore_conf->nb_sleeps++;
> >        lcore_conf-> nb_queues_ready_to_sleep = 0;
> >        goto_sleep();
> >     }
> > }
> >
> Actually, i don't think this is going to work, because i can see no
> (easy) way to get from lcore to specific queue. I mean, you could have
> an O(N) for loop that will loop over the list of queues every time we
> enter the callback, but i don't think that's such a good idea.

I think something like that will work:

struct queue_list_entry {
        TAILQ_ENTRY(queue_list_entry) next;
        union queue queue;
+     /* pointer to the lcore that queue is managed by */
+      struct pmd_core_cfg *lcore_cfg;
+     /* queue RX callback */
+      const struct rte_eth_rxtx_callback *cur_cb;
};

At rte_power_ethdev_pmgmt_queue_enable():

+      struct queue_list_entry *qle;
...
-        ret = queue_list_add(queue_cfg, &qdata);
+       qle = queue_list_add(queue_cfg, &qdata);
...
-       queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id, clb, NULL);
+      qle->cur_cb = rte_eth_add_rx_callback(port_id, queue_id, clb, qdata);

At actual clb_xxx(uint16_t port_id, uint16_t qidx,
                struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
                uint16_t max_pkts __rte_unused, void *addr __rte_unused)
{
   ...
   struct queue_list_entry *qle = addr;
   struct pmd_core_cfg *lcore_cfg = qle->lcore_conf;
   ....
}
  
Burakov, Anatoly June 29, 2021, 1:23 p.m. UTC | #7
On 29-Jun-21 1:14 PM, Ananyev, Konstantin wrote:
> 
> 
>> -----Original Message-----
>> From: Burakov, Anatoly <anatoly.burakov@intel.com>
>> Sent: Tuesday, June 29, 2021 12:40 PM
>> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org; Hunt, David <david.hunt@intel.com>
>> Cc: Loftus, Ciara <ciara.loftus@intel.com>
>> Subject: Re: [dpdk-dev] [PATCH v3 6/7] power: support monitoring multiple Rx queues
>>
>> On 29-Jun-21 1:07 AM, Ananyev, Konstantin wrote:
>>>
>>>
>>>>>> Use the new multi-monitor intrinsic to allow monitoring multiple ethdev
>>>>>> Rx queues while entering the energy efficient power state. The multi
>>>>>> version will be used unconditionally if supported, and the UMWAIT one
>>>>>> will only be used when multi-monitor is not supported by the hardware.
>>>>>>
>>>>>> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
>>>>>> ---
>>>>>>     doc/guides/prog_guide/power_man.rst |  9 ++--
>>>>>>     lib/power/rte_power_pmd_mgmt.c      | 76 ++++++++++++++++++++++++++++-
>>>>>>     2 files changed, 80 insertions(+), 5 deletions(-)
>>>>>>
>>>>>> diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst
>>>>>> index fac2c19516..3245a5ebed 100644
>>>>>> --- a/doc/guides/prog_guide/power_man.rst
>>>>>> +++ b/doc/guides/prog_guide/power_man.rst
>>>>>> @@ -221,13 +221,16 @@ power saving whenever empty poll count reaches a certain number.
>>>>>>     The "monitor" mode is only supported in the following configurations and scenarios:
>>>>>>
>>>>>>     * If ``rte_cpu_get_intrinsics_support()`` function indicates that
>>>>>> +  ``rte_power_monitor_multi()`` function is supported by the platform, then
>>>>>> +  monitoring multiple Ethernet Rx queues for traffic will be supported.
>>>>>> +
>>>>>> +* If ``rte_cpu_get_intrinsics_support()`` function indicates that only
>>>>>>       ``rte_power_monitor()`` is supported by the platform, then monitoring will be
>>>>>>       limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be
>>>>>>       monitored from a different lcore).
>>>>>>
>>>>>> -* If ``rte_cpu_get_intrinsics_support()`` function indicates that the
>>>>>> -  ``rte_power_monitor()`` function is not supported, then monitor mode will not
>>>>>> -  be supported.
>>>>>> +* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of the
>>>>>> +  two monitoring functions are supported, then monitor mode will not be supported.
>>>>>>
>>>>>>     * Not all Ethernet devices support monitoring, even if the underlying
>>>>>>       platform may support the necessary CPU instructions. Please refer to
>>>>>> diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
>>>>>> index 7762cd39b8..aab2d4f1ee 100644
>>>>>> --- a/lib/power/rte_power_pmd_mgmt.c
>>>>>> +++ b/lib/power/rte_power_pmd_mgmt.c
>>>>>> @@ -155,6 +155,24 @@ queue_list_remove(struct pmd_core_cfg *cfg, const union queue *q)
>>>>>>          return 0;
>>>>>>     }
>>>>>>
>>>>>> +static inline int
>>>>>> +get_monitor_addresses(struct pmd_core_cfg *cfg,
>>>>>> +             struct rte_power_monitor_cond *pmc)
>>>>>> +{
>>>>>> +     const struct queue_list_entry *qle;
>>>>>> +     size_t i = 0;
>>>>>> +     int ret;
>>>>>> +
>>>>>> +     TAILQ_FOREACH(qle, &cfg->head, next) {
>>>>>> +             struct rte_power_monitor_cond *cur = &pmc[i];
>>>>>
>>>>> Looks like you never increment 'i' value inside that function.
>>>>> Also it probably will be safer to add 'num' parameter to check that
>>>>> we will never over-run pmc[] boundaries.
>>>>
>>>> Will fix in v4, good catch!
>>>>
>>>>>
>>>>>> +             const union queue *q = &qle->queue;
>>>>>> +             ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur);
>>>>>> +             if (ret < 0)
>>>>>> +                     return ret;
>>>>>> +     }
>>>>>> +     return 0;
>>>>>> +}
>>>>>> +
>>>>>>     static void
>>>>>>     calc_tsc(void)
>>>>>>     {
>>>>>> @@ -183,6 +201,48 @@ calc_tsc(void)
>>>>>>          }
>>>>>>     }
>>>>>>
>>>>>> +static uint16_t
>>>>>> +clb_multiwait(uint16_t port_id, uint16_t qidx,
>>>>>> +             struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
>>>>>> +             uint16_t max_pkts __rte_unused, void *addr __rte_unused)
>>>>>> +{
>>>>>> +     const unsigned int lcore = rte_lcore_id();
>>>>>> +     const union queue q = {.portid = port_id, .qid = qidx};
>>>>>> +     const bool empty = nb_rx == 0;
>>>>>> +     struct pmd_core_cfg *q_conf;
>>>>>> +
>>>>>> +     q_conf = &lcore_cfg[lcore];
>>>>>> +
>>>>>> +     /* early exit */
>>>>>> +     if (likely(!empty)) {
>>>>>> +             q_conf->empty_poll_stats = 0;
>>>>>> +     } else {
>>>>>> +             /* do we care about this particular queue? */
>>>>>> +             if (!queue_is_power_save(q_conf, &q))
>>>>>> +                     return nb_rx;
>>>>>
>>>>> I still don't understand the need of 'special' power_save queue here...
>>>>> Why we can't just have a function:
>>>>>
>>>>> get_number_of_queues_whose_sequential_empty_polls_less_then_threshold(struct pmd_core_cfg *lcore_cfg),
>>>>> and then just:
>>>>>
>>>>> /* all queues have at least EMPTYPOLL_MAX sequential empty polls */
>>>>> if (get_number_of_queues_whose_sequential_empty_polls_less_then_threshold(q_conf) == 0) {
>>>>>        /* go into power-save mode here */
>>>>> }
>>>>
>>>> Okay, let's go through this step by step :)
>>>>
>>>> Let's suppose we have three queues - q0, q1 and q2. We want to sleep
>>>> whenever there's no traffic on *all of them*, however we cannot know
>>>> that until we have checked all of them.
>>>>
>>>> So, let's suppose that q0, q1 and q2 were empty all this time, but now
>>>> some traffic arrived at q2 while we're still checking q0. We see that q0
>>>> is empty, and all of the queues were empty for the last N polls, so we
>>>> think we will be safe to sleep at q0 despite the fact that traffic has
>>>> just arrived at q2.
>>>> This is not an issue with MONITOR mode because we will be able to see if
>>>> current Rx ring descriptor is busy or not via the NIC callback, *but
>>>> this is not possible* with PAUSE and SCALE modes, because they don't
>>>> have the sneaky lookahead function of MONITOR! So, with PAUSE and SCALE
>>>> modes, it is possible to end up in a situation where you *think* you
>>>> don't have any traffic, but you actually do, you just haven't checked
>>>> the relevant queue yet.
>>>
>>> I think such situation is unavoidable.
>>> Yes, traffic can arrive to *any* queue at *any* time.
>>> With your example above - user choose q2 as 'special' queue, but
>>> traffic actually arrives on q0 or q1.
>>> And yes, if user choose PAUSE or SCALE methods he *can* miss the traffic,
>>> because as you said for these methods there is no notification mechanisms.
>>> I think there are just unavoidable limitations with these power-save methods.
>>>
>>>> In order to prevent this from happening, we do not sleep on every queue,
>>>> instead we sleep *once* per loop.
>>>
>>> Yes, totally agree we shouldn't sleep on *every* queue.
>>> We need to go to sleep when there is no traffic on *any* of queues we monitor.
>>>
>>>> That is, we check q0, check q1, check
>>>> q2, and only then we decide whether we want to sleep or not.
>>>
>>>> Of course, with such scheme it is still possible to e.g. sleep in q2
>>>> while there's traffic waiting in q0,
>>>
>>> Yes, exactly.
>>>
>>>> but worst case is less bad with
>>>> this scheme, because we'll be doing at worst 1 extra sleep.
>>>
>>> Hmm, I think it would be one extra sleep anyway.
>>>
>>>> Whereas with what you're suggesting, if we had e.g. 10 queues to poll,
>>>> and we checked q1 but traffic has just arrived at q0, we'll be sleeping
>>>> at q1, then we'll be sleeping at q2, then we'll be sleeping at q3, then
>>>> we'll be sleeping at q4, then we'll be sleeping at q5.... and 9 sleeps
>>>> later we finally reach q0 and find out after all this time that we
>>>> shouldn't have slept in the first place.
>>>
>>> Ah ok, I think I understand now what you are saying.
>>> Sure, to avoid such situation, we'll need to maintain extra counters and
>>> update them properly when we go to sleep.
>>> I should state it clearly at the beginning.
>>> It might be easier to explain what I meant by code snippet:
>>>
>>> lcore_conf needs 2 counters:
>>> uint64_t   nb_queues_ready_to_sleep;
>>> uint64_t   nb_sleeps;
>>>
>>> Plus each queue needs 2 counters:
>>> uint64_t nb_empty_polls;
>>> uint64_t nb_sleeps;
>>>
>>> Now, at rx_callback():
>>>
>>> /* check did sleep happen since previous call,
>>>        if yes, then reset queue counters */
>>> if (queue->nb_sleeps != lcore_conf->nb_sleeps) {
>>>       queue->nb_sleeps = lcore_conf->nb_sleeps;
>>>       queue->nb_empty_polls = 0;
>>> }
>>>
>>>    /* packet arrived, reset counters */
>>>    if (nb_rx != 0) {
>>>      /* queue is not 'ready_to_sleep' any more */
>>>      if (queue->nb_empty_polls > EMPTYPOLL_MAX)
>>>          lcore_conf-> nb_queues_ready_to_sleep--;
>>>      queue->nb_empty_polls = 0;
>>>
>>> /* empty poll */
>>> } else {
>>>       /* queue reaches EMPTYPOLL_MAX threshold, mark it as 'ready_to_sleep' */
>>>       if (queue->nb_empty_polls == EMPTYPOLL_MAX)
>>>          lcore_conf-> nb_queues_ready_to_sleep++;
>>>       queue->nb_empty_polls++;
>>> }
>>>
>>>      /* no traffic on any queue for at least EMPTYPOLL_MAX iterations */
>>>      if (lcore_conf-> nb_queues_ready_to_sleep == lcore_conf->n_queues) {
>>>         /* update counters and sleep */
>>>         lcore_conf->nb_sleeps++;
>>>         lcore_conf-> nb_queues_ready_to_sleep = 0;
>>>         goto_sleep();
>>>      }
>>> }
>>>
>> Actually, i don't think this is going to work, because i can see no
>> (easy) way to get from lcore to specific queue. I mean, you could have
>> an O(N) for loop that will loop over the list of queues every time we
>> enter the callback, but i don't think that's such a good idea.
> 
> I think something like that will work:
> 
> struct queue_list_entry {
>          TAILQ_ENTRY(queue_list_entry) next;
>          union queue queue;
> +     /* pointer to the lcore that queue is managed by */
> +      struct pmd_core_cfg *lcore_cfg;
> +     /* queue RX callback */
> +      const struct rte_eth_rxtx_callback *cur_cb;
> };
> 
> At rte_power_ethdev_pmgmt_queue_enable():
> 
> +      struct queue_list_entry *qle;
> ...
> -        ret = queue_list_add(queue_cfg, &qdata);
> +       qle = queue_list_add(queue_cfg, &qdata);
> ...
> -       queue_cfg->cur_cb = rte_eth_add_rx_callback(port_id, queue_id, clb, NULL);
> +      qle->cur_cb = rte_eth_add_rx_callback(port_id, queue_id, clb, qdata);
> 
> At actual clb_xxx(uint16_t port_id, uint16_t qidx,
>                  struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
>                  uint16_t max_pkts __rte_unused, void *addr __rte_unused)
> {
>     ...
>     struct queue_list_entry *qle = addr;
>     struct pmd_core_cfg *lcore_cfg = qle->lcore_conf;
>     ....
> }
> 
> 

Hm, that's actually a clever solution :) Thanks!
  

Patch

diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst
index fac2c19516..3245a5ebed 100644
--- a/doc/guides/prog_guide/power_man.rst
+++ b/doc/guides/prog_guide/power_man.rst
@@ -221,13 +221,16 @@  power saving whenever empty poll count reaches a certain number.
 The "monitor" mode is only supported in the following configurations and scenarios:
 
 * If ``rte_cpu_get_intrinsics_support()`` function indicates that
+  ``rte_power_monitor_multi()`` function is supported by the platform, then
+  monitoring multiple Ethernet Rx queues for traffic will be supported.
+
+* If ``rte_cpu_get_intrinsics_support()`` function indicates that only
   ``rte_power_monitor()`` is supported by the platform, then monitoring will be
   limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be
   monitored from a different lcore).
 
-* If ``rte_cpu_get_intrinsics_support()`` function indicates that the
-  ``rte_power_monitor()`` function is not supported, then monitor mode will not
-  be supported.
+* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of the
+  two monitoring functions are supported, then monitor mode will not be supported.
 
 * Not all Ethernet devices support monitoring, even if the underlying
   platform may support the necessary CPU instructions. Please refer to
diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
index 7762cd39b8..aab2d4f1ee 100644
--- a/lib/power/rte_power_pmd_mgmt.c
+++ b/lib/power/rte_power_pmd_mgmt.c
@@ -155,6 +155,24 @@  queue_list_remove(struct pmd_core_cfg *cfg, const union queue *q)
 	return 0;
 }
 
+static inline int
+get_monitor_addresses(struct pmd_core_cfg *cfg,
+		struct rte_power_monitor_cond *pmc)
+{
+	const struct queue_list_entry *qle;
+	size_t i = 0;
+	int ret;
+
+	TAILQ_FOREACH(qle, &cfg->head, next) {
+		struct rte_power_monitor_cond *cur = &pmc[i];
+		const union queue *q = &qle->queue;
+		ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur);
+		if (ret < 0)
+			return ret;
+	}
+	return 0;
+}
+
 static void
 calc_tsc(void)
 {
@@ -183,6 +201,48 @@  calc_tsc(void)
 	}
 }
 
+static uint16_t
+clb_multiwait(uint16_t port_id, uint16_t qidx,
+		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+		uint16_t max_pkts __rte_unused, void *addr __rte_unused)
+{
+	const unsigned int lcore = rte_lcore_id();
+	const union queue q = {.portid = port_id, .qid = qidx};
+	const bool empty = nb_rx == 0;
+	struct pmd_core_cfg *q_conf;
+
+	q_conf = &lcore_cfg[lcore];
+
+	/* early exit */
+	if (likely(!empty)) {
+		q_conf->empty_poll_stats = 0;
+	} else {
+		/* do we care about this particular queue? */
+		if (!queue_is_power_save(q_conf, &q))
+			return nb_rx;
+
+		/*
+		 * we can increment unconditionally here because if there were
+		 * non-empty polls in other queues assigned to this core, we
+		 * dropped the counter to zero anyway.
+		 */
+		q_conf->empty_poll_stats++;
+		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
+			struct rte_power_monitor_cond pmc[RTE_MAX_ETHPORTS];
+			uint16_t ret;
+
+			/* gather all monitoring conditions */
+			ret = get_monitor_addresses(q_conf, pmc);
+
+			if (ret == 0)
+				rte_power_monitor_multi(pmc,
+					q_conf->n_queues, UINT64_MAX);
+		}
+	}
+
+	return nb_rx;
+}
+
 static uint16_t
 clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused,
 		uint16_t nb_rx, uint16_t max_pkts __rte_unused,
@@ -348,14 +408,19 @@  static int
 check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata)
 {
 	struct rte_power_monitor_cond dummy;
+	bool multimonitor_supported;
 
 	/* check if rte_power_monitor is supported */
 	if (!global_data.intrinsics_support.power_monitor) {
 		RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n");
 		return -ENOTSUP;
 	}
+	/* check if multi-monitor is supported */
+	multimonitor_supported =
+			global_data.intrinsics_support.power_monitor_multi;
 
-	if (cfg->n_queues > 0) {
+	/* if we're adding a new queue, do we support multiple queues? */
+	if (cfg->n_queues > 0 && !multimonitor_supported) {
 		RTE_LOG(DEBUG, POWER, "Monitoring multiple queues is not supported\n");
 		return -ENOTSUP;
 	}
@@ -371,6 +436,13 @@  check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata)
 	return 0;
 }
 
+static inline rte_rx_callback_fn
+get_monitor_callback(void)
+{
+	return global_data.intrinsics_support.power_monitor_multi ?
+		clb_multiwait : clb_umwait;
+}
+
 int
 rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
 		uint16_t queue_id, enum rte_power_pmd_mgmt_type mode)
@@ -434,7 +506,7 @@  rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint16_t port_id,
 		if (ret < 0)
 			goto end;
 
-		clb = clb_umwait;
+		clb = get_monitor_callback();
 		break;
 	case RTE_POWER_MGMT_TYPE_SCALE:
 		/* check if we can add a new queue */