[v2,3/7] eal: add power monitor for multiple events

Message ID 6c48562add728df4e2f22b3a41170d624c63e233.1624629506.git.anatoly.burakov@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series: Enhancements for PMD power management

Checks

Context        Check    Description
ci/checkpatch  warning  coding style issues

Commit Message

Anatoly Burakov June 25, 2021, 2 p.m. UTC
  Use RTM and WAITPKG instructions to perform a wait-for-writes similar to
what UMWAIT does, but without the limitation of having to listen for
just one event. This works because the optimized power state used by the
TPAUSE instruction will cause a wake up on RTM transaction abort, so if
we add the addresses we're interested in to the read-set, any write to
those addresses will wake us up.

Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---

Notes:
    v2:
    - Adapt to callback mechanism

 doc/guides/rel_notes/release_21_08.rst        |  2 +
 lib/eal/arm/rte_power_intrinsics.c            | 11 +++
 lib/eal/include/generic/rte_cpuflags.h        |  2 +
 .../include/generic/rte_power_intrinsics.h    | 35 ++++++++++
 lib/eal/ppc/rte_power_intrinsics.c            | 11 +++
 lib/eal/version.map                           |  3 +
 lib/eal/x86/rte_cpuflags.c                    |  2 +
 lib/eal/x86/rte_power_intrinsics.c            | 69 +++++++++++++++++++
 8 files changed, 135 insertions(+)
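
In rough pseudo-C, the mechanism described in the commit message boils down to
the following. This is a condensed sketch only, not the patch itself:
__get_umwait_val() is the internal helper from the x86 implementation shown
below, and all support/error checks are omitted.

	/* open an RTM transaction region */
	if (rte_xbegin() == RTE_XBEGIN_STARTED) {
		uint32_t i;

		/* touching each address adds it to the transaction read-set */
		for (i = 0; i < num; i++)
			(void)__get_umwait_val(pmc[i].addr, pmc[i].size);

		/* TPAUSE: a write to any read-set address aborts and wakes us */
		rte_power_pause(tsc_timestamp);

		rte_xend();
	}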
  

Comments

Ananyev, Konstantin June 28, 2021, 12:37 p.m. UTC | #1
> Use RTM and WAITPKG instructions to perform a wait-for-writes similar to
> what UMWAIT does, but without the limitation of having to listen for
> just one event. This works because the optimized power state used by the
> TPAUSE instruction will cause a wake up on RTM transaction abort, so if
> we add the addresses we're interested in to the read-set, any write to
> those addresses will wake us up.
> 
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> ---
> 
> Notes:
>     v2:
>     - Adapt to callback mechanism
> 
>  doc/guides/rel_notes/release_21_08.rst        |  2 +
>  lib/eal/arm/rte_power_intrinsics.c            | 11 +++
>  lib/eal/include/generic/rte_cpuflags.h        |  2 +
>  .../include/generic/rte_power_intrinsics.h    | 35 ++++++++++
>  lib/eal/ppc/rte_power_intrinsics.c            | 11 +++
>  lib/eal/version.map                           |  3 +
>  lib/eal/x86/rte_cpuflags.c                    |  2 +
>  lib/eal/x86/rte_power_intrinsics.c            | 69 +++++++++++++++++++
>  8 files changed, 135 insertions(+)
> 
...

> diff --git a/lib/eal/x86/rte_power_intrinsics.c b/lib/eal/x86/rte_power_intrinsics.c
> index 3c5c9ce7ad..3fc6f62ef5 100644
> --- a/lib/eal/x86/rte_power_intrinsics.c
> +++ b/lib/eal/x86/rte_power_intrinsics.c
> @@ -4,6 +4,7 @@
> 
>  #include <rte_common.h>
>  #include <rte_lcore.h>
> +#include <rte_rtm.h>
>  #include <rte_spinlock.h>
> 
>  #include "rte_power_intrinsics.h"
> @@ -28,6 +29,7 @@ __umwait_wakeup(volatile void *addr)
>  }
> 
>  static bool wait_supported;
> +static bool wait_multi_supported;
> 
>  static inline uint64_t
>  __get_umwait_val(const volatile void *p, const uint8_t sz)
> @@ -164,6 +166,8 @@ RTE_INIT(rte_power_intrinsics_init) {
> 
>  	if (i.power_monitor && i.power_pause)
>  		wait_supported = 1;
> +	if (i.power_monitor_multi)
> +		wait_multi_supported = 1;
>  }
> 
>  int
> @@ -202,6 +206,9 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
>  	 * In this case, since we've already woken up, the "wakeup" was
>  	 * unneeded, and since T1 is still waiting on T2 releasing the lock, the
>  	 * wakeup address is still valid so it's perfectly safe to write it.
> +	 *
> +	 * For multi-monitor case, the act of locking will in itself trigger the
> +	 * wakeup, so no additional writes necessary.
>  	 */
>  	rte_spinlock_lock(&s->lock);
>  	if (s->monitor_addr != NULL)
> @@ -210,3 +217,65 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
> 
>  	return 0;
>  }
> +
> +int
> +rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
> +		const uint32_t num, const uint64_t tsc_timestamp)
> +{
> +	const unsigned int lcore_id = rte_lcore_id();
> +	struct power_wait_status *s = &wait_status[lcore_id];
> +	uint32_t i, rc;
> +
> +	/* check if supported */
> +	if (!wait_multi_supported)
> +		return -ENOTSUP;
> +
> +	if (pmc == NULL || num == 0)
> +		return -EINVAL;
> +
> +	/* we are already inside transaction region, return */
> +	if (rte_xtest() != 0)
> +		return 0;
> +
> +	/* start new transaction region */
> +	rc = rte_xbegin();
> +
> +	/* transaction abort, possible write to one of wait addresses */
> +	if (rc != RTE_XBEGIN_STARTED)
> +		return 0;
> +
> +	/*
> +	 * the mere act of reading the lock status here adds the lock to
> +	 * the read set. This means that when we trigger a wakeup from another
> +	 * thread, even if we don't have a defined wakeup address and thus don't
> +	 * actually cause any writes, the act of locking our lock will itself
> +	 * trigger the wakeup and abort the transaction.
> +	 */
> +	rte_spinlock_is_locked(&s->lock);
> +
> +	/*
> +	 * add all addresses to wait on into transaction read-set and check if
> +	 * any of wakeup conditions are already met.
> +	 */
> +	for (i = 0; i < num; i++) {
> +		const struct rte_power_monitor_cond *c = &pmc[i];
> +
> +		if (pmc->fn == NULL)

Should be c->fn, I believe.

> +			continue;

Actually that way, if c->fn == NULL, we'll never add our c->addr to monitored addresses.
Is that what we really want?
My thought was that if the callback is not set, we'll just go to power-save state without extra checking, no?
Something like that:

const struct rte_power_monitor_cond *c = &pmc[i];
const uint64_t val = __get_umwait_val(c->addr, c->size);

if (c->fn && c->fn(val, c->opaque) != 0)
   break;

Same thought for rte_power_monitor().

> +		const uint64_t val = __get_umwait_val(pmc->addr, pmc->size);

Same thing: s/pmc->/c->/

> +
> +		/* abort if callback indicates that we need to stop */
> +		if (c->fn(val, c->opaque) != 0)
> +			break;
> +	}
> +
> +	/* none of the conditions were met, sleep until timeout */
> +	if (i == num)
> +		rte_power_pause(tsc_timestamp);
> +
> +	/* end transaction region */
> +	rte_xend();
> +
> +	return 0;
> +}
> --
> 2.25.1
  
Anatoly Burakov June 28, 2021, 12:43 p.m. UTC | #2
On 28-Jun-21 1:37 PM, Ananyev, Konstantin wrote:
> 
>> Use RTM and WAITPKG instructions to perform a wait-for-writes similar to
>> what UMWAIT does, but without the limitation of having to listen for
>> just one event. This works because the optimized power state used by the
>> TPAUSE instruction will cause a wake up on RTM transaction abort, so if
>> we add the addresses we're interested in to the read-set, any write to
>> those addresses will wake us up.
>>
>> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
>> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
>> ---
>>
>> Notes:
>>      v2:
>>      - Adapt to callback mechanism
>>
>>   doc/guides/rel_notes/release_21_08.rst        |  2 +
>>   lib/eal/arm/rte_power_intrinsics.c            | 11 +++
>>   lib/eal/include/generic/rte_cpuflags.h        |  2 +
>>   .../include/generic/rte_power_intrinsics.h    | 35 ++++++++++
>>   lib/eal/ppc/rte_power_intrinsics.c            | 11 +++
>>   lib/eal/version.map                           |  3 +
>>   lib/eal/x86/rte_cpuflags.c                    |  2 +
>>   lib/eal/x86/rte_power_intrinsics.c            | 69 +++++++++++++++++++
>>   8 files changed, 135 insertions(+)
>>
> ...
> 
>> diff --git a/lib/eal/x86/rte_power_intrinsics.c b/lib/eal/x86/rte_power_intrinsics.c
>> index 3c5c9ce7ad..3fc6f62ef5 100644
>> --- a/lib/eal/x86/rte_power_intrinsics.c
>> +++ b/lib/eal/x86/rte_power_intrinsics.c
>> @@ -4,6 +4,7 @@
>>
>>   #include <rte_common.h>
>>   #include <rte_lcore.h>
>> +#include <rte_rtm.h>
>>   #include <rte_spinlock.h>
>>
>>   #include "rte_power_intrinsics.h"
>> @@ -28,6 +29,7 @@ __umwait_wakeup(volatile void *addr)
>>   }
>>
>>   static bool wait_supported;
>> +static bool wait_multi_supported;
>>
>>   static inline uint64_t
>>   __get_umwait_val(const volatile void *p, const uint8_t sz)
>> @@ -164,6 +166,8 @@ RTE_INIT(rte_power_intrinsics_init) {
>>
>>        if (i.power_monitor && i.power_pause)
>>                wait_supported = 1;
>> +     if (i.power_monitor_multi)
>> +             wait_multi_supported = 1;
>>   }
>>
>>   int
>> @@ -202,6 +206,9 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
>>         * In this case, since we've already woken up, the "wakeup" was
>>         * unneeded, and since T1 is still waiting on T2 releasing the lock, the
>>         * wakeup address is still valid so it's perfectly safe to write it.
>> +      *
>> +      * For multi-monitor case, the act of locking will in itself trigger the
>> +      * wakeup, so no additional writes necessary.
>>         */
>>        rte_spinlock_lock(&s->lock);
>>        if (s->monitor_addr != NULL)
>> @@ -210,3 +217,65 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
>>
>>        return 0;
>>   }
>> +
>> +int
>> +rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
>> +             const uint32_t num, const uint64_t tsc_timestamp)
>> +{
>> +     const unsigned int lcore_id = rte_lcore_id();
>> +     struct power_wait_status *s = &wait_status[lcore_id];
>> +     uint32_t i, rc;
>> +
>> +     /* check if supported */
>> +     if (!wait_multi_supported)
>> +             return -ENOTSUP;
>> +
>> +     if (pmc == NULL || num == 0)
>> +             return -EINVAL;
>> +
>> +     /* we are already inside transaction region, return */
>> +     if (rte_xtest() != 0)
>> +             return 0;
>> +
>> +     /* start new transaction region */
>> +     rc = rte_xbegin();
>> +
>> +     /* transaction abort, possible write to one of wait addresses */
>> +     if (rc != RTE_XBEGIN_STARTED)
>> +             return 0;
>> +
>> +     /*
>> +      * the mere act of reading the lock status here adds the lock to
>> +      * the read set. This means that when we trigger a wakeup from another
>> +      * thread, even if we don't have a defined wakeup address and thus don't
>> +      * actually cause any writes, the act of locking our lock will itself
>> +      * trigger the wakeup and abort the transaction.
>> +      */
>> +     rte_spinlock_is_locked(&s->lock);
>> +
>> +     /*
>> +      * add all addresses to wait on into transaction read-set and check if
>> +      * any of wakeup conditions are already met.
>> +      */
>> +     for (i = 0; i < num; i++) {
>> +             const struct rte_power_monitor_cond *c = &pmc[i];
>> +
>> +             if (pmc->fn == NULL)
> 
> Should be c->fn, I believe.

Yep, will fix.

> 
>> +                     continue;
> 
> Actually that way, if c->fn == NULL, we'll never add  our c->addr to monitored addresses.
> Is that what we really want?
> My thought was, that if callback is not set, we'll just go to power-save state without extra checking, no?
> Something like that:
> 
> const struct rte_power_monitor_cond *c = &pmc[i];
> const uint64_t val = __get_umwait_val(c->addr, c->size);
> 
> if (c->fn && c->fn(val, c->opaque) != 0)
>     break;

This is consistent with the previous behavior of rte_power_monitor, where if
the mask wasn't set we entered power-save mode without any checks. If we do
a break, that means the check condition has failed somewhere and we have
to abort the sleep. Continue keeps the sleep.
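
Put differently, with the pmc->/c-> typo fixed, the control flow being
discussed looks like this (annotation only, behavior unchanged from the patch):

	for (i = 0; i < num; i++) {
		const struct rte_power_monitor_cond *c = &pmc[i];

		if (c->fn == NULL)
			continue; /* no check to run; i can still reach num, so we still sleep */

		const uint64_t val = __get_umwait_val(c->addr, c->size);

		if (c->fn(val, c->opaque) != 0)
			break; /* condition met; i stays < num and the sleep is skipped */
	}

	/* sleep only if no callback asked us to stop */
	if (i == num)
		rte_power_pause(tsc_timestamp);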

> 
> Same thought for rte_power_monitor().
> 
>> +             const uint64_t val = __get_umwait_val(pmc->addr, pmc->size);
> 
> Same thing: s/pmc->/c->/

Yep, you're right.

> 
>> +
>> +             /* abort if callback indicates that we need to stop */
>> +             if (c->fn(val, c->opaque) != 0)
>> +                     break;
>> +     }
>> +
>> +     /* none of the conditions were met, sleep until timeout */
>> +     if (i == num)
>> +             rte_power_pause(tsc_timestamp);
>> +
>> +     /* end transaction region */
>> +     rte_xend();
>> +
>> +     return 0;
>> +}
>> --
>> 2.25.1
>
  
Ananyev, Konstantin June 28, 2021, 12:58 p.m. UTC | #3
> On 28-Jun-21 1:37 PM, Ananyev, Konstantin wrote:
> >
> >> Use RTM and WAITPKG instructions to perform a wait-for-writes similar to
> >> what UMWAIT does, but without the limitation of having to listen for
> >> just one event. This works because the optimized power state used by the
> >> TPAUSE instruction will cause a wake up on RTM transaction abort, so if
> >> we add the addresses we're interested in to the read-set, any write to
> >> those addresses will wake us up.
> >>
> >> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> >> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> >> ---
> >>
> >> Notes:
> >>      v2:
> >>      - Adapt to callback mechanism
> >>
> >>   doc/guides/rel_notes/release_21_08.rst        |  2 +
> >>   lib/eal/arm/rte_power_intrinsics.c            | 11 +++
> >>   lib/eal/include/generic/rte_cpuflags.h        |  2 +
> >>   .../include/generic/rte_power_intrinsics.h    | 35 ++++++++++
> >>   lib/eal/ppc/rte_power_intrinsics.c            | 11 +++
> >>   lib/eal/version.map                           |  3 +
> >>   lib/eal/x86/rte_cpuflags.c                    |  2 +
> >>   lib/eal/x86/rte_power_intrinsics.c            | 69 +++++++++++++++++++
> >>   8 files changed, 135 insertions(+)
> >>
> > ...
> >
> >> diff --git a/lib/eal/x86/rte_power_intrinsics.c b/lib/eal/x86/rte_power_intrinsics.c
> >> index 3c5c9ce7ad..3fc6f62ef5 100644
> >> --- a/lib/eal/x86/rte_power_intrinsics.c
> >> +++ b/lib/eal/x86/rte_power_intrinsics.c
> >> @@ -4,6 +4,7 @@
> >>
> >>   #include <rte_common.h>
> >>   #include <rte_lcore.h>
> >> +#include <rte_rtm.h>
> >>   #include <rte_spinlock.h>
> >>
> >>   #include "rte_power_intrinsics.h"
> >> @@ -28,6 +29,7 @@ __umwait_wakeup(volatile void *addr)
> >>   }
> >>
> >>   static bool wait_supported;
> >> +static bool wait_multi_supported;
> >>
> >>   static inline uint64_t
> >>   __get_umwait_val(const volatile void *p, const uint8_t sz)
> >> @@ -164,6 +166,8 @@ RTE_INIT(rte_power_intrinsics_init) {
> >>
> >>        if (i.power_monitor && i.power_pause)
> >>                wait_supported = 1;
> >> +     if (i.power_monitor_multi)
> >> +             wait_multi_supported = 1;
> >>   }
> >>
> >>   int
> >> @@ -202,6 +206,9 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
> >>         * In this case, since we've already woken up, the "wakeup" was
> >>         * unneeded, and since T1 is still waiting on T2 releasing the lock, the
> >>         * wakeup address is still valid so it's perfectly safe to write it.
> >> +      *
> >> +      * For multi-monitor case, the act of locking will in itself trigger the
> >> +      * wakeup, so no additional writes necessary.
> >>         */
> >>        rte_spinlock_lock(&s->lock);
> >>        if (s->monitor_addr != NULL)
> >> @@ -210,3 +217,65 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
> >>
> >>        return 0;
> >>   }
> >> +
> >> +int
> >> +rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
> >> +             const uint32_t num, const uint64_t tsc_timestamp)
> >> +{
> >> +     const unsigned int lcore_id = rte_lcore_id();
> >> +     struct power_wait_status *s = &wait_status[lcore_id];
> >> +     uint32_t i, rc;
> >> +
> >> +     /* check if supported */
> >> +     if (!wait_multi_supported)
> >> +             return -ENOTSUP;
> >> +
> >> +     if (pmc == NULL || num == 0)
> >> +             return -EINVAL;
> >> +
> >> +     /* we are already inside transaction region, return */
> >> +     if (rte_xtest() != 0)
> >> +             return 0;
> >> +
> >> +     /* start new transaction region */
> >> +     rc = rte_xbegin();
> >> +
> >> +     /* transaction abort, possible write to one of wait addresses */
> >> +     if (rc != RTE_XBEGIN_STARTED)
> >> +             return 0;
> >> +
> >> +     /*
> >> +      * the mere act of reading the lock status here adds the lock to
> >> +      * the read set. This means that when we trigger a wakeup from another
> >> +      * thread, even if we don't have a defined wakeup address and thus don't
> >> +      * actually cause any writes, the act of locking our lock will itself
> >> +      * trigger the wakeup and abort the transaction.
> >> +      */
> >> +     rte_spinlock_is_locked(&s->lock);
> >> +
> >> +     /*
> >> +      * add all addresses to wait on into transaction read-set and check if
> >> +      * any of wakeup conditions are already met.
> >> +      */
> >> +     for (i = 0; i < num; i++) {
> >> +             const struct rte_power_monitor_cond *c = &pmc[i];
> >> +
> >> +             if (pmc->fn == NULL)
> >
> > Should be c->fn, I believe.
> 
> Yep, will fix.
> 
> >
> >> +                     continue;
> >
> > Actually that way, if c->fn == NULL, we'll never add  our c->addr to monitored addresses.
> > Is that what we really want?
> > My thought was, that if callback is not set, we'll just go to power-save state without extra checking, no?
> > Something like that:
> >
> > const struct rte_power_monitor_cond *c = &pmc[i];
> > const uint64_t val = __get_umwait_val(c->addr, c->size);
> >
> > if (c->fn && c->fn(val, c->opaque) != 0)
> >     break;
> 
> This is consistent with previous behavior of rte_power_monitor where if
> mask wasn't set we entered power save mode without any checks. If we do
> a break, that means the check condition has failed somewhere and we have
> to abort the sleep. Continue keeps the sleep.

Ok, so what is the current intention?
If pmc->fn == NULL what does it mean:
1) pmc->addr shouldn't be monitored at all?
2) pmc->addr should be monitored unconditionally
3) pmc->fn should never be NULL and monitor should return an error
4) something else?

For me 1) looks really strange: if the user doesn't want to sleep on that address,
he can just not add this addr to pmc[].

2) is probably ok... but is that really needed?
The user can just provide a NOP callback and it would be the same.

3) seems like the most sane choice to me.
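
For reference, the NOP callback mentioned in 2) would look roughly like this
(an illustrative sketch; the exact callback signature is defined by the earlier
patch in this series and may differ):

	static int
	nop_monitor_clb(const uint64_t val, const uint64_t opaque)
	{
		RTE_SET_USED(val);
		RTE_SET_USED(opaque);
		return 0; /* never abort: always proceed to the power-save state */
	}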

> 
> >
> > Same thought for rte_power_monitor().
> >
> >> +             const uint64_t val = __get_umwait_val(pmc->addr, pmc->size);
> >
> > Same thing: s/pmc->/c->/
> 
> Yep, you're right.
> 
> >
> >> +
> >> +             /* abort if callback indicates that we need to stop */
> >> +             if (c->fn(val, c->opaque) != 0)
> >> +                     break;
> >> +     }
> >> +
> >> +     /* none of the conditions were met, sleep until timeout */
> >> +     if (i == num)
> >> +             rte_power_pause(tsc_timestamp);
> >> +
> >> +     /* end transaction region */
> >> +     rte_xend();
> >> +
> >> +     return 0;
> >> +}
> >> --
> >> 2.25.1
> >
> 
> 
> --
> Thanks,
> Anatoly
  
Anatoly Burakov June 28, 2021, 1:29 p.m. UTC | #4
On 28-Jun-21 1:58 PM, Ananyev, Konstantin wrote:
> 
>> On 28-Jun-21 1:37 PM, Ananyev, Konstantin wrote:
>>>
>>>> Use RTM and WAITPKG instructions to perform a wait-for-writes similar to
>>>> what UMWAIT does, but without the limitation of having to listen for
>>>> just one event. This works because the optimized power state used by the
>>>> TPAUSE instruction will cause a wake up on RTM transaction abort, so if
>>>> we add the addresses we're interested in to the read-set, any write to
>>>> those addresses will wake us up.
>>>>
>>>> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
>>>> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
>>>> ---
>>>>
>>>> Notes:
>>>>       v2:
>>>>       - Adapt to callback mechanism
>>>>
>>>>    doc/guides/rel_notes/release_21_08.rst        |  2 +
>>>>    lib/eal/arm/rte_power_intrinsics.c            | 11 +++
>>>>    lib/eal/include/generic/rte_cpuflags.h        |  2 +
>>>>    .../include/generic/rte_power_intrinsics.h    | 35 ++++++++++
>>>>    lib/eal/ppc/rte_power_intrinsics.c            | 11 +++
>>>>    lib/eal/version.map                           |  3 +
>>>>    lib/eal/x86/rte_cpuflags.c                    |  2 +
>>>>    lib/eal/x86/rte_power_intrinsics.c            | 69 +++++++++++++++++++
>>>>    8 files changed, 135 insertions(+)
>>>>
>>> ...
>>>
>>>> diff --git a/lib/eal/x86/rte_power_intrinsics.c b/lib/eal/x86/rte_power_intrinsics.c
>>>> index 3c5c9ce7ad..3fc6f62ef5 100644
>>>> --- a/lib/eal/x86/rte_power_intrinsics.c
>>>> +++ b/lib/eal/x86/rte_power_intrinsics.c
>>>> @@ -4,6 +4,7 @@
>>>>
>>>>    #include <rte_common.h>
>>>>    #include <rte_lcore.h>
>>>> +#include <rte_rtm.h>
>>>>    #include <rte_spinlock.h>
>>>>
>>>>    #include "rte_power_intrinsics.h"
>>>> @@ -28,6 +29,7 @@ __umwait_wakeup(volatile void *addr)
>>>>    }
>>>>
>>>>    static bool wait_supported;
>>>> +static bool wait_multi_supported;
>>>>
>>>>    static inline uint64_t
>>>>    __get_umwait_val(const volatile void *p, const uint8_t sz)
>>>> @@ -164,6 +166,8 @@ RTE_INIT(rte_power_intrinsics_init) {
>>>>
>>>>         if (i.power_monitor && i.power_pause)
>>>>                 wait_supported = 1;
>>>> +     if (i.power_monitor_multi)
>>>> +             wait_multi_supported = 1;
>>>>    }
>>>>
>>>>    int
>>>> @@ -202,6 +206,9 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
>>>>          * In this case, since we've already woken up, the "wakeup" was
>>>>          * unneeded, and since T1 is still waiting on T2 releasing the lock, the
>>>>          * wakeup address is still valid so it's perfectly safe to write it.
>>>> +      *
>>>> +      * For multi-monitor case, the act of locking will in itself trigger the
>>>> +      * wakeup, so no additional writes necessary.
>>>>          */
>>>>         rte_spinlock_lock(&s->lock);
>>>>         if (s->monitor_addr != NULL)
>>>> @@ -210,3 +217,65 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
>>>>
>>>>         return 0;
>>>>    }
>>>> +
>>>> +int
>>>> +rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
>>>> +             const uint32_t num, const uint64_t tsc_timestamp)
>>>> +{
>>>> +     const unsigned int lcore_id = rte_lcore_id();
>>>> +     struct power_wait_status *s = &wait_status[lcore_id];
>>>> +     uint32_t i, rc;
>>>> +
>>>> +     /* check if supported */
>>>> +     if (!wait_multi_supported)
>>>> +             return -ENOTSUP;
>>>> +
>>>> +     if (pmc == NULL || num == 0)
>>>> +             return -EINVAL;
>>>> +
>>>> +     /* we are already inside transaction region, return */
>>>> +     if (rte_xtest() != 0)
>>>> +             return 0;
>>>> +
>>>> +     /* start new transaction region */
>>>> +     rc = rte_xbegin();
>>>> +
>>>> +     /* transaction abort, possible write to one of wait addresses */
>>>> +     if (rc != RTE_XBEGIN_STARTED)
>>>> +             return 0;
>>>> +
>>>> +     /*
>>>> +      * the mere act of reading the lock status here adds the lock to
>>>> +      * the read set. This means that when we trigger a wakeup from another
>>>> +      * thread, even if we don't have a defined wakeup address and thus don't
>>>> +      * actually cause any writes, the act of locking our lock will itself
>>>> +      * trigger the wakeup and abort the transaction.
>>>> +      */
>>>> +     rte_spinlock_is_locked(&s->lock);
>>>> +
>>>> +     /*
>>>> +      * add all addresses to wait on into transaction read-set and check if
>>>> +      * any of wakeup conditions are already met.
>>>> +      */
>>>> +     for (i = 0; i < num; i++) {
>>>> +             const struct rte_power_monitor_cond *c = &pmc[i];
>>>> +
>>>> +             if (pmc->fn == NULL)
>>>
>>> Should be c->fn, I believe.
>>
>> Yep, will fix.
>>
>>>
>>>> +                     continue;
>>>
>>> Actually that way, if c->fn == NULL, we'll never add  our c->addr to monitored addresses.
>>> Is that what we really want?
>>> My thought was, that if callback is not set, we'll just go to power-save state without extra checking, no?
>>> Something like that:
>>>
>>> const struct rte_power_monitor_cond *c = &pmc[i];
>>> const uint64_t val = __get_umwait_val(c->addr, c->size);
>>>
>>> if (c->fn && c->fn(val, c->opaque) != 0)
>>>      break;
>>
>> This is consistent with previous behavior of rte_power_monitor where if
>> mask wasn't set we entered power save mode without any checks. If we do
>> a break, that means the check condition has failed somewhere and we have
>> to abort the sleep. Continue keeps the sleep.
> 
> Ok, so what is current intention?
> If pmc->fn == NULL what does it mean:
> 1) pmc->addr shouldn't be monitored at all?
> 2) pmc->addr should be monitored unconditionally
> 3) pmc->fn should never be NULL and monitor should return an error
> 4) something else?
> 
> For me 1) looks really strange, if user doesn't want to sleep on that address,
> he can just not add this addr to pmc[].

Ah, I see what you mean now. While we can skip the comparison, we still
want to read the address. So, I would guess 2) should be true.

> 
> 2) is probably ok... but is that really needed?
> User can just provide NOP as a callback and it would be the same.
> 
> 3) seems like the most sane choice to me.

In that case, we have to do the same in the singular power monitor
function: fn set to NULL should return an error. Will fix in v4 (v3
went out before I noticed your comments :/ ).
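
For reference, the kind of check being agreed on here is a simple validation
pass before the transaction is opened. A sketch only, not the actual v4 change:

	for (i = 0; i < num; i++) {
		/* option 3) above: treat a missing callback as an error */
		if (pmc[i].fn == NULL)
			return -EINVAL;
	}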
  

Patch

diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst
index c84ac280f5..9d1cfac395 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -55,6 +55,8 @@  New Features
      Also, make sure to start the actual text at the margin.
      =======================================================
 
+* eal: added ``rte_power_monitor_multi`` to support waiting for multiple events.
+
 
 Removed Items
 -------------
diff --git a/lib/eal/arm/rte_power_intrinsics.c b/lib/eal/arm/rte_power_intrinsics.c
index e83f04072a..78f55b7203 100644
--- a/lib/eal/arm/rte_power_intrinsics.c
+++ b/lib/eal/arm/rte_power_intrinsics.c
@@ -38,3 +38,14 @@  rte_power_monitor_wakeup(const unsigned int lcore_id)
 
 	return -ENOTSUP;
 }
+
+int
+rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+		const uint32_t num, const uint64_t tsc_timestamp)
+{
+	RTE_SET_USED(pmc);
+	RTE_SET_USED(num);
+	RTE_SET_USED(tsc_timestamp);
+
+	return -ENOTSUP;
+}
diff --git a/lib/eal/include/generic/rte_cpuflags.h b/lib/eal/include/generic/rte_cpuflags.h
index 28a5aecde8..d35551e931 100644
--- a/lib/eal/include/generic/rte_cpuflags.h
+++ b/lib/eal/include/generic/rte_cpuflags.h
@@ -24,6 +24,8 @@  struct rte_cpu_intrinsics {
 	/**< indicates support for rte_power_monitor function */
 	uint32_t power_pause : 1;
 	/**< indicates support for rte_power_pause function */
+	uint32_t power_monitor_multi : 1;
+	/**< indicates support for rte_power_monitor_multi function */
 };
 
 /**
diff --git a/lib/eal/include/generic/rte_power_intrinsics.h b/lib/eal/include/generic/rte_power_intrinsics.h
index 046667ade6..877fb282cb 100644
--- a/lib/eal/include/generic/rte_power_intrinsics.h
+++ b/lib/eal/include/generic/rte_power_intrinsics.h
@@ -124,4 +124,39 @@  int rte_power_monitor_wakeup(const unsigned int lcore_id);
 __rte_experimental
 int rte_power_pause(const uint64_t tsc_timestamp);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Monitor a set of addresses for changes. This will cause the CPU to enter an
+ * architecture-defined optimized power state until either one of the specified
+ * memory addresses is written to, a certain TSC timestamp is reached, or other
+ * reasons cause the CPU to wake up.
+ *
+ * Additionally, `expected` 64-bit values and 64-bit masks are provided. If
+ * mask is non-zero, the current value pointed to by the `p` pointer will be
+ * checked against the expected value, and if they do not match, the entering of
+ * optimized power state may be aborted.
+ *
+ * @warning It is responsibility of the user to check if this function is
+ *   supported at runtime using `rte_cpu_get_intrinsics_support()` API call.
+ *   Failing to do so may result in an illegal CPU instruction error.
+ *
+ * @param pmc
+ *   An array of monitoring condition structures.
+ * @param num
+ *   Length of the `pmc` array.
+ * @param tsc_timestamp
+ *   Maximum TSC timestamp to wait for. Note that the wait behavior is
+ *   architecture-dependent.
+ *
+ * @return
+ *   0 on success
+ *   -EINVAL on invalid parameters
+ *   -ENOTSUP if unsupported
+ */
+__rte_experimental
+int rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+		const uint32_t num, const uint64_t tsc_timestamp);
+
 #endif /* _RTE_POWER_INTRINSIC_H_ */
diff --git a/lib/eal/ppc/rte_power_intrinsics.c b/lib/eal/ppc/rte_power_intrinsics.c
index 7fc9586da7..f00b58ade5 100644
--- a/lib/eal/ppc/rte_power_intrinsics.c
+++ b/lib/eal/ppc/rte_power_intrinsics.c
@@ -38,3 +38,14 @@  rte_power_monitor_wakeup(const unsigned int lcore_id)
 
 	return -ENOTSUP;
 }
+
+int
+rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+		const uint32_t num, const uint64_t tsc_timestamp)
+{
+	RTE_SET_USED(pmc);
+	RTE_SET_USED(num);
+	RTE_SET_USED(tsc_timestamp);
+
+	return -ENOTSUP;
+}
diff --git a/lib/eal/version.map b/lib/eal/version.map
index fe5c3dac98..4ccd5475d6 100644
--- a/lib/eal/version.map
+++ b/lib/eal/version.map
@@ -423,6 +423,9 @@  EXPERIMENTAL {
 	rte_version_release; # WINDOWS_NO_EXPORT
 	rte_version_suffix; # WINDOWS_NO_EXPORT
 	rte_version_year; # WINDOWS_NO_EXPORT
+
+	# added in 21.08
+	rte_power_monitor_multi; # WINDOWS_NO_EXPORT
 };
 
 INTERNAL {
diff --git a/lib/eal/x86/rte_cpuflags.c b/lib/eal/x86/rte_cpuflags.c
index a96312ff7f..d339734a8c 100644
--- a/lib/eal/x86/rte_cpuflags.c
+++ b/lib/eal/x86/rte_cpuflags.c
@@ -189,5 +189,7 @@  rte_cpu_get_intrinsics_support(struct rte_cpu_intrinsics *intrinsics)
 	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_WAITPKG)) {
 		intrinsics->power_monitor = 1;
 		intrinsics->power_pause = 1;
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RTM))
+			intrinsics->power_monitor_multi = 1;
 	}
 }
diff --git a/lib/eal/x86/rte_power_intrinsics.c b/lib/eal/x86/rte_power_intrinsics.c
index 3c5c9ce7ad..3fc6f62ef5 100644
--- a/lib/eal/x86/rte_power_intrinsics.c
+++ b/lib/eal/x86/rte_power_intrinsics.c
@@ -4,6 +4,7 @@ 
 
 #include <rte_common.h>
 #include <rte_lcore.h>
+#include <rte_rtm.h>
 #include <rte_spinlock.h>
 
 #include "rte_power_intrinsics.h"
@@ -28,6 +29,7 @@  __umwait_wakeup(volatile void *addr)
 }
 
 static bool wait_supported;
+static bool wait_multi_supported;
 
 static inline uint64_t
 __get_umwait_val(const volatile void *p, const uint8_t sz)
@@ -164,6 +166,8 @@  RTE_INIT(rte_power_intrinsics_init) {
 
 	if (i.power_monitor && i.power_pause)
 		wait_supported = 1;
+	if (i.power_monitor_multi)
+		wait_multi_supported = 1;
 }
 
 int
@@ -202,6 +206,9 @@  rte_power_monitor_wakeup(const unsigned int lcore_id)
 	 * In this case, since we've already woken up, the "wakeup" was
 	 * unneeded, and since T1 is still waiting on T2 releasing the lock, the
 	 * wakeup address is still valid so it's perfectly safe to write it.
+	 *
+	 * For multi-monitor case, the act of locking will in itself trigger the
+	 * wakeup, so no additional writes necessary.
 	 */
 	rte_spinlock_lock(&s->lock);
 	if (s->monitor_addr != NULL)
@@ -210,3 +217,65 @@  rte_power_monitor_wakeup(const unsigned int lcore_id)
 
 	return 0;
 }
+
+int
+rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+		const uint32_t num, const uint64_t tsc_timestamp)
+{
+	const unsigned int lcore_id = rte_lcore_id();
+	struct power_wait_status *s = &wait_status[lcore_id];
+	uint32_t i, rc;
+
+	/* check if supported */
+	if (!wait_multi_supported)
+		return -ENOTSUP;
+
+	if (pmc == NULL || num == 0)
+		return -EINVAL;
+
+	/* we are already inside transaction region, return */
+	if (rte_xtest() != 0)
+		return 0;
+
+	/* start new transaction region */
+	rc = rte_xbegin();
+
+	/* transaction abort, possible write to one of wait addresses */
+	if (rc != RTE_XBEGIN_STARTED)
+		return 0;
+
+	/*
+	 * the mere act of reading the lock status here adds the lock to
+	 * the read set. This means that when we trigger a wakeup from another
+	 * thread, even if we don't have a defined wakeup address and thus don't
+	 * actually cause any writes, the act of locking our lock will itself
+	 * trigger the wakeup and abort the transaction.
+	 */
+	rte_spinlock_is_locked(&s->lock);
+
+	/*
+	 * add all addresses to wait on into transaction read-set and check if
+	 * any of wakeup conditions are already met.
+	 */
+	for (i = 0; i < num; i++) {
+		const struct rte_power_monitor_cond *c = &pmc[i];
+
+		if (pmc->fn == NULL)
+			continue;
+
+		const uint64_t val = __get_umwait_val(pmc->addr, pmc->size);
+
+		/* abort if callback indicates that we need to stop */
+		if (c->fn(val, c->opaque) != 0)
+			break;
+	}
+
+	/* none of the conditions were met, sleep until timeout */
+	if (i == num)
+		rte_power_pause(tsc_timestamp);
+
+	/* end transaction region */
+	rte_xend();
+
+	return 0;
+}
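
To illustrate how a caller might use the new API, a hypothetical sketch
follows. The my_rxq structure and queue_has_work() are made-up application
names, and the callback signature and opaque handling shown here are
assumptions based on the discussion above; the authoritative definitions come
from patch 1/7 of this series.

	#include <rte_common.h>
	#include <rte_cycles.h>
	#include <rte_power_intrinsics.h>

	/* hypothetical application structure with a device-written tail pointer */
	struct my_rxq {
		volatile uint32_t tail;
		uint32_t head;
	};

	/* assumed callback shape: non-zero return aborts entering power-save state */
	static int
	queue_has_work(const uint64_t val, const uint64_t opaque)
	{
		return val != opaque;
	}

	static void
	wait_on_two_queues(struct my_rxq *q0, struct my_rxq *q1, uint64_t timeout_cycles)
	{
		struct rte_power_monitor_cond pmc[2] = {
			{ .addr = &q0->tail, .size = sizeof(q0->tail),
			  .fn = queue_has_work, .opaque = q0->head },
			{ .addr = &q1->tail, .size = sizeof(q1->tail),
			  .fn = queue_has_work, .opaque = q1->head },
		};

		/*
		 * The caller is expected to have checked power_monitor_multi support
		 * via rte_cpu_get_intrinsics_support() beforehand. Wake up on a write
		 * to either tail, on an already-met condition, or at the TSC deadline.
		 */
		rte_power_monitor_multi(pmc, RTE_DIM(pmc),
				rte_get_tsc_cycles() + timeout_cycles);
	}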