[v13,05/11] eal: add monitor wakeup function

Message ID 0265c6b23e59f089e7f03a48e8904ae410639868.1610127362.git.anatoly.burakov@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series: Add PMD power management

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Anatoly Burakov Jan. 8, 2021, 5:42 p.m. UTC
  Now that we have everything in a C file, we can store the information
about our sleep, and have a native mechanism to wake up the sleeping
core. This mechanism would however only wake up a core that's sleeping
while monitoring - waking up from `rte_power_pause` won't work.

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---

Notes:
    v13:
    - Add comments around wakeup code to explain what it does
    - Add lcore_id parameter checking to prevent buffer overrun

 .../arm/include/rte_power_intrinsics.h        |  9 ++
 .../include/generic/rte_power_intrinsics.h    | 16 ++++
 .../ppc/include/rte_power_intrinsics.h        |  9 ++
 lib/librte_eal/version.map                    |  1 +
 lib/librte_eal/x86/rte_power_intrinsics.c     | 85 +++++++++++++++++++
 5 files changed, 120 insertions(+)
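
For context, the intended pairing of the two calls looks roughly like the
sketch below. This is illustrative only and is not part of the patch: the
monitor-condition fields (.addr/.val/.mask/.size) come from the earlier
patches in this series and their exact names may differ, and rx_status,
worker_idle() and kick_worker() are made-up names.

#include <stdint.h>
#include <rte_power_intrinsics.h>

/* stand-in for whatever word the application monitors, e.g. a descriptor
 * status word that the NIC writes when a packet arrives */
static volatile uint64_t rx_status;

/* worker lcore: enter a C0.2 sleep until rx_status is written, the value
 * already signals work, or the TSC deadline expires */
static void
worker_idle(uint64_t tsc_deadline)
{
	struct rte_power_monitor_cond pmc = {
		.addr = &rx_status,
		.val = 1,	/* "work available" value: skip the sleep if seen */
		.mask = 1,	/* compare only the lowest bit */
		.size = sizeof(uint64_t),
	};

	rte_power_monitor(&pmc, tsc_deadline);
}

/* control thread: kick a specific worker lcore out of the monitor sleep,
 * e.g. before stopping or reconfiguring the queue it polls */
static void
kick_worker(unsigned int worker_lcore_id)
{
	rte_power_monitor_wakeup(worker_lcore_id);
}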
  

Comments

Ananyev, Konstantin Jan. 12, 2021, 4:02 p.m. UTC | #1
> 
> Now that we have everything in a C file, we can store the information
> about our sleep, and have a native mechanism to wake up the sleeping
> core. This mechanism would however only wake up a core that's sleeping
> while monitoring - waking up from `rte_power_pause` won't work.
> 
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> ---
> 
> Notes:
>     v13:
>     - Add comments around wakeup code to explain what it does
>     - Add lcore_id parameter checking to prevent buffer overrun
> 
>  .../arm/include/rte_power_intrinsics.h        |  9 ++
>  .../include/generic/rte_power_intrinsics.h    | 16 ++++
>  .../ppc/include/rte_power_intrinsics.h        |  9 ++
>  lib/librte_eal/version.map                    |  1 +
>  lib/librte_eal/x86/rte_power_intrinsics.c     | 85 +++++++++++++++++++
>  5 files changed, 120 insertions(+)
> 
> diff --git a/lib/librte_eal/arm/include/rte_power_intrinsics.h b/lib/librte_eal/arm/include/rte_power_intrinsics.h
> index 27869251a8..39e49cc45b 100644
> --- a/lib/librte_eal/arm/include/rte_power_intrinsics.h
> +++ b/lib/librte_eal/arm/include/rte_power_intrinsics.h
> @@ -33,6 +33,15 @@ rte_power_pause(const uint64_t tsc_timestamp)
>  	RTE_SET_USED(tsc_timestamp);
>  }
> 
> +/**
> + * This function is not supported on ARM.
> + */
> +void
> +rte_power_monitor_wakeup(const unsigned int lcore_id)
> +{
> +	RTE_SET_USED(lcore_id);
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/include/generic/rte_power_intrinsics.h b/lib/librte_eal/include/generic/rte_power_intrinsics.h
> index a6f1955996..e311d6f8ea 100644
> --- a/lib/librte_eal/include/generic/rte_power_intrinsics.h
> +++ b/lib/librte_eal/include/generic/rte_power_intrinsics.h
> @@ -57,6 +57,22 @@ __rte_experimental
>  void rte_power_monitor(const struct rte_power_monitor_cond *pmc,
>  		const uint64_t tsc_timestamp);
> 
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Wake up a specific lcore that is in a power optimized state and is monitoring
> + * an address.
> + *
> + * @note This function will *not* wake up a core that is in a power optimized
> + *   state due to calling `rte_power_pause`.
> + *
> + * @param lcore_id
> + *   Lcore ID of a sleeping thread.
> + */
> +__rte_experimental
> +void rte_power_monitor_wakeup(const unsigned int lcore_id);
> +
>  /**
>   * @warning
>   * @b EXPERIMENTAL: this API may change without prior notice
> diff --git a/lib/librte_eal/ppc/include/rte_power_intrinsics.h b/lib/librte_eal/ppc/include/rte_power_intrinsics.h
> index 248d1f4a23..2e7db0e7eb 100644
> --- a/lib/librte_eal/ppc/include/rte_power_intrinsics.h
> +++ b/lib/librte_eal/ppc/include/rte_power_intrinsics.h
> @@ -33,6 +33,15 @@ rte_power_pause(const uint64_t tsc_timestamp)
>  	RTE_SET_USED(tsc_timestamp);
>  }
> 
> +/**
> + * This function is not supported on PPC64.
> + */
> +void
> +rte_power_monitor_wakeup(const unsigned int lcore_id)
> +{
> +	RTE_SET_USED(lcore_id);
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/version.map b/lib/librte_eal/version.map
> index 20945b1efa..ac026e289d 100644
> --- a/lib/librte_eal/version.map
> +++ b/lib/librte_eal/version.map
> @@ -406,6 +406,7 @@ EXPERIMENTAL {
> 
>  	# added in 21.02
>  	rte_power_monitor;
> +	rte_power_monitor_wakeup;
>  	rte_power_pause;
>  };
> 
> diff --git a/lib/librte_eal/x86/rte_power_intrinsics.c b/lib/librte_eal/x86/rte_power_intrinsics.c
> index a9cd1afe9d..46a4fb6cd5 100644
> --- a/lib/librte_eal/x86/rte_power_intrinsics.c
> +++ b/lib/librte_eal/x86/rte_power_intrinsics.c
> @@ -2,8 +2,31 @@
>   * Copyright(c) 2020 Intel Corporation
>   */
> 
> +#include <rte_common.h>
> +#include <rte_lcore.h>
> +#include <rte_spinlock.h>
> +
>  #include "rte_power_intrinsics.h"
> 
> +/*
> + * Per-lcore structure holding current status of C0.2 sleeps.
> + */
> +static struct power_wait_status {
> +	rte_spinlock_t lock;
> +	volatile void *monitor_addr; /**< NULL if not currently sleeping */
> +} __rte_cache_aligned wait_status[RTE_MAX_LCORE];
> +
> +static inline void
> +__umwait_wakeup(volatile void *addr)
> +{
> +	uint64_t val;
> +
> +	/* trigger a write but don't change the value */
> +	val = __atomic_load_n((volatile uint64_t *)addr, __ATOMIC_RELAXED);
> +	__atomic_compare_exchange_n((volatile uint64_t *)addr, &val, val, 0,
> +			__ATOMIC_RELAXED, __ATOMIC_RELAXED);
> +}
> +
>  static uint8_t wait_supported;
> 
>  static inline uint64_t
> @@ -36,6 +59,12 @@ rte_power_monitor(const struct rte_power_monitor_cond *pmc,
>  {
>  	const uint32_t tsc_l = (uint32_t)tsc_timestamp;
>  	const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
> +	const unsigned int lcore_id = rte_lcore_id();
> +	struct power_wait_status *s;
> +
> +	/* prevent non-EAL thread from using this API */
> +	if (lcore_id >= RTE_MAX_LCORE)
> +		return;
> 
>  	/* prevent user from running this instruction if it's not supported */
>  	if (!wait_supported)
> @@ -60,11 +89,24 @@ rte_power_monitor(const struct rte_power_monitor_cond *pmc,
>  		if (masked == pmc->val)
>  			return;
>  	}
> +
> +	s = &wait_status[lcore_id];
> +
> +	/* update sleep address */
> +	rte_spinlock_lock(&s->lock);
> +	s->monitor_addr = pmc->addr;
> +	rte_spinlock_unlock(&s->lock);

It was a while, since I looked at it last time,
but shouldn't we grab the lock before monitor()?
I.E:
lock();
monitor();
addr=...;
unlock();
umwait();

> +
>  	/* execute UMWAIT */
>  	asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
>  			: /* ignore rflags */
>  			: "D"(0), /* enter C0.2 */
>  			  "a"(tsc_l), "d"(tsc_h));
> +
> +	/* erase sleep address */
> +	rte_spinlock_lock(&s->lock);
> +	s->monitor_addr = NULL;
> +	rte_spinlock_unlock(&s->lock);
>  }
> 
>  /**
> @@ -97,3 +139,46 @@ RTE_INIT(rte_power_intrinsics_init) {
>  	if (i.power_monitor && i.power_pause)
>  		wait_supported = 1;
>  }
> +
> +void
> +rte_power_monitor_wakeup(const unsigned int lcore_id)
> +{
> +	struct power_wait_status *s;
> +
> +	/* prevent buffer overrun */
> +	if (lcore_id >= RTE_MAX_LCORE)
> +		return;
> +
> +	/* prevent user from running this instruction if it's not supported */
> +	if (!wait_supported)
> +		return;
> +
> +	s = &wait_status[lcore_id];
> +
> +	/*
> +	 * There is a race condition between sleep, wakeup and locking, but we
> +	 * don't need to handle it.
> +	 *
> +	 * Possible situations:
> +	 *
> +	 * 1. T1 locks, sets address, unlocks
> +	 * 2. T2 locks, triggers wakeup, unlocks
> +	 * 3. T1 sleeps
> +	 *
> +	 * In this case, because T1 has already set the address for monitoring,
> +	 * we will wake up immediately even if T2 triggers wakeup before T1
> +	 * goes to sleep.
> +	 *
> +	 * 1. T1 locks, sets address, unlocks, goes to sleep, and wakes up
> +	 * 2. T2 locks, triggers wakeup, and unlocks
> +	 * 3. T1 locks, erases address, and unlocks
> +	 *
> +	 * In this case, since we've already woken up, the "wakeup" was
> +	 * unneeded, and since T1 is still waiting on T2 releasing the lock, the
> +	 * wakeup address is still valid so it's perfectly safe to write it.
> +	 */
> +	rte_spinlock_lock(&s->lock);
> +	if (s->monitor_addr != NULL)
> +		__umwait_wakeup(s->monitor_addr);
> +	rte_spinlock_unlock(&s->lock);
> +}
> --
> 2.25.1
  
Anatoly Burakov Jan. 12, 2021, 4:18 p.m. UTC | #2
On 12-Jan-21 4:02 PM, Ananyev, Konstantin wrote:
> 
>> [...]
>> +
>> +	s = &wait_status[lcore_id];
>> +
>> +	/* update sleep address */
>> +	rte_spinlock_lock(&s->lock);
>> +	s->monitor_addr = pmc->addr;
>> +	rte_spinlock_unlock(&s->lock);
> 
> It was a while, since I looked at it last time,
> but shouldn't we grab the lock before monitor()?
> I.E:
> lock();
> monitor();
> addr=...;
> unlock();
> umwait();
> 

I don't believe so.

The idea here is to only store the address when we are looking to sleep, 
and avoid the locks entirely if we already know we aren't going to 
sleep. I mean, technically we could lock unconditionally, then unlock 
when we're done, but there's very little practical difference between 
the two because the moment we are interested in (sleep) happens the same 
way whether we lock before or after monitor().

  
Anatoly Burakov Jan. 12, 2021, 4:25 p.m. UTC | #3
On 12-Jan-21 4:18 PM, Burakov, Anatoly wrote:
> On 12-Jan-21 4:02 PM, Ananyev, Konstantin wrote:
>>
>>> [...]
>>> +
>>> +	s = &wait_status[lcore_id];
>>> +
>>> +	/* update sleep address */
>>> +	rte_spinlock_lock(&s->lock);
>>> +	s->monitor_addr = pmc->addr;
>>> +	rte_spinlock_unlock(&s->lock);
>>
>> It was a while, since I looked at it last time,
>> but shouldn't we grab the lock before monitor()?
>> I.E:
>> lock();
>> monitor();
>> addr=...;
>> unlock();
>> umwait();
>>
> 
> I don't believe so.
> 
> The idea here is to only store the address when we are looking to sleep, 
> and avoid the locks entirely if we already know we aren't going to 
> sleep. I mean, technically we could lock unconditionally, then unlock 
> when we're done, but there's very little practical difference between 
> the two because the moment we are interested in (sleep) happens the same 
> way whether we lock before or after monitor().

On another thought, putting the lock before monitor() and unlocking
afterwards allows us to ask for a wakeup earlier, without necessarily
waiting for a sleep. So I think I'll take your suggestion on board anyway,
thanks!
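
For illustration, a rough sketch of what that reordered sequence inside
rte_power_monitor() could look like (editor's sketch, not the actual
follow-up revision): it reuses the locals from the patch above, elides the
pmc->val/pmc->mask check, and the UMONITOR byte sequence is assumed to be
the 0xf3 counterpart of the 0xf2 UMWAIT encoding already shown.

	s = &wait_status[lcore_id];

	/* arm the monitor and publish the sleep address under the lock, so a
	 * waker either sees no address (and does nothing) or sees it only
	 * once UMONITOR is armed (so its dummy write cancels the UMWAIT) */
	rte_spinlock_lock(&s->lock);

	/* set address for UMONITOR (assumed encoding, see note above) */
	asm volatile(".byte 0xf3, 0x0f, 0xae, 0xf7;"
			:
			: "D"(pmc->addr));

	s->monitor_addr = pmc->addr;

	rte_spinlock_unlock(&s->lock);

	/* execute UMWAIT */
	asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
			: /* ignore rflags */
			: "D"(0), /* enter C0.2 */
			  "a"(tsc_l), "d"(tsc_h));

	/* erase sleep address */
	rte_spinlock_lock(&s->lock);
	s->monitor_addr = NULL;
	rte_spinlock_unlock(&s->lock);

The practical difference from the v13 ordering is that the address becomes
visible to the waker as soon as the monitor is armed, rather than only
right before UMWAIT.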

  

Patch

diff --git a/lib/librte_eal/arm/include/rte_power_intrinsics.h b/lib/librte_eal/arm/include/rte_power_intrinsics.h
index 27869251a8..39e49cc45b 100644
--- a/lib/librte_eal/arm/include/rte_power_intrinsics.h
+++ b/lib/librte_eal/arm/include/rte_power_intrinsics.h
@@ -33,6 +33,15 @@  rte_power_pause(const uint64_t tsc_timestamp)
 	RTE_SET_USED(tsc_timestamp);
 }
 
+/**
+ * This function is not supported on ARM.
+ */
+void
+rte_power_monitor_wakeup(const unsigned int lcore_id)
+{
+	RTE_SET_USED(lcore_id);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/include/generic/rte_power_intrinsics.h b/lib/librte_eal/include/generic/rte_power_intrinsics.h
index a6f1955996..e311d6f8ea 100644
--- a/lib/librte_eal/include/generic/rte_power_intrinsics.h
+++ b/lib/librte_eal/include/generic/rte_power_intrinsics.h
@@ -57,6 +57,22 @@  __rte_experimental
 void rte_power_monitor(const struct rte_power_monitor_cond *pmc,
 		const uint64_t tsc_timestamp);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Wake up a specific lcore that is in a power optimized state and is monitoring
+ * an address.
+ *
+ * @note This function will *not* wake up a core that is in a power optimized
+ *   state due to calling `rte_power_pause`.
+ *
+ * @param lcore_id
+ *   Lcore ID of a sleeping thread.
+ */
+__rte_experimental
+void rte_power_monitor_wakeup(const unsigned int lcore_id);
+
 /**
  * @warning
  * @b EXPERIMENTAL: this API may change without prior notice
diff --git a/lib/librte_eal/ppc/include/rte_power_intrinsics.h b/lib/librte_eal/ppc/include/rte_power_intrinsics.h
index 248d1f4a23..2e7db0e7eb 100644
--- a/lib/librte_eal/ppc/include/rte_power_intrinsics.h
+++ b/lib/librte_eal/ppc/include/rte_power_intrinsics.h
@@ -33,6 +33,15 @@  rte_power_pause(const uint64_t tsc_timestamp)
 	RTE_SET_USED(tsc_timestamp);
 }
 
+/**
+ * This function is not supported on PPC64.
+ */
+void
+rte_power_monitor_wakeup(const unsigned int lcore_id)
+{
+	RTE_SET_USED(lcore_id);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/version.map b/lib/librte_eal/version.map
index 20945b1efa..ac026e289d 100644
--- a/lib/librte_eal/version.map
+++ b/lib/librte_eal/version.map
@@ -406,6 +406,7 @@  EXPERIMENTAL {
 
 	# added in 21.02
 	rte_power_monitor;
+	rte_power_monitor_wakeup;
 	rte_power_pause;
 };
 
diff --git a/lib/librte_eal/x86/rte_power_intrinsics.c b/lib/librte_eal/x86/rte_power_intrinsics.c
index a9cd1afe9d..46a4fb6cd5 100644
--- a/lib/librte_eal/x86/rte_power_intrinsics.c
+++ b/lib/librte_eal/x86/rte_power_intrinsics.c
@@ -2,8 +2,31 @@ 
  * Copyright(c) 2020 Intel Corporation
  */
 
+#include <rte_common.h>
+#include <rte_lcore.h>
+#include <rte_spinlock.h>
+
 #include "rte_power_intrinsics.h"
 
+/*
+ * Per-lcore structure holding current status of C0.2 sleeps.
+ */
+static struct power_wait_status {
+	rte_spinlock_t lock;
+	volatile void *monitor_addr; /**< NULL if not currently sleeping */
+} __rte_cache_aligned wait_status[RTE_MAX_LCORE];
+
+static inline void
+__umwait_wakeup(volatile void *addr)
+{
+	uint64_t val;
+
+	/* trigger a write but don't change the value */
+	val = __atomic_load_n((volatile uint64_t *)addr, __ATOMIC_RELAXED);
+	__atomic_compare_exchange_n((volatile uint64_t *)addr, &val, val, 0,
+			__ATOMIC_RELAXED, __ATOMIC_RELAXED);
+}
+
 static uint8_t wait_supported;
 
 static inline uint64_t
@@ -36,6 +59,12 @@  rte_power_monitor(const struct rte_power_monitor_cond *pmc,
 {
 	const uint32_t tsc_l = (uint32_t)tsc_timestamp;
 	const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
+	const unsigned int lcore_id = rte_lcore_id();
+	struct power_wait_status *s;
+
+	/* prevent non-EAL thread from using this API */
+	if (lcore_id >= RTE_MAX_LCORE)
+		return;
 
 	/* prevent user from running this instruction if it's not supported */
 	if (!wait_supported)
@@ -60,11 +89,24 @@  rte_power_monitor(const struct rte_power_monitor_cond *pmc,
 		if (masked == pmc->val)
 			return;
 	}
+
+	s = &wait_status[lcore_id];
+
+	/* update sleep address */
+	rte_spinlock_lock(&s->lock);
+	s->monitor_addr = pmc->addr;
+	rte_spinlock_unlock(&s->lock);
+
 	/* execute UMWAIT */
 	asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
 			: /* ignore rflags */
 			: "D"(0), /* enter C0.2 */
 			  "a"(tsc_l), "d"(tsc_h));
+
+	/* erase sleep address */
+	rte_spinlock_lock(&s->lock);
+	s->monitor_addr = NULL;
+	rte_spinlock_unlock(&s->lock);
 }
 
 /**
@@ -97,3 +139,46 @@  RTE_INIT(rte_power_intrinsics_init) {
 	if (i.power_monitor && i.power_pause)
 		wait_supported = 1;
 }
+
+void
+rte_power_monitor_wakeup(const unsigned int lcore_id)
+{
+	struct power_wait_status *s;
+
+	/* prevent buffer overrun */
+	if (lcore_id >= RTE_MAX_LCORE)
+		return;
+
+	/* prevent user from running this instruction if it's not supported */
+	if (!wait_supported)
+		return;
+
+	s = &wait_status[lcore_id];
+
+	/*
+	 * There is a race condition between sleep, wakeup and locking, but we
+	 * don't need to handle it.
+	 *
+	 * Possible situations:
+	 *
+	 * 1. T1 locks, sets address, unlocks
+	 * 2. T2 locks, triggers wakeup, unlocks
+	 * 3. T1 sleeps
+	 *
+	 * In this case, because T1 has already set the address for monitoring,
+	 * we will wake up immediately even if T2 triggers wakeup before T1
+	 * goes to sleep.
+	 *
+	 * 1. T1 locks, sets address, unlocks, goes to sleep, and wakes up
+	 * 2. T2 locks, triggers wakeup, and unlocks
+	 * 3. T1 locks, erases address, and unlocks
+	 *
+	 * In this case, since we've already woken up, the "wakeup" was
+	 * unneeded, and since T1 is still waiting on T2 releasing the lock, the
+	 * wakeup address is still valid so it's perfectly safe to write it.
+	 */
+	rte_spinlock_lock(&s->lock);
+	if (s->monitor_addr != NULL)
+		__umwait_wakeup(s->monitor_addr);
+	rte_spinlock_unlock(&s->lock);
+}