[v13,05/11] eal: add monitor wakeup function
Checks
Commit Message
Now that we have everything in a C file, we can store the information
about our sleep, and have a native mechanism to wake up the sleeping
core. This mechanism would however only wake up a core that's sleeping
while monitoring - waking up from `rte_power_pause` won't work.
Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
Notes:
v13:
- Add comments around wakeup code to explain what it does
- Add lcore_id parameter checking to prevent buffer overrun
.../arm/include/rte_power_intrinsics.h | 9 ++
.../include/generic/rte_power_intrinsics.h | 16 ++++
.../ppc/include/rte_power_intrinsics.h | 9 ++
lib/librte_eal/version.map | 1 +
lib/librte_eal/x86/rte_power_intrinsics.c | 85 +++++++++++++++++++
5 files changed, 120 insertions(+)
Comments
>
> Now that we have everything in a C file, we can store the information
> about our sleep, and have a native mechanism to wake up the sleeping
> core. This mechanism would however only wake up a core that's sleeping
> while monitoring - waking up from `rte_power_pause` won't work.
>
> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
> ---
>
> Notes:
> v13:
> - Add comments around wakeup code to explain what it does
> - Add lcore_id parameter checking to prevent buffer overrun
>
> .../arm/include/rte_power_intrinsics.h | 9 ++
> .../include/generic/rte_power_intrinsics.h | 16 ++++
> .../ppc/include/rte_power_intrinsics.h | 9 ++
> lib/librte_eal/version.map | 1 +
> lib/librte_eal/x86/rte_power_intrinsics.c | 85 +++++++++++++++++++
> 5 files changed, 120 insertions(+)
>
> diff --git a/lib/librte_eal/arm/include/rte_power_intrinsics.h b/lib/librte_eal/arm/include/rte_power_intrinsics.h
> index 27869251a8..39e49cc45b 100644
> --- a/lib/librte_eal/arm/include/rte_power_intrinsics.h
> +++ b/lib/librte_eal/arm/include/rte_power_intrinsics.h
> @@ -33,6 +33,15 @@ rte_power_pause(const uint64_t tsc_timestamp)
> RTE_SET_USED(tsc_timestamp);
> }
>
> +/**
> + * This function is not supported on ARM.
> + */
> +void
> +rte_power_monitor_wakeup(const unsigned int lcore_id)
> +{
> + RTE_SET_USED(lcore_id);
> +}
> +
> #ifdef __cplusplus
> }
> #endif
> diff --git a/lib/librte_eal/include/generic/rte_power_intrinsics.h b/lib/librte_eal/include/generic/rte_power_intrinsics.h
> index a6f1955996..e311d6f8ea 100644
> --- a/lib/librte_eal/include/generic/rte_power_intrinsics.h
> +++ b/lib/librte_eal/include/generic/rte_power_intrinsics.h
> @@ -57,6 +57,22 @@ __rte_experimental
> void rte_power_monitor(const struct rte_power_monitor_cond *pmc,
> const uint64_t tsc_timestamp);
>
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Wake up a specific lcore that is in a power optimized state and is monitoring
> + * an address.
> + *
> + * @note This function will *not* wake up a core that is in a power optimized
> + * state due to calling `rte_power_pause`.
> + *
> + * @param lcore_id
> + * Lcore ID of a sleeping thread.
> + */
> +__rte_experimental
> +void rte_power_monitor_wakeup(const unsigned int lcore_id);
> +
> /**
> * @warning
> * @b EXPERIMENTAL: this API may change without prior notice
> diff --git a/lib/librte_eal/ppc/include/rte_power_intrinsics.h b/lib/librte_eal/ppc/include/rte_power_intrinsics.h
> index 248d1f4a23..2e7db0e7eb 100644
> --- a/lib/librte_eal/ppc/include/rte_power_intrinsics.h
> +++ b/lib/librte_eal/ppc/include/rte_power_intrinsics.h
> @@ -33,6 +33,15 @@ rte_power_pause(const uint64_t tsc_timestamp)
> RTE_SET_USED(tsc_timestamp);
> }
>
> +/**
> + * This function is not supported on PPC64.
> + */
> +void
> +rte_power_monitor_wakeup(const unsigned int lcore_id)
> +{
> + RTE_SET_USED(lcore_id);
> +}
> +
> #ifdef __cplusplus
> }
> #endif
> diff --git a/lib/librte_eal/version.map b/lib/librte_eal/version.map
> index 20945b1efa..ac026e289d 100644
> --- a/lib/librte_eal/version.map
> +++ b/lib/librte_eal/version.map
> @@ -406,6 +406,7 @@ EXPERIMENTAL {
>
> # added in 21.02
> rte_power_monitor;
> + rte_power_monitor_wakeup;
> rte_power_pause;
> };
>
> diff --git a/lib/librte_eal/x86/rte_power_intrinsics.c b/lib/librte_eal/x86/rte_power_intrinsics.c
> index a9cd1afe9d..46a4fb6cd5 100644
> --- a/lib/librte_eal/x86/rte_power_intrinsics.c
> +++ b/lib/librte_eal/x86/rte_power_intrinsics.c
> @@ -2,8 +2,31 @@
> * Copyright(c) 2020 Intel Corporation
> */
>
> +#include <rte_common.h>
> +#include <rte_lcore.h>
> +#include <rte_spinlock.h>
> +
> #include "rte_power_intrinsics.h"
>
> +/*
> + * Per-lcore structure holding current status of C0.2 sleeps.
> + */
> +static struct power_wait_status {
> + rte_spinlock_t lock;
> + volatile void *monitor_addr; /**< NULL if not currently sleeping */
> +} __rte_cache_aligned wait_status[RTE_MAX_LCORE];
> +
> +static inline void
> +__umwait_wakeup(volatile void *addr)
> +{
> + uint64_t val;
> +
> + /* trigger a write but don't change the value */
> + val = __atomic_load_n((volatile uint64_t *)addr, __ATOMIC_RELAXED);
> + __atomic_compare_exchange_n((volatile uint64_t *)addr, &val, val, 0,
> + __ATOMIC_RELAXED, __ATOMIC_RELAXED);
> +}
> +
> static uint8_t wait_supported;
>
> static inline uint64_t
> @@ -36,6 +59,12 @@ rte_power_monitor(const struct rte_power_monitor_cond *pmc,
> {
> const uint32_t tsc_l = (uint32_t)tsc_timestamp;
> const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
> + const unsigned int lcore_id = rte_lcore_id();
> + struct power_wait_status *s;
> +
> + /* prevent non-EAL thread from using this API */
> + if (lcore_id >= RTE_MAX_LCORE)
> + return;
>
> /* prevent user from running this instruction if it's not supported */
> if (!wait_supported)
> @@ -60,11 +89,24 @@ rte_power_monitor(const struct rte_power_monitor_cond *pmc,
> if (masked == pmc->val)
> return;
> }
> +
> + s = &wait_status[lcore_id];
> +
> + /* update sleep address */
> + rte_spinlock_lock(&s->lock);
> + s->monitor_addr = pmc->addr;
> + rte_spinlock_unlock(&s->lock);
It has been a while since I looked at it last time,
but shouldn't we grab the lock before monitor()?
I.E:
lock();
monitor();
addr=...;
unlock();
umwait();
> +
> /* execute UMWAIT */
> asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
> : /* ignore rflags */
> : "D"(0), /* enter C0.2 */
> "a"(tsc_l), "d"(tsc_h));
> +
> + /* erase sleep address */
> + rte_spinlock_lock(&s->lock);
> + s->monitor_addr = NULL;
> + rte_spinlock_unlock(&s->lock);
> }
>
> /**
> @@ -97,3 +139,46 @@ RTE_INIT(rte_power_intrinsics_init) {
> if (i.power_monitor && i.power_pause)
> wait_supported = 1;
> }
> +
> +void
> +rte_power_monitor_wakeup(const unsigned int lcore_id)
> +{
> + struct power_wait_status *s;
> +
> + /* prevent buffer overrun */
> + if (lcore_id >= RTE_MAX_LCORE)
> + return;
> +
> + /* prevent user from running this instruction if it's not supported */
> + if (!wait_supported)
> + return;
> +
> + s = &wait_status[lcore_id];
> +
> + /*
> + * There is a race condition between sleep, wakeup and locking, but we
> + * don't need to handle it.
> + *
> + * Possible situations:
> + *
> + * 1. T1 locks, sets address, unlocks
> + * 2. T2 locks, triggers wakeup, unlocks
> + * 3. T1 sleeps
> + *
> + * In this case, because T1 has already set the address for monitoring,
> + * we will wake up immediately even if T2 triggers wakeup before T1
> + * goes to sleep.
> + *
> + * 1. T1 locks, sets address, unlocks, goes to sleep, and wakes up
> + * 2. T2 locks, triggers wakeup, and unlocks
> + * 3. T1 locks, erases address, and unlocks
> + *
> + * In this case, since we've already woken up, the "wakeup" was
> + * unneeded, and since T1 is still waiting on T2 releasing the lock, the
> + * wakeup address is still valid so it's perfectly safe to write it.
> + */
> + rte_spinlock_lock(&s->lock);
> + if (s->monitor_addr != NULL)
> + __umwait_wakeup(s->monitor_addr);
> + rte_spinlock_unlock(&s->lock);
> +}
> --
> 2.25.1
On 12-Jan-21 4:02 PM, Ananyev, Konstantin wrote:
>
>>
>> Now that we have everything in a C file, we can store the information
>> about our sleep, and have a native mechanism to wake up the sleeping
>> core. This mechanism would however only wake up a core that's sleeping
>> while monitoring - waking up from `rte_power_pause` won't work.
>>
>> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
>> ---
>>
>> Notes:
>> v13:
>> - Add comments around wakeup code to explain what it does
>> - Add lcore_id parameter checking to prevent buffer overrun
>>
>> .../arm/include/rte_power_intrinsics.h | 9 ++
>> .../include/generic/rte_power_intrinsics.h | 16 ++++
>> .../ppc/include/rte_power_intrinsics.h | 9 ++
>> lib/librte_eal/version.map | 1 +
>> lib/librte_eal/x86/rte_power_intrinsics.c | 85 +++++++++++++++++++
>> 5 files changed, 120 insertions(+)
>>
>> diff --git a/lib/librte_eal/arm/include/rte_power_intrinsics.h b/lib/librte_eal/arm/include/rte_power_intrinsics.h
>> index 27869251a8..39e49cc45b 100644
>> --- a/lib/librte_eal/arm/include/rte_power_intrinsics.h
>> +++ b/lib/librte_eal/arm/include/rte_power_intrinsics.h
>> @@ -33,6 +33,15 @@ rte_power_pause(const uint64_t tsc_timestamp)
>> RTE_SET_USED(tsc_timestamp);
>> }
>>
>> +/**
>> + * This function is not supported on ARM.
>> + */
>> +void
>> +rte_power_monitor_wakeup(const unsigned int lcore_id)
>> +{
>> +RTE_SET_USED(lcore_id);
>> +}
>> +
>> #ifdef __cplusplus
>> }
>> #endif
>> diff --git a/lib/librte_eal/include/generic/rte_power_intrinsics.h b/lib/librte_eal/include/generic/rte_power_intrinsics.h
>> index a6f1955996..e311d6f8ea 100644
>> --- a/lib/librte_eal/include/generic/rte_power_intrinsics.h
>> +++ b/lib/librte_eal/include/generic/rte_power_intrinsics.h
>> @@ -57,6 +57,22 @@ __rte_experimental
>> void rte_power_monitor(const struct rte_power_monitor_cond *pmc,
>> const uint64_t tsc_timestamp);
>>
>> +/**
>> + * @warning
>> + * @b EXPERIMENTAL: this API may change without prior notice
>> + *
>> + * Wake up a specific lcore that is in a power optimized state and is monitoring
>> + * an address.
>> + *
>> + * @note This function will *not* wake up a core that is in a power optimized
>> + * state due to calling `rte_power_pause`.
>> + *
>> + * @param lcore_id
>> + * Lcore ID of a sleeping thread.
>> + */
>> +__rte_experimental
>> +void rte_power_monitor_wakeup(const unsigned int lcore_id);
>> +
>> /**
>> * @warning
>> * @b EXPERIMENTAL: this API may change without prior notice
>> diff --git a/lib/librte_eal/ppc/include/rte_power_intrinsics.h b/lib/librte_eal/ppc/include/rte_power_intrinsics.h
>> index 248d1f4a23..2e7db0e7eb 100644
>> --- a/lib/librte_eal/ppc/include/rte_power_intrinsics.h
>> +++ b/lib/librte_eal/ppc/include/rte_power_intrinsics.h
>> @@ -33,6 +33,15 @@ rte_power_pause(const uint64_t tsc_timestamp)
>> RTE_SET_USED(tsc_timestamp);
>> }
>>
>> +/**
>> + * This function is not supported on PPC64.
>> + */
>> +void
>> +rte_power_monitor_wakeup(const unsigned int lcore_id)
>> +{
>> +RTE_SET_USED(lcore_id);
>> +}
>> +
>> #ifdef __cplusplus
>> }
>> #endif
>> diff --git a/lib/librte_eal/version.map b/lib/librte_eal/version.map
>> index 20945b1efa..ac026e289d 100644
>> --- a/lib/librte_eal/version.map
>> +++ b/lib/librte_eal/version.map
>> @@ -406,6 +406,7 @@ EXPERIMENTAL {
>>
>> # added in 21.02
>> rte_power_monitor;
>> +rte_power_monitor_wakeup;
>> rte_power_pause;
>> };
>>
>> diff --git a/lib/librte_eal/x86/rte_power_intrinsics.c b/lib/librte_eal/x86/rte_power_intrinsics.c
>> index a9cd1afe9d..46a4fb6cd5 100644
>> --- a/lib/librte_eal/x86/rte_power_intrinsics.c
>> +++ b/lib/librte_eal/x86/rte_power_intrinsics.c
>> @@ -2,8 +2,31 @@
>> * Copyright(c) 2020 Intel Corporation
>> */
>>
>> +#include <rte_common.h>
>> +#include <rte_lcore.h>
>> +#include <rte_spinlock.h>
>> +
>> #include "rte_power_intrinsics.h"
>>
>> +/*
>> + * Per-lcore structure holding current status of C0.2 sleeps.
>> + */
>> +static struct power_wait_status {
>> +rte_spinlock_t lock;
>> +volatile void *monitor_addr; /**< NULL if not currently sleeping */
>> +} __rte_cache_aligned wait_status[RTE_MAX_LCORE];
>> +
>> +static inline void
>> +__umwait_wakeup(volatile void *addr)
>> +{
>> +uint64_t val;
>> +
>> +/* trigger a write but don't change the value */
>> +val = __atomic_load_n((volatile uint64_t *)addr, __ATOMIC_RELAXED);
>> +__atomic_compare_exchange_n((volatile uint64_t *)addr, &val, val, 0,
>> +__ATOMIC_RELAXED, __ATOMIC_RELAXED);
>> +}
>> +
>> static uint8_t wait_supported;
>>
>> static inline uint64_t
>> @@ -36,6 +59,12 @@ rte_power_monitor(const struct rte_power_monitor_cond *pmc,
>> {
>> const uint32_t tsc_l = (uint32_t)tsc_timestamp;
>> const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
>> +const unsigned int lcore_id = rte_lcore_id();
>> +struct power_wait_status *s;
>> +
>> +/* prevent non-EAL thread from using this API */
>> +if (lcore_id >= RTE_MAX_LCORE)
>> +return;
>>
>> /* prevent user from running this instruction if it's not supported */
>> if (!wait_supported)
>> @@ -60,11 +89,24 @@ rte_power_monitor(const struct rte_power_monitor_cond *pmc,
>> if (masked == pmc->val)
>> return;
>> }
>> +
>> +s = &wait_status[lcore_id];
>> +
>> +/* update sleep address */
>> +rte_spinlock_lock(&s->lock);
>> +s->monitor_addr = pmc->addr;
>> +rte_spinlock_unlock(&s->lock);
>
> It was a while, since I looked at it last time,
> but shouldn't we grab the lock before monitor()?
> I.E:
> lock();
> monitor();
> addr=...;
> unlock();
> umwait();
>
I don't believe so.
The idea here is to only store the address when we are looking to sleep,
and avoid the locks entirely if we already know we aren't going to
sleep. I mean, technically we could lock unconditionally, then unlock
when we're done, but there's very little practical difference between
the two because the moment we are interested in (sleep) happens the same
way whether we lock before or after monitor().
>> +
>> /* execute UMWAIT */
>> asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
>> : /* ignore rflags */
>> : "D"(0), /* enter C0.2 */
>> "a"(tsc_l), "d"(tsc_h));
>> +
>> +/* erase sleep address */
>> +rte_spinlock_lock(&s->lock);
>> +s->monitor_addr = NULL;
>> +rte_spinlock_unlock(&s->lock);
>> }
>>
>> /**
>> @@ -97,3 +139,46 @@ RTE_INIT(rte_power_intrinsics_init) {
>> if (i.power_monitor && i.power_pause)
>> wait_supported = 1;
>> }
>> +
>> +void
>> +rte_power_monitor_wakeup(const unsigned int lcore_id)
>> +{
>> +struct power_wait_status *s;
>> +
>> +/* prevent buffer overrun */
>> +if (lcore_id >= RTE_MAX_LCORE)
>> +return;
>> +
>> +/* prevent user from running this instruction if it's not supported */
>> +if (!wait_supported)
>> +return;
>> +
>> +s = &wait_status[lcore_id];
>> +
>> +/*
>> + * There is a race condition between sleep, wakeup and locking, but we
>> + * don't need to handle it.
>> + *
>> + * Possible situations:
>> + *
>> + * 1. T1 locks, sets address, unlocks
>> + * 2. T2 locks, triggers wakeup, unlocks
>> + * 3. T1 sleeps
>> + *
>> + * In this case, because T1 has already set the address for monitoring,
>> + * we will wake up immediately even if T2 triggers wakeup before T1
>> + * goes to sleep.
>> + *
>> + * 1. T1 locks, sets address, unlocks, goes to sleep, and wakes up
>> + * 2. T2 locks, triggers wakeup, and unlocks
>> + * 3. T1 locks, erases address, and unlocks
>> + *
>> + * In this case, since we've already woken up, the "wakeup" was
>> + * unneeded, and since T1 is still waiting on T2 releasing the lock, the
>> + * wakeup address is still valid so it's perfectly safe to write it.
>> + */
>> +rte_spinlock_lock(&s->lock);
>> +if (s->monitor_addr != NULL)
>> +__umwait_wakeup(s->monitor_addr);
>> +rte_spinlock_unlock(&s->lock);
>> +}
>> --
>> 2.25.1
On 12-Jan-21 4:18 PM, Burakov, Anatoly wrote:
> On 12-Jan-21 4:02 PM, Ananyev, Konstantin wrote:
>>
>>>
>>> Now that we have everything in a C file, we can store the information
>>> about our sleep, and have a native mechanism to wake up the sleeping
>>> core. This mechanism would however only wake up a core that's sleeping
>>> while monitoring - waking up from `rte_power_pause` won't work.
>>>
>>> Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
>>> ---
>>>
>>> Notes:
>>> v13:
>>> - Add comments around wakeup code to explain what it does
>>> - Add lcore_id parameter checking to prevent buffer overrun
>>>
>>> .../arm/include/rte_power_intrinsics.h | 9 ++
>>> .../include/generic/rte_power_intrinsics.h | 16 ++++
>>> .../ppc/include/rte_power_intrinsics.h | 9 ++
>>> lib/librte_eal/version.map | 1 +
>>> lib/librte_eal/x86/rte_power_intrinsics.c | 85 +++++++++++++++++++
>>> 5 files changed, 120 insertions(+)
>>>
>>> diff --git a/lib/librte_eal/arm/include/rte_power_intrinsics.h
>>> b/lib/librte_eal/arm/include/rte_power_intrinsics.h
>>> index 27869251a8..39e49cc45b 100644
>>> --- a/lib/librte_eal/arm/include/rte_power_intrinsics.h
>>> +++ b/lib/librte_eal/arm/include/rte_power_intrinsics.h
>>> @@ -33,6 +33,15 @@ rte_power_pause(const uint64_t tsc_timestamp)
>>> RTE_SET_USED(tsc_timestamp);
>>> }
>>>
>>> +/**
>>> + * This function is not supported on ARM.
>>> + */
>>> +void
>>> +rte_power_monitor_wakeup(const unsigned int lcore_id)
>>> +{
>>> +RTE_SET_USED(lcore_id);
>>> +}
>>> +
>>> #ifdef __cplusplus
>>> }
>>> #endif
>>> diff --git a/lib/librte_eal/include/generic/rte_power_intrinsics.h
>>> b/lib/librte_eal/include/generic/rte_power_intrinsics.h
>>> index a6f1955996..e311d6f8ea 100644
>>> --- a/lib/librte_eal/include/generic/rte_power_intrinsics.h
>>> +++ b/lib/librte_eal/include/generic/rte_power_intrinsics.h
>>> @@ -57,6 +57,22 @@ __rte_experimental
>>> void rte_power_monitor(const struct rte_power_monitor_cond *pmc,
>>> const uint64_t tsc_timestamp);
>>>
>>> +/**
>>> + * @warning
>>> + * @b EXPERIMENTAL: this API may change without prior notice
>>> + *
>>> + * Wake up a specific lcore that is in a power optimized state and
>>> is monitoring
>>> + * an address.
>>> + *
>>> + * @note This function will *not* wake up a core that is in a power
>>> optimized
>>> + * state due to calling `rte_power_pause`.
>>> + *
>>> + * @param lcore_id
>>> + * Lcore ID of a sleeping thread.
>>> + */
>>> +__rte_experimental
>>> +void rte_power_monitor_wakeup(const unsigned int lcore_id);
>>> +
>>> /**
>>> * @warning
>>> * @b EXPERIMENTAL: this API may change without prior notice
>>> diff --git a/lib/librte_eal/ppc/include/rte_power_intrinsics.h
>>> b/lib/librte_eal/ppc/include/rte_power_intrinsics.h
>>> index 248d1f4a23..2e7db0e7eb 100644
>>> --- a/lib/librte_eal/ppc/include/rte_power_intrinsics.h
>>> +++ b/lib/librte_eal/ppc/include/rte_power_intrinsics.h
>>> @@ -33,6 +33,15 @@ rte_power_pause(const uint64_t tsc_timestamp)
>>> RTE_SET_USED(tsc_timestamp);
>>> }
>>>
>>> +/**
>>> + * This function is not supported on PPC64.
>>> + */
>>> +void
>>> +rte_power_monitor_wakeup(const unsigned int lcore_id)
>>> +{
>>> +RTE_SET_USED(lcore_id);
>>> +}
>>> +
>>> #ifdef __cplusplus
>>> }
>>> #endif
>>> diff --git a/lib/librte_eal/version.map b/lib/librte_eal/version.map
>>> index 20945b1efa..ac026e289d 100644
>>> --- a/lib/librte_eal/version.map
>>> +++ b/lib/librte_eal/version.map
>>> @@ -406,6 +406,7 @@ EXPERIMENTAL {
>>>
>>> # added in 21.02
>>> rte_power_monitor;
>>> +rte_power_monitor_wakeup;
>>> rte_power_pause;
>>> };
>>>
>>> diff --git a/lib/librte_eal/x86/rte_power_intrinsics.c
>>> b/lib/librte_eal/x86/rte_power_intrinsics.c
>>> index a9cd1afe9d..46a4fb6cd5 100644
>>> --- a/lib/librte_eal/x86/rte_power_intrinsics.c
>>> +++ b/lib/librte_eal/x86/rte_power_intrinsics.c
>>> @@ -2,8 +2,31 @@
>>> * Copyright(c) 2020 Intel Corporation
>>> */
>>>
>>> +#include <rte_common.h>
>>> +#include <rte_lcore.h>
>>> +#include <rte_spinlock.h>
>>> +
>>> #include "rte_power_intrinsics.h"
>>>
>>> +/*
>>> + * Per-lcore structure holding current status of C0.2 sleeps.
>>> + */
>>> +static struct power_wait_status {
>>> +rte_spinlock_t lock;
>>> +volatile void *monitor_addr; /**< NULL if not currently sleeping */
>>> +} __rte_cache_aligned wait_status[RTE_MAX_LCORE];
>>> +
>>> +static inline void
>>> +__umwait_wakeup(volatile void *addr)
>>> +{
>>> +uint64_t val;
>>> +
>>> +/* trigger a write but don't change the value */
>>> +val = __atomic_load_n((volatile uint64_t *)addr, __ATOMIC_RELAXED);
>>> +__atomic_compare_exchange_n((volatile uint64_t *)addr, &val, val, 0,
>>> +__ATOMIC_RELAXED, __ATOMIC_RELAXED);
>>> +}
>>> +
>>> static uint8_t wait_supported;
>>>
>>> static inline uint64_t
>>> @@ -36,6 +59,12 @@ rte_power_monitor(const struct
>>> rte_power_monitor_cond *pmc,
>>> {
>>> const uint32_t tsc_l = (uint32_t)tsc_timestamp;
>>> const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
>>> +const unsigned int lcore_id = rte_lcore_id();
>>> +struct power_wait_status *s;
>>> +
>>> +/* prevent non-EAL thread from using this API */
>>> +if (lcore_id >= RTE_MAX_LCORE)
>>> +return;
>>>
>>> /* prevent user from running this instruction if it's not supported */
>>> if (!wait_supported)
>>> @@ -60,11 +89,24 @@ rte_power_monitor(const struct
>>> rte_power_monitor_cond *pmc,
>>> if (masked == pmc->val)
>>> return;
>>> }
>>> +
>>> +s = &wait_status[lcore_id];
>>> +
>>> +/* update sleep address */
>>> +rte_spinlock_lock(&s->lock);
>>> +s->monitor_addr = pmc->addr;
>>> +rte_spinlock_unlock(&s->lock);
>>
>> It was a while, since I looked at it last time,
>> but shouldn't we grab the lock before monitor()?
>> I.E:
>> lock();
>> monitor();
>> addr=...;
>> unlock();
>> umwait();
>>
>
> I don't believe so.
>
> The idea here is to only store the address when we are looking to sleep,
> and avoid the locks entirely if we already know we aren't going to
> sleep. I mean, technically we could lock unconditionally, then unlock
> when we're done, but there's very little practical difference between
> the two because the moment we are interested in (sleep) happens the same
> way whether we lock before or after monitor().
On second thought, putting the lock before monitor() and unlocking
afterwards allows us to request a wakeup earlier, without necessarily
waiting for a sleep. So I think I'll take your suggestion on board anyway,
thanks!
>
>>> +
>>> /* execute UMWAIT */
>>> asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
>>> : /* ignore rflags */
>>> : "D"(0), /* enter C0.2 */
>>> "a"(tsc_l), "d"(tsc_h));
>>> +
>>> +/* erase sleep address */
>>> +rte_spinlock_lock(&s->lock);
>>> +s->monitor_addr = NULL;
>>> +rte_spinlock_unlock(&s->lock);
>>> }
>>>
>>> /**
>>> @@ -97,3 +139,46 @@ RTE_INIT(rte_power_intrinsics_init) {
>>> if (i.power_monitor && i.power_pause)
>>> wait_supported = 1;
>>> }
>>> +
>>> +void
>>> +rte_power_monitor_wakeup(const unsigned int lcore_id)
>>> +{
>>> +struct power_wait_status *s;
>>> +
>>> +/* prevent buffer overrun */
>>> +if (lcore_id >= RTE_MAX_LCORE)
>>> +return;
>>> +
>>> +/* prevent user from running this instruction if it's not supported */
>>> +if (!wait_supported)
>>> +return;
>>> +
>>> +s = &wait_status[lcore_id];
>>> +
>>> +/*
>>> + * There is a race condition between sleep, wakeup and locking, but we
>>> + * don't need to handle it.
>>> + *
>>> + * Possible situations:
>>> + *
>>> + * 1. T1 locks, sets address, unlocks
>>> + * 2. T2 locks, triggers wakeup, unlocks
>>> + * 3. T1 sleeps
>>> + *
>>> + * In this case, because T1 has already set the address for monitoring,
>>> + * we will wake up immediately even if T2 triggers wakeup before T1
>>> + * goes to sleep.
>>> + *
>>> + * 1. T1 locks, sets address, unlocks, goes to sleep, and wakes up
>>> + * 2. T2 locks, triggers wakeup, and unlocks
>>> + * 3. T1 locks, erases address, and unlocks
>>> + *
>>> + * In this case, since we've already woken up, the "wakeup" was
>>> + * unneeded, and since T1 is still waiting on T2 releasing the lock,
>>> the
>>> + * wakeup address is still valid so it's perfectly safe to write it.
>>> + */
>>> +rte_spinlock_lock(&s->lock);
>>> +if (s->monitor_addr != NULL)
>>> +__umwait_wakeup(s->monitor_addr);
>>> +rte_spinlock_unlock(&s->lock);
>>> +}
>>> --
>>> 2.25.1
>
>
@@ -33,6 +33,15 @@ rte_power_pause(const uint64_t tsc_timestamp)
RTE_SET_USED(tsc_timestamp);
}
+/**
+ * This function is not supported on ARM.
+ */
+void
+rte_power_monitor_wakeup(const unsigned int lcore_id)
+{
+ RTE_SET_USED(lcore_id);
+}
+
#ifdef __cplusplus
}
#endif
@@ -57,6 +57,22 @@ __rte_experimental
void rte_power_monitor(const struct rte_power_monitor_cond *pmc,
const uint64_t tsc_timestamp);
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Wake up a specific lcore that is in a power optimized state and is monitoring
+ * an address.
+ *
+ * @note This function will *not* wake up a core that is in a power optimized
+ * state due to calling `rte_power_pause`.
+ *
+ * @param lcore_id
+ * Lcore ID of a sleeping thread.
+ */
+__rte_experimental
+void rte_power_monitor_wakeup(const unsigned int lcore_id);
+
/**
* @warning
* @b EXPERIMENTAL: this API may change without prior notice
@@ -33,6 +33,15 @@ rte_power_pause(const uint64_t tsc_timestamp)
RTE_SET_USED(tsc_timestamp);
}
+/**
+ * This function is not supported on PPC64.
+ */
+void
+rte_power_monitor_wakeup(const unsigned int lcore_id)
+{
+ RTE_SET_USED(lcore_id);
+}
+
#ifdef __cplusplus
}
#endif
@@ -406,6 +406,7 @@ EXPERIMENTAL {
# added in 21.02
rte_power_monitor;
+ rte_power_monitor_wakeup;
rte_power_pause;
};
@@ -2,8 +2,31 @@
* Copyright(c) 2020 Intel Corporation
*/
+#include <rte_common.h>
+#include <rte_lcore.h>
+#include <rte_spinlock.h>
+
#include "rte_power_intrinsics.h"
+/*
+ * Per-lcore structure holding current status of C0.2 sleeps.
+ */
+static struct power_wait_status {
+ rte_spinlock_t lock;
+ volatile void *monitor_addr; /**< NULL if not currently sleeping */
+} __rte_cache_aligned wait_status[RTE_MAX_LCORE];
+
+static inline void
+__umwait_wakeup(volatile void *addr)
+{
+ uint64_t val;
+
+ /* trigger a write but don't change the value */
+ val = __atomic_load_n((volatile uint64_t *)addr, __ATOMIC_RELAXED);
+ __atomic_compare_exchange_n((volatile uint64_t *)addr, &val, val, 0,
+ __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+}
+
static uint8_t wait_supported;
static inline uint64_t
@@ -36,6 +59,12 @@ rte_power_monitor(const struct rte_power_monitor_cond *pmc,
{
const uint32_t tsc_l = (uint32_t)tsc_timestamp;
const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
+ const unsigned int lcore_id = rte_lcore_id();
+ struct power_wait_status *s;
+
+ /* prevent non-EAL thread from using this API */
+ if (lcore_id >= RTE_MAX_LCORE)
+ return;
/* prevent user from running this instruction if it's not supported */
if (!wait_supported)
@@ -60,11 +89,24 @@ rte_power_monitor(const struct rte_power_monitor_cond *pmc,
if (masked == pmc->val)
return;
}
+
+ s = &wait_status[lcore_id];
+
+ /* update sleep address */
+ rte_spinlock_lock(&s->lock);
+ s->monitor_addr = pmc->addr;
+ rte_spinlock_unlock(&s->lock);
+
/* execute UMWAIT */
asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
: /* ignore rflags */
: "D"(0), /* enter C0.2 */
"a"(tsc_l), "d"(tsc_h));
+
+ /* erase sleep address */
+ rte_spinlock_lock(&s->lock);
+ s->monitor_addr = NULL;
+ rte_spinlock_unlock(&s->lock);
}
/**
@@ -97,3 +139,46 @@ RTE_INIT(rte_power_intrinsics_init) {
if (i.power_monitor && i.power_pause)
wait_supported = 1;
}
+
+void
+rte_power_monitor_wakeup(const unsigned int lcore_id)
+{
+ struct power_wait_status *s;
+
+ /* prevent buffer overrun */
+ if (lcore_id >= RTE_MAX_LCORE)
+ return;
+
+ /* prevent user from running this instruction if it's not supported */
+ if (!wait_supported)
+ return;
+
+ s = &wait_status[lcore_id];
+
+ /*
+ * There is a race condition between sleep, wakeup and locking, but we
+ * don't need to handle it.
+ *
+ * Possible situations:
+ *
+ * 1. T1 locks, sets address, unlocks
+ * 2. T2 locks, triggers wakeup, unlocks
+ * 3. T1 sleeps
+ *
+ * In this case, because T1 has already set the address for monitoring,
+ * we will wake up immediately even if T2 triggers wakeup before T1
+ * goes to sleep.
+ *
+ * 1. T1 locks, sets address, unlocks, goes to sleep, and wakes up
+ * 2. T2 locks, triggers wakeup, and unlocks
+ * 3. T1 locks, erases address, and unlocks
+ *
+ * In this case, since we've already woken up, the "wakeup" was
+ * unneeded, and since T1 is still waiting on T2 releasing the lock, the
+ * wakeup address is still valid so it's perfectly safe to write it.
+ */
+ rte_spinlock_lock(&s->lock);
+ if (s->monitor_addr != NULL)
+ __umwait_wakeup(s->monitor_addr);
+ rte_spinlock_unlock(&s->lock);
+}