[RFC] eal/riscv: add support for zawrs extension
Checks
Commit Message
The zawrs extension adds a pair of instructions that stall a core until
a memory location is written to. This patch uses one of them to
implement RISCV-specific versions of the rte_wait_until_equal_*
functions. This is potentially more energy efficient than the default
implementation that uses rte_pause/Zihintpause.
The technique works as follows:
* Create a reservation set containing the address we want to wait on
using an atomic load (lr.dw)
* Call wrs.nto - this blocks until the reservation set is invalidated by
someone else writing to that address
* Execution can also resume arbitrarily, so we still need to check
whether a change occurred and loop if not
Due to RISC-V atomics only supporting naturally aligned word (32 bit)
and double word (64 bit) loads, I've used pointer rounding and bit
shifting to implement waiting on 16-bit values.
This new functionality is controlled by a Meson flag that is disabled by
default.
Signed-off-by: Daniel Gregory <daniel.gregory@bytedance.com>
Suggested-by: Punit Agrawal <punit.agrawal@bytedance.com>
---
Posting as an RFC to get early feedback and enable testing by others
with Zawrs-enabled hardware. Whilst I have been able to test it compiles
& passes tests using QEMU, I am waiting on some Zawrs-enabled hardware
to become available before I carry out performance tests.
Nonetheless, I would be glad to hear any feedback on the general
approach. Thanks, Daniel
config/riscv/meson.build | 5 ++
lib/eal/riscv/include/rte_pause.h | 139 ++++++++++++++++++++++++++++++
2 files changed, 144 insertions(+)
Comments
I'm terribly sorry for the delay in review. I'll try to get to it in
the next 2 days.
On Thu, May 2, 2024 at 4:44 PM Daniel Gregory
<daniel.gregory@bytedance.com> wrote:
>
> The zawrs extension adds a pair of instructions that stall a core until
> a memory location is written to. This patch uses one of them to
> implement RISCV-specific versions of the rte_wait_until_equal_*
> functions. This is potentially more energy efficient than the default
> implementation that uses rte_pause/Zihintpause.
>
> The technique works as follows:
>
> * Create a reservation set containing the address we want to wait on
> using an atomic load (lr.dw)
> * Call wrs.nto - this blocks until the reservation set is invalidated by
> someone else writing to that address
> * Execution can also resume arbitrarily, so we still need to check
> whether a change occurred and loop if not
>
> Due to RISC-V atomics only supporting naturally aligned word (32 bit)
> and double word (64 bit) loads, I've used pointer rounding and bit
> shifting to implement waiting on 16-bit values.
>
> This new functionality is controlled by a Meson flag that is disabled by
> default.
>
> Signed-off-by: Daniel Gregory <daniel.gregory@bytedance.com>
> Suggested-by: Punit Agrawal <punit.agrawal@bytedance.com>
> ---
>
> Posting as an RFC to get early feedback and enable testing by others
> with Zawrs-enabled hardware. Whilst I have been able to test it compiles
> & passes tests using QEMU, I am waiting on some Zawrs-enabled hardware
> to become available before I carry out performance tests.
>
> Nonetheless, I would be glad to hear any feedback on the general
> approach. Thanks, Daniel
>
> config/riscv/meson.build | 5 ++
> lib/eal/riscv/include/rte_pause.h | 139 ++++++++++++++++++++++++++++++
> 2 files changed, 144 insertions(+)
>
> diff --git a/config/riscv/meson.build b/config/riscv/meson.build
> index 07d7d9da23..4cfdc42ecb 100644
> --- a/config/riscv/meson.build
> +++ b/config/riscv/meson.build
> @@ -26,6 +26,11 @@ flags_common = [
> # read from /proc/device-tree/cpus/timebase-frequency. This property is
> # guaranteed on Linux, as riscv time_init() requires it.
> ['RTE_RISCV_TIME_FREQ', 0],
> +
> + # Enable use of RISC-V Wait-on-Reservation-Set extension (Zawrs)
> + # Mitigates looping when polling on memory locations
> + # Make sure to add '_zawrs' to your target's -march below
> + ['RTE_RISCV_ZAWRS', false]
> ]
>
> ## SoC-specific options.
> diff --git a/lib/eal/riscv/include/rte_pause.h b/lib/eal/riscv/include/rte_pause.h
> index cb8e9ca52d..e7b43dffa3 100644
> --- a/lib/eal/riscv/include/rte_pause.h
> +++ b/lib/eal/riscv/include/rte_pause.h
> @@ -11,6 +11,12 @@
> extern "C" {
> #endif
>
> +#ifdef RTE_RISCV_ZAWRS
> +#define RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED
> +#endif
> +
> +#include <rte_debug.h>
> +
> #include "rte_atomic.h"
>
> #include "generic/rte_pause.h"
> @@ -24,6 +30,139 @@ static inline void rte_pause(void)
> asm volatile(".int 0x0100000F" : : : "memory");
> }
>
> +#ifdef RTE_RISCV_ZAWRS
> +
> +/*
> + * Atomic load from an address, it returns either a sign-extended word or
> + * doubleword and creates a 'reservation set' containing the read memory
> + * location. When someone else writes to the reservation set, it is invalidated,
> + * causing any stalled WRS instructions to resume.
> + *
> + * Address needs to be naturally aligned.
> + */
> +#define __RTE_RISCV_LR_32(src, dst, memorder) do { \
> + if ((memorder) == rte_memory_order_relaxed) { \
> + asm volatile("lr.w %0, (%1)" \
> + : "=r" (dst) \
> + : "r" (src) \
> + : "memory"); \
> + } else { \
> + asm volatile("lr.w.aq %0, (%1)" \
> + : "=r" (dst) \
> + : "r" (src) \
> + : "memory"); \
> + } } while (0)
> +#define __RTE_RISCV_LR_64(src, dst, memorder) do { \
> + if ((memorder) == rte_memory_order_relaxed) { \
> + asm volatile("lr.d %0, (%1)" \
> + : "=r" (dst) \
> + : "r" (src) \
> + : "memory"); \
> + } else { \
> + asm volatile("lr.d.aq %0, (%1)" \
> + : "=r" (dst) \
> + : "r" (src) \
> + : "memory"); \
> + } } while (0)
> +
> +/*
> + * There's not a RISC-V atomic load primitive for halfwords, so cast up to a
> + * _naturally aligned_ word and extract the halfword we want
> + */
> +#define __RTE_RISCV_LR_16(src, dst, memorder) do { \
> + uint32_t word; \
> + __RTE_RISCV_LR_32(((uintptr_t)(src) & (~3)), word, (memorder)); \
> + if ((size_t)(src) & 3) \
> + (dst) = (typeof(dst))(word >> 16); \
> + else \
> + (dst) = (typeof(dst))(word & 0xFFFF); \
> +} while (0)
> +
> +#define __RTE_RISCV_LR(src, dst, memorder, size) { \
> + RTE_BUILD_BUG_ON(size != 16 && size != 32 && size != 64); \
> + if (size == 16) \
> + __RTE_RISCV_LR_16(src, dst, memorder); \
> + else if (size == 32) \
> + __RTE_RISCV_LR_32(src, dst, memorder); \
> + else if (size == 64) \
> + __RTE_RISCV_LR_64(src, dst, memorder); \
> +}
> +
> +/*
> + * Wait-on-Reservation-Set extension instruction, it stalls execution until the
> + * reservation set is invalidated or an interrupt is observed.
> + * A loop is likely still needed as it may stop stalling arbitrarily.
> + */
> +#define __RTE_RISCV_WRS_NTO() { asm volatile("wrs.nto" : : : "memory"); }
> +
> +static __rte_always_inline void
> +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> + int memorder)
> +{
> + uint16_t value;
> +
> + RTE_ASSERT(memorder == rte_memory_order_acquire ||
> + memorder == rte_memory_order_relaxed);
> + RTE_ASSERT(((size_t)addr & 1) == 0);
> +
> + __RTE_RISCV_LR_16(addr, value, memorder);
> + while (value != expected) {
> + __RTE_RISCV_WRS_NTO();
> + __RTE_RISCV_LR_16(addr, value, memorder);
> + }
> +}
> +
> +static __rte_always_inline void
> +rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
> + int memorder)
> +{
> + uint32_t value;
> +
> + RTE_ASSERT(memorder == rte_memory_order_acquire ||
> + memorder == rte_memory_order_relaxed);
> + RTE_ASSERT(((size_t)addr & 3) == 0);
> +
> + __RTE_RISCV_LR_32(addr, value, memorder);
> + while (value != expected) {
> + __RTE_RISCV_WRS_NTO();
> + __RTE_RISCV_LR_32(addr, value, memorder);
> + }
> +}
> +
> +static __rte_always_inline void
> +rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
> + int memorder)
> +{
> + uint64_t value;
> +
> + RTE_ASSERT(memorder == rte_memory_order_acquire ||
> + memorder == rte_memory_order_relaxed);
> + RTE_ASSERT(((size_t)addr & 7) == 0);
> +
> + __RTE_RISCV_LR_64(addr, value, memorder);
> + while (value != expected) {
> + __RTE_RISCV_WRS_NTO();
> + __RTE_RISCV_LR_64(addr, value, memorder);
> + }
> +}
> +
> +#define RTE_WAIT_UNTIL_MASKED(addr, mask, cond, expected, memorder) do { \
> + RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder)); \
> + RTE_BUILD_BUG_ON(memorder != rte_memory_order_acquire && \
> + memorder != rte_memory_order_relaxed); \
> + RTE_ASSERT(((size_t)(addr) & (sizeof(*(addr)) - 1)) != 0); \
> + const uint32_t size = sizeof(*(addr)) << 3; \
> + typeof(*(addr)) expected_value = (expected); \
> + typeof(*(addr)) value; \
> + __RTE_RISCV_LR((addr), value, memorder, size); \
> + while (!((value & (mask)) cond expected_value)) { \
> + __RTE_RISCV_WRS_NTO(); \
> + __RTE_RISCV_LR((addr), value, memorder, size); \
> + } \
> +} while (0)
> +
> +#endif /* RTE_RISCV_ZAWRS */
> +
> #ifdef __cplusplus
> }
> #endif
> --
> 2.39.2
>
On Thu, May 2, 2024 at 4:44 PM Daniel Gregory
<daniel.gregory@bytedance.com> wrote:
>
> The zawrs extension adds a pair of instructions that stall a core until
> a memory location is written to. This patch uses one of them to
> implement RISCV-specific versions of the rte_wait_until_equal_*
> functions. This is potentially more energy efficient than the default
> implementation that uses rte_pause/Zihintpause.
>
> The technique works as follows:
>
> * Create a reservation set containing the address we want to wait on
> using an atomic load (lr.dw)
> * Call wrs.nto - this blocks until the reservation set is invalidated by
> someone else writing to that address
> * Execution can also resume arbitrarily, so we still need to check
> whether a change occurred and loop if not
>
> Due to RISC-V atomics only supporting naturally aligned word (32 bit)
> and double word (64 bit) loads, I've used pointer rounding and bit
> shifting to implement waiting on 16-bit values.
>
> This new functionality is controlled by a Meson flag that is disabled by
> default.
>
> Signed-off-by: Daniel Gregory <daniel.gregory@bytedance.com>
> Suggested-by: Punit Agrawal <punit.agrawal@bytedance.com>
> ---
>
> Posting as an RFC to get early feedback and enable testing by others
> with Zawrs-enabled hardware. Whilst I have been able to test it compiles
> & passes tests using QEMU, I am waiting on some Zawrs-enabled hardware
> to become available before I carry out performance tests.
>
> Nonetheless, I would be glad to hear any feedback on the general
> approach. Thanks, Daniel
>
> config/riscv/meson.build | 5 ++
> lib/eal/riscv/include/rte_pause.h | 139 ++++++++++++++++++++++++++++++
> 2 files changed, 144 insertions(+)
>
> diff --git a/config/riscv/meson.build b/config/riscv/meson.build
> index 07d7d9da23..4cfdc42ecb 100644
> --- a/config/riscv/meson.build
> +++ b/config/riscv/meson.build
> @@ -26,6 +26,11 @@ flags_common = [
> # read from /proc/device-tree/cpus/timebase-frequency. This property is
> # guaranteed on Linux, as riscv time_init() requires it.
> ['RTE_RISCV_TIME_FREQ', 0],
> +
> + # Enable use of RISC-V Wait-on-Reservation-Set extension (Zawrs)
> + # Mitigates looping when polling on memory locations
> + # Make sure to add '_zawrs' to your target's -march below
> + ['RTE_RISCV_ZAWRS', false]
A bit orthogonal to this patch (or maybe not?)
Should we perhaps add a Qemu target in meson.build which would have
the modified -march for what qemu supports now?
Or perhaps add machine detection logic based either on the "riscv,isa"
cpu@0 property in the DT or RHCT ACPI table?
Or add perhaps some other way we could specify the extension list
suffix for -march?
> ]
>
> ## SoC-specific options.
> diff --git a/lib/eal/riscv/include/rte_pause.h b/lib/eal/riscv/include/rte_pause.h
> index cb8e9ca52d..e7b43dffa3 100644
> --- a/lib/eal/riscv/include/rte_pause.h
> +++ b/lib/eal/riscv/include/rte_pause.h
> @@ -11,6 +11,12 @@
> extern "C" {
> #endif
>
> +#ifdef RTE_RISCV_ZAWRS
> +#define RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED
> +#endif
> +
> +#include <rte_debug.h>
> +
> #include "rte_atomic.h"
>
> #include "generic/rte_pause.h"
> @@ -24,6 +30,139 @@ static inline void rte_pause(void)
> asm volatile(".int 0x0100000F" : : : "memory");
> }
>
> +#ifdef RTE_RISCV_ZAWRS
> +
> +/*
> + * Atomic load from an address, it returns either a sign-extended word or
> + * doubleword and creates a 'reservation set' containing the read memory
> + * location. When someone else writes to the reservation set, it is invalidated,
> + * causing any stalled WRS instructions to resume.
> + *
> + * Address needs to be naturally aligned.
> + */
> +#define __RTE_RISCV_LR_32(src, dst, memorder) do { \
> + if ((memorder) == rte_memory_order_relaxed) { \
> + asm volatile("lr.w %0, (%1)" \
> + : "=r" (dst) \
> + : "r" (src) \
> + : "memory"); \
> + } else { \
> + asm volatile("lr.w.aq %0, (%1)" \
> + : "=r" (dst) \
> + : "r" (src) \
> + : "memory"); \
> + } } while (0)
> +#define __RTE_RISCV_LR_64(src, dst, memorder) do { \
> + if ((memorder) == rte_memory_order_relaxed) { \
> + asm volatile("lr.d %0, (%1)" \
> + : "=r" (dst) \
> + : "r" (src) \
> + : "memory"); \
> + } else { \
> + asm volatile("lr.d.aq %0, (%1)" \
> + : "=r" (dst) \
> + : "r" (src) \
> + : "memory"); \
> + } } while (0)
> +
> +/*
> + * There's not a RISC-V atomic load primitive for halfwords, so cast up to a
> + * _naturally aligned_ word and extract the halfword we want
> + */
> +#define __RTE_RISCV_LR_16(src, dst, memorder) do { \
> + uint32_t word; \
> + __RTE_RISCV_LR_32(((uintptr_t)(src) & (~3)), word, (memorder)); \
> + if ((size_t)(src) & 3) \
> + (dst) = (typeof(dst))(word >> 16); \
> + else \
> + (dst) = (typeof(dst))(word & 0xFFFF); \
> +} while (0)
> +
> +#define __RTE_RISCV_LR(src, dst, memorder, size) { \
> + RTE_BUILD_BUG_ON(size != 16 && size != 32 && size != 64); \
> + if (size == 16) \
> + __RTE_RISCV_LR_16(src, dst, memorder); \
> + else if (size == 32) \
> + __RTE_RISCV_LR_32(src, dst, memorder); \
> + else if (size == 64) \
> + __RTE_RISCV_LR_64(src, dst, memorder); \
> +}
> +
> +/*
> + * Wait-on-Reservation-Set extension instruction, it stalls execution until the
> + * reservation set is invalidated or an interrupt is observed.
> + * A loop is likely still needed as it may stop stalling arbitrarily.
> + */
> +#define __RTE_RISCV_WRS_NTO() { asm volatile("wrs.nto" : : : "memory"); }
> +
> +static __rte_always_inline void
> +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> + int memorder)
> +{
> + uint16_t value;
> +
> + RTE_ASSERT(memorder == rte_memory_order_acquire ||
> + memorder == rte_memory_order_relaxed);
> + RTE_ASSERT(((size_t)addr & 1) == 0);
> +
> + __RTE_RISCV_LR_16(addr, value, memorder);
> + while (value != expected) {
> + __RTE_RISCV_WRS_NTO();
> + __RTE_RISCV_LR_16(addr, value, memorder);
> + }
> +}
> +
> +static __rte_always_inline void
> +rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
> + int memorder)
> +{
> + uint32_t value;
> +
> + RTE_ASSERT(memorder == rte_memory_order_acquire ||
> + memorder == rte_memory_order_relaxed);
> + RTE_ASSERT(((size_t)addr & 3) == 0);
> +
> + __RTE_RISCV_LR_32(addr, value, memorder);
> + while (value != expected) {
> + __RTE_RISCV_WRS_NTO();
> + __RTE_RISCV_LR_32(addr, value, memorder);
> + }
> +}
> +
> +static __rte_always_inline void
> +rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
> + int memorder)
> +{
> + uint64_t value;
> +
> + RTE_ASSERT(memorder == rte_memory_order_acquire ||
> + memorder == rte_memory_order_relaxed);
> + RTE_ASSERT(((size_t)addr & 7) == 0);
> +
> + __RTE_RISCV_LR_64(addr, value, memorder);
> + while (value != expected) {
> + __RTE_RISCV_WRS_NTO();
> + __RTE_RISCV_LR_64(addr, value, memorder);
> + }
> +}
> +
> +#define RTE_WAIT_UNTIL_MASKED(addr, mask, cond, expected, memorder) do { \
> + RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder)); \
> + RTE_BUILD_BUG_ON(memorder != rte_memory_order_acquire && \
> + memorder != rte_memory_order_relaxed); \
> + RTE_ASSERT(((size_t)(addr) & (sizeof(*(addr)) - 1)) != 0); \
> + const uint32_t size = sizeof(*(addr)) << 3; \
> + typeof(*(addr)) expected_value = (expected); \
> + typeof(*(addr)) value; \
> + __RTE_RISCV_LR((addr), value, memorder, size); \
> + while (!((value & (mask)) cond expected_value)) { \
> + __RTE_RISCV_WRS_NTO(); \
> + __RTE_RISCV_LR((addr), value, memorder, size); \
> + } \
> +} while (0)
> +
> +#endif /* RTE_RISCV_ZAWRS */
> +
> #ifdef __cplusplus
> }
> #endif
Reviewed-by: Stanislaw Kardach <stanislaw.kardach@gmail.com>
> --
> 2.39.2
>
> ]
>
> ## SoC-specific options.
> diff --git a/lib/eal/riscv/include/rte_pause.h b/lib/eal/riscv/include/rte_pause.h
> index cb8e9ca52d..e7b43dffa3 100644
> --- a/lib/eal/riscv/include/rte_pause.h
> +++ b/lib/eal/riscv/include/rte_pause.h
> @@ -11,6 +11,12 @@
> extern "C" {
> #endif
>
> +#ifdef RTE_RISCV_ZAWRS
> +#define RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED
> +#endif
> +
> +#include <rte_debug.h>
> +
> #include "rte_atomic.h"
>
> #include "generic/rte_pause.h"
> @@ -24,6 +30,139 @@ static inline void rte_pause(void)
> asm volatile(".int 0x0100000F" : : : "memory");
> }
>
> +#ifdef RTE_RISCV_ZAWRS
> +
> +/*
> + * Atomic load from an address, it returns either a sign-extended word or
> + * doubleword and creates a 'reservation set' containing the read memory
> + * location. When someone else writes to the reservation set, it is invalidated,
> + * causing any stalled WRS instructions to resume.
> + *
> + * Address needs to be naturally aligned.
> + */
> +#define __RTE_RISCV_LR_32(src, dst, memorder) do { \
> + if ((memorder) == rte_memory_order_relaxed) { \
> + asm volatile("lr.w %0, (%1)" \
> + : "=r" (dst) \
> + : "r" (src) \
> + : "memory"); \
> + } else { \
> + asm volatile("lr.w.aq %0, (%1)" \
> + : "=r" (dst) \
> + : "r" (src) \
> + : "memory"); \
> + } } while (0)
> +#define __RTE_RISCV_LR_64(src, dst, memorder) do { \
> + if ((memorder) == rte_memory_order_relaxed) { \
> + asm volatile("lr.d %0, (%1)" \
> + : "=r" (dst) \
> + : "r" (src) \
> + : "memory"); \
> + } else { \
> + asm volatile("lr.d.aq %0, (%1)" \
> + : "=r" (dst) \
> + : "r" (src) \
> + : "memory"); \
> + } } while (0)
> +
> +/*
> + * There's not a RISC-V atomic load primitive for halfwords, so cast up to a
> + * _naturally aligned_ word and extract the halfword we want
> + */
> +#define __RTE_RISCV_LR_16(src, dst, memorder) do { \
> + uint32_t word; \
> + __RTE_RISCV_LR_32(((uintptr_t)(src) & (~3)), word, (memorder)); \
> + if ((size_t)(src) & 3) \
> + (dst) = (typeof(dst))(word >> 16); \
> + else \
> + (dst) = (typeof(dst))(word & 0xFFFF); \
> +} while (0)
> +
> +#define __RTE_RISCV_LR(src, dst, memorder, size) { \
> + RTE_BUILD_BUG_ON(size != 16 && size != 32 && size != 64); \
> + if (size == 16) \
> + __RTE_RISCV_LR_16(src, dst, memorder); \
> + else if (size == 32) \
> + __RTE_RISCV_LR_32(src, dst, memorder); \
> + else if (size == 64) \
> + __RTE_RISCV_LR_64(src, dst, memorder); \
> +}
> +
> +/*
> + * Wait-on-Reservation-Set extension instruction, it stalls execution until the
> + * reservation set is invalidated or an interrupt is observed.
> + * A loop is likely still needed as it may stop stalling arbitrarily.
> + */
> +#define __RTE_RISCV_WRS_NTO() { asm volatile("wrs.nto" : : : "memory"); }
> +
> +static __rte_always_inline void
> +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> + int memorder)
> +{
> + uint16_t value;
> +
> + RTE_ASSERT(memorder == rte_memory_order_acquire ||
> + memorder == rte_memory_order_relaxed);
> + RTE_ASSERT(((size_t)addr & 1) == 0);
> +
> + __RTE_RISCV_LR_16(addr, value, memorder);
> + while (value != expected) {
> + __RTE_RISCV_WRS_NTO();
> + __RTE_RISCV_LR_16(addr, value, memorder);
> + }
> +}
> +
> +static __rte_always_inline void
> +rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
> + int memorder)
> +{
> + uint32_t value;
> +
> + RTE_ASSERT(memorder == rte_memory_order_acquire ||
> + memorder == rte_memory_order_relaxed);
> + RTE_ASSERT(((size_t)addr & 3) == 0);
> +
> + __RTE_RISCV_LR_32(addr, value, memorder);
> + while (value != expected) {
> + __RTE_RISCV_WRS_NTO();
> + __RTE_RISCV_LR_32(addr, value, memorder);
> + }
> +}
> +
> +static __rte_always_inline void
> +rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
> + int memorder)
> +{
> + uint64_t value;
> +
> + RTE_ASSERT(memorder == rte_memory_order_acquire ||
> + memorder == rte_memory_order_relaxed);
> + RTE_ASSERT(((size_t)addr & 7) == 0);
> +
> + __RTE_RISCV_LR_64(addr, value, memorder);
> + while (value != expected) {
> + __RTE_RISCV_WRS_NTO();
> + __RTE_RISCV_LR_64(addr, value, memorder);
> + }
> +}
> +
> +#define RTE_WAIT_UNTIL_MASKED(addr, mask, cond, expected, memorder) do { \
> + RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder)); \
> + RTE_BUILD_BUG_ON(memorder != rte_memory_order_acquire && \
> + memorder != rte_memory_order_relaxed); \
> + RTE_ASSERT(((size_t)(addr) & (sizeof(*(addr)) - 1)) != 0); \
> + const uint32_t size = sizeof(*(addr)) << 3; \
> + typeof(*(addr)) expected_value = (expected); \
> + typeof(*(addr)) value; \
> + __RTE_RISCV_LR((addr), value, memorder, size); \
> + while (!((value & (mask)) cond expected_value)) { \
> + __RTE_RISCV_WRS_NTO(); \
> + __RTE_RISCV_LR((addr), value, memorder, size); \
> + } \
> +} while (0)
> +
> +#endif /* RTE_RISCV_ZAWRS */
> +
> #ifdef __cplusplus
> }
> #endif
> --
> 2.39.2
>
On Sun, May 12, 2024 at 09:10:49AM +0200, Stanisław Kardach wrote:
> On Thu, May 2, 2024 at 4:44 PM Daniel Gregory
> <daniel.gregory@bytedance.com> wrote:
> > diff --git a/config/riscv/meson.build b/config/riscv/meson.build
> > index 07d7d9da23..4cfdc42ecb 100644
> > --- a/config/riscv/meson.build
> > +++ b/config/riscv/meson.build
> > @@ -26,6 +26,11 @@ flags_common = [
> > # read from /proc/device-tree/cpus/timebase-frequency. This property is
> > # guaranteed on Linux, as riscv time_init() requires it.
> > ['RTE_RISCV_TIME_FREQ', 0],
> > +
> > + # Enable use of RISC-V Wait-on-Reservation-Set extension (Zawrs)
> > + # Mitigates looping when polling on memory locations
> > + # Make sure to add '_zawrs' to your target's -march below
> > + ['RTE_RISCV_ZAWRS', false]
> A bit orthogonal to this patch (or maybe not?)
> Should we perhaps add a Qemu target in meson.build which would have
> the modified -march for what qemu supports now?
Yes, I can see that being worth doing as part of this patch. In addition
to Zawrs for this patch, GCC 13+ should generate prefetch instructions
for __builtin_prefetch() (lib/eal/include/generic/rte_prefetch.h) if the
Zicbop extension is enabled. Any more in particular you think would
benefit or would it be best to add every extension GCC 14 supports?
> Or perhaps add machine detection logic based either on the "riscv,isa"
> cpu@0 property in the DT or RHCT ACPI table?
I have had a look and, at least on QEMU 9, this seems non-trivial. The
RHCT acpi table at /proc/cpuinfo doesn't list every extension present
(eg. it's missing Zawrs), and the DT, whilst complete, can't be fed
directly into GCC because QEMU reports several newer and non-ratified
extensions that GCC doesn't support yet.
> Or add perhaps some other way we could specify the extension list
> suffix for -march?
Setting -Dcpu_instruction_set to some arbitrary ISA could work with
somes minor changes to the build script to not discard it in favour of
rv64gc. Then, we could add a map from ISA extensions to flags that are
enabled when that extension is present in cpu_instruction_set?
Thanks for your review,
Daniel
Stanisław Kardach <stanislaw.kardach@gmail.com> writes:
> On Thu, May 2, 2024 at 4:44 PM Daniel Gregory
> <daniel.gregory@bytedance.com> wrote:
>>
>> The zawrs extension adds a pair of instructions that stall a core until
>> a memory location is written to. This patch uses one of them to
>> implement RISCV-specific versions of the rte_wait_until_equal_*
>> functions. This is potentially more energy efficient than the default
>> implementation that uses rte_pause/Zihintpause.
>>
>> The technique works as follows:
>>
>> * Create a reservation set containing the address we want to wait on
>> using an atomic load (lr.dw)
>> * Call wrs.nto - this blocks until the reservation set is invalidated by
>> someone else writing to that address
>> * Execution can also resume arbitrarily, so we still need to check
>> whether a change occurred and loop if not
>>
>> Due to RISC-V atomics only supporting naturally aligned word (32 bit)
>> and double word (64 bit) loads, I've used pointer rounding and bit
>> shifting to implement waiting on 16-bit values.
>>
>> This new functionality is controlled by a Meson flag that is disabled by
>> default.
>>
>> Signed-off-by: Daniel Gregory <daniel.gregory@bytedance.com>
>> Suggested-by: Punit Agrawal <punit.agrawal@bytedance.com>
>> ---
>>
>> Posting as an RFC to get early feedback and enable testing by others
>> with Zawrs-enabled hardware. Whilst I have been able to test it compiles
>> & passes tests using QEMU, I am waiting on some Zawrs-enabled hardware
>> to become available before I carry out performance tests.
>>
>> Nonetheless, I would be glad to hear any feedback on the general
>> approach. Thanks, Daniel
>>
>> config/riscv/meson.build | 5 ++
>> lib/eal/riscv/include/rte_pause.h | 139 ++++++++++++++++++++++++++++++
>> 2 files changed, 144 insertions(+)
>>
>> diff --git a/config/riscv/meson.build b/config/riscv/meson.build
>> index 07d7d9da23..4cfdc42ecb 100644
>> --- a/config/riscv/meson.build
>> +++ b/config/riscv/meson.build
>> @@ -26,6 +26,11 @@ flags_common = [
>> # read from /proc/device-tree/cpus/timebase-frequency. This property is
>> # guaranteed on Linux, as riscv time_init() requires it.
>> ['RTE_RISCV_TIME_FREQ', 0],
>> +
>> + # Enable use of RISC-V Wait-on-Reservation-Set extension (Zawrs)
>> + # Mitigates looping when polling on memory locations
>> + # Make sure to add '_zawrs' to your target's -march below
>> + ['RTE_RISCV_ZAWRS', false]
> A bit orthogonal to this patch (or maybe not?)
> Should we perhaps add a Qemu target in meson.build which would have
> the modified -march for what qemu supports now?
> Or perhaps add machine detection logic based either on the "riscv,isa"
> cpu@0 property in the DT or RHCT ACPI table?
Compile time feature detection doesn't add a lot of benefit - it doesn't
work in cross builds environments - which is the common way things are
built for RISC-V at the moment. Also it doesn't work for distros where a
single build is used across a broad set of machines.
> Or add perhaps some other way we could specify the extension list
> suffix for -march?
Making it easier to specify the required extensions during the build
does make sense. Though this is an orthogonal change and is better done
in follow-on patches.
[...]
@@ -26,6 +26,11 @@ flags_common = [
# read from /proc/device-tree/cpus/timebase-frequency. This property is
# guaranteed on Linux, as riscv time_init() requires it.
['RTE_RISCV_TIME_FREQ', 0],
+
+ # Enable use of RISC-V Wait-on-Reservation-Set extension (Zawrs)
+ # Mitigates looping when polling on memory locations
+ # Make sure to add '_zawrs' to your target's -march below
+ ['RTE_RISCV_ZAWRS', false]
]
## SoC-specific options.
@@ -11,6 +11,12 @@
extern "C" {
#endif
+#ifdef RTE_RISCV_ZAWRS
+#define RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED
+#endif
+
+#include <rte_debug.h>
+
#include "rte_atomic.h"
#include "generic/rte_pause.h"
@@ -24,6 +30,139 @@ static inline void rte_pause(void)
asm volatile(".int 0x0100000F" : : : "memory");
}
+#ifdef RTE_RISCV_ZAWRS
+
+/*
+ * Atomic load from an address, it returns either a sign-extended word or
+ * doubleword and creates a 'reservation set' containing the read memory
+ * location. When someone else writes to the reservation set, it is invalidated,
+ * causing any stalled WRS instructions to resume.
+ *
+ * Address needs to be naturally aligned.
+ */
+#define __RTE_RISCV_LR_32(src, dst, memorder) do { \
+ if ((memorder) == rte_memory_order_relaxed) { \
+ asm volatile("lr.w %0, (%1)" \
+ : "=r" (dst) \
+ : "r" (src) \
+ : "memory"); \
+ } else { \
+ asm volatile("lr.w.aq %0, (%1)" \
+ : "=r" (dst) \
+ : "r" (src) \
+ : "memory"); \
+ } } while (0)
+#define __RTE_RISCV_LR_64(src, dst, memorder) do { \
+ if ((memorder) == rte_memory_order_relaxed) { \
+ asm volatile("lr.d %0, (%1)" \
+ : "=r" (dst) \
+ : "r" (src) \
+ : "memory"); \
+ } else { \
+ asm volatile("lr.d.aq %0, (%1)" \
+ : "=r" (dst) \
+ : "r" (src) \
+ : "memory"); \
+ } } while (0)
+
+/*
+ * There's not a RISC-V atomic load primitive for halfwords, so cast up to a
+ * _naturally aligned_ word and extract the halfword we want
+ */
+#define __RTE_RISCV_LR_16(src, dst, memorder) do { \
+ uint32_t word; \
+ __RTE_RISCV_LR_32(((uintptr_t)(src) & (~3)), word, (memorder)); \
+ if ((size_t)(src) & 3) \
+ (dst) = (typeof(dst))(word >> 16); \
+ else \
+ (dst) = (typeof(dst))(word & 0xFFFF); \
+} while (0)
+
+#define __RTE_RISCV_LR(src, dst, memorder, size) { \
+ RTE_BUILD_BUG_ON(size != 16 && size != 32 && size != 64); \
+ if (size == 16) \
+ __RTE_RISCV_LR_16(src, dst, memorder); \
+ else if (size == 32) \
+ __RTE_RISCV_LR_32(src, dst, memorder); \
+ else if (size == 64) \
+ __RTE_RISCV_LR_64(src, dst, memorder); \
+}
+
+/*
+ * Wait-on-Reservation-Set extension instruction, it stalls execution until the
+ * reservation set is invalidated or an interrupt is observed.
+ * A loop is likely still needed as it may stop stalling arbitrarily.
+ */
+#define __RTE_RISCV_WRS_NTO() { asm volatile("wrs.nto" : : : "memory"); }
+
+static __rte_always_inline void
+rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
+ int memorder)
+{
+ uint16_t value;
+
+ RTE_ASSERT(memorder == rte_memory_order_acquire ||
+ memorder == rte_memory_order_relaxed);
+ RTE_ASSERT(((size_t)addr & 1) == 0);
+
+ __RTE_RISCV_LR_16(addr, value, memorder);
+ while (value != expected) {
+ __RTE_RISCV_WRS_NTO();
+ __RTE_RISCV_LR_16(addr, value, memorder);
+ }
+}
+
+static __rte_always_inline void
+rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
+ int memorder)
+{
+ uint32_t value;
+
+ RTE_ASSERT(memorder == rte_memory_order_acquire ||
+ memorder == rte_memory_order_relaxed);
+ RTE_ASSERT(((size_t)addr & 3) == 0);
+
+ __RTE_RISCV_LR_32(addr, value, memorder);
+ while (value != expected) {
+ __RTE_RISCV_WRS_NTO();
+ __RTE_RISCV_LR_32(addr, value, memorder);
+ }
+}
+
+static __rte_always_inline void
+rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
+ int memorder)
+{
+ uint64_t value;
+
+ RTE_ASSERT(memorder == rte_memory_order_acquire ||
+ memorder == rte_memory_order_relaxed);
+ RTE_ASSERT(((size_t)addr & 7) == 0);
+
+ __RTE_RISCV_LR_64(addr, value, memorder);
+ while (value != expected) {
+ __RTE_RISCV_WRS_NTO();
+ __RTE_RISCV_LR_64(addr, value, memorder);
+ }
+}
+
+#define RTE_WAIT_UNTIL_MASKED(addr, mask, cond, expected, memorder) do { \
+ RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder)); \
+ RTE_BUILD_BUG_ON(memorder != rte_memory_order_acquire && \
+ memorder != rte_memory_order_relaxed); \
+ RTE_ASSERT(((size_t)(addr) & (sizeof(*(addr)) - 1)) != 0); \
+ const uint32_t size = sizeof(*(addr)) << 3; \
+ typeof(*(addr)) expected_value = (expected); \
+ typeof(*(addr)) value; \
+ __RTE_RISCV_LR((addr), value, memorder, size); \
+ while (!((value & (mask)) cond expected_value)) { \
+ __RTE_RISCV_WRS_NTO(); \
+ __RTE_RISCV_LR((addr), value, memorder, size); \
+ } \
+} while (0)
+
+#endif /* RTE_RISCV_ZAWRS */
+
#ifdef __cplusplus
}
#endif