[v11,2/5] eal: add the APIs to wait until equal

Message ID: 1572180765-49767-3-git-send-email-gavin.hu@arm.com (mailing list archive)
State: Superseded, archived
Delegated to: David Marchand
Series: use WFE for aarch64

Checks

Context               Check     Description
ci/checkpatch         success   coding style OK
ci/Intel-compilation  fail      Compilation issues

Commit Message

Gavin Hu Oct. 27, 2019, 12:52 p.m. UTC
  The rte_wait_until_equal_xx APIs abstract the functionality of
'polling for a memory location to become equal to a given value'.

Add the RTE_ARM_USE_WFE configuration entry for aarch64, disabled
by default. When it is enabled, the above APIs will use the WFE
instruction to save CPU cycles and power.

When these APIs are called from a VM on aarch64, WFE may trap in and
out of the hypervisor to release vCPUs, which causes high exit latency.
Since kernel 4.18.20, an adaptive trapping mechanism has been introduced
to balance the latency and workload.

Signed-off-by: Gavin Hu <gavin.hu@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Steve Capper <steve.capper@arm.com>
Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Reviewed-by: Phil Yang <phil.yang@arm.com>
Acked-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
---
 config/arm/meson.build                             |   1 +
 config/common_base                                 |   5 +
 .../common/include/arch/arm/rte_pause_64.h         | 188 +++++++++++++++++++++
 lib/librte_eal/common/include/generic/rte_pause.h  |  99 +++++++++++
 4 files changed, 293 insertions(+)
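
For illustration, a minimal sketch of how a caller might use these APIs
(the dev_started flag and the two helpers below are hypothetical, not
part of the patch):

#include <rte_pause.h>

/* Hypothetical flag published by another lcore. */
static volatile uint32_t dev_started;

/* Consumer: blocks (in a low power state when WFE is enabled) until
 * the producer stores 1 with release ordering. */
static void
wait_for_start(void)
{
	rte_wait_until_equal_32(&dev_started, 1, __ATOMIC_ACQUIRE);
}

/* Producer, running on another lcore. */
static void
signal_start(void)
{
	__atomic_store_n(&dev_started, 1, __ATOMIC_RELEASE);
}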
  

Comments

David Marchand Oct. 27, 2019, 8:49 p.m. UTC | #1
On Sun, Oct 27, 2019 at 1:53 PM Gavin Hu <gavin.hu@arm.com> wrote:

[snip]

> diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> index 93895d3..1680d7a 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h

[snip]

> @@ -17,6 +23,188 @@ static inline void rte_pause(void)
>         asm volatile("yield" ::: "memory");
>  }
>
> +/**
> + * Send an event to quit WFE.
> + */
> +static inline void rte_sevl(void);
> +
> +/**
> + * Put processor into low power WFE(Wait For Event) state
> + */
> +static inline void rte_wfe(void);
> +
> +#ifdef RTE_ARM_USE_WFE
> +static inline void rte_sevl(void)
> +{
> +       asm volatile("sevl" : : : "memory");
> +}
> +
> +static inline void rte_wfe(void)
> +{
> +       asm volatile("wfe" : : : "memory");
> +}
> +#else
> +static inline void rte_sevl(void)
> +{
> +}
> +static inline void rte_wfe(void)
> +{
> +       rte_pause();
> +}
> +#endif
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice

experimental?
Just complaining on the principle, you missed the __rte_experimental
in such a case.
But this API is a no go for me, see below.
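
For reference, the missing annotation would sit on the declaration, in
the same style the generic header in this patch already uses (a sketch,
not the final form):

__rte_experimental
static __rte_always_inline uint16_t
rte_atomic_load_ex_16(volatile uint16_t *addr, int memorder);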


> + *
> + * Atomic exclusive load from addr, it returns the 16-bit content of *addr
> + * while making it 'monitored',when it is written by someone else, the
> + * 'monitored' state is cleared and a event is generated implicitly to exit
> + * WFE.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param memorder
> + *  The valid memory order variants are __ATOMIC_ACQUIRE and __ATOMIC_RELAXED.
> + *  These map to C++11 memory orders with the same names, see the C++11 standard
> + *  the GCC wiki on atomic synchronization for detailed definitions.
> + */
> +static __rte_always_inline uint16_t
> +rte_atomic_load_ex_16(volatile uint16_t *addr, int memorder);

This API does not make sense for anything but arm, so this prefix is not good.

On arm, when RTE_ARM_USE_WFE is undefined, why would you need it?
A non exclusive load is enough since you don't want to use wfe.
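
A sketch of the split being suggested here (the double-underscore
internal name is an assumption; the asm is taken from the patch): the
exclusive load only exists when WFE is in use, and the non-WFE build
falls back to the generic rte_wait_until_equal_*() path:

#ifdef RTE_ARM_USE_WFE
#define RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED

static inline void rte_sevl(void)
{
	asm volatile("sevl" : : : "memory");
}

static inline void rte_wfe(void)
{
	asm volatile("wfe" : : : "memory");
}

/* Exclusive load: arms the monitor so that a remote store to *addr
 * generates the event that wakes up WFE. */
static __rte_always_inline uint16_t
__atomic_load_ex_16(volatile uint16_t *addr, int memorder)
{
	uint16_t tmp;

	assert((memorder == __ATOMIC_ACQUIRE)
			|| (memorder == __ATOMIC_RELAXED));
	if (memorder == __ATOMIC_ACQUIRE)
		asm volatile("ldaxrh %w[tmp], [%x[addr]]"
			: [tmp] "=&r" (tmp)
			: [addr] "r" (addr)
			: "memory");
	else
		asm volatile("ldxrh %w[tmp], [%x[addr]]"
			: [tmp] "=&r" (tmp)
			: [addr] "r" (addr)
			: "memory");
	return tmp;
}
#endif /* RTE_ARM_USE_WFE */

/* Without RTE_ARM_USE_WFE, RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED stays
 * undefined and the plain __atomic_load_n()/rte_pause() fallback in
 * generic/rte_pause.h is used, so no exclusive load is needed. */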

[snip]

> +
> +static __rte_always_inline uint16_t
> +rte_atomic_load_ex_16(volatile uint16_t *addr, int memorder)
> +{
> +       uint16_t tmp;
> +       assert((memorder == __ATOMIC_ACQUIRE)
> +                       || (memorder == __ATOMIC_RELAXED));
> +       if (memorder == __ATOMIC_ACQUIRE)
> +               asm volatile("ldaxrh %w[tmp], [%x[addr]]"
> +                       : [tmp] "=&r" (tmp)
> +                       : [addr] "r"(addr)
> +                       : "memory");
> +       else if (memorder == __ATOMIC_RELAXED)
> +               asm volatile("ldxrh %w[tmp], [%x[addr]]"
> +                       : [tmp] "=&r" (tmp)
> +                       : [addr] "r"(addr)
> +                       : "memory");
> +       return tmp;
> +}
> +
> +static __rte_always_inline uint32_t
> +rte_atomic_load_ex_32(volatile uint32_t *addr, int memorder)
> +{
> +       uint32_t tmp;
> +       assert((memorder == __ATOMIC_ACQUIRE)
> +                       || (memorder == __ATOMIC_RELAXED));
> +       if (memorder == __ATOMIC_ACQUIRE)
> +               asm volatile("ldaxr %w[tmp], [%x[addr]]"
> +                       : [tmp] "=&r" (tmp)
> +                       : [addr] "r"(addr)
> +                       : "memory");
> +       else if (memorder == __ATOMIC_RELAXED)
> +               asm volatile("ldxr %w[tmp], [%x[addr]]"
> +                       : [tmp] "=&r" (tmp)
> +                       : [addr] "r"(addr)
> +                       : "memory");
> +       return tmp;
> +}
> +
> +static __rte_always_inline uint64_t
> +rte_atomic_load_ex_64(volatile uint64_t *addr, int memorder)
> +{
> +       uint64_t tmp;
> +       assert((memorder == __ATOMIC_ACQUIRE)
> +                       || (memorder == __ATOMIC_RELAXED));
> +       if (memorder == __ATOMIC_ACQUIRE)
> +               asm volatile("ldaxr %x[tmp], [%x[addr]]"
> +                       : [tmp] "=&r" (tmp)
> +                       : [addr] "r"(addr)
> +                       : "memory");
> +       else if (memorder == __ATOMIC_RELAXED)
> +               asm volatile("ldxr %x[tmp], [%x[addr]]"
> +                       : [tmp] "=&r" (tmp)
> +                       : [addr] "r"(addr)
> +                       : "memory");
> +       return tmp;
> +}
> +
> +#ifdef RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED
> +static __rte_always_inline void
> +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> +int memorder)
> +{
> +       if (__atomic_load_n(addr, memorder) != expected) {
> +               rte_sevl();
> +               do {
> +                       rte_wfe();


We are in the RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED case.
rte_wfe() is always asm volatile("wfe" : : : "memory");


> +               } while (rte_atomic_load_ex_16(addr, memorder) != expected);
> +       }
> +}
> +
> +static __rte_always_inline void
> +rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
> +int memorder)
> +{
> +       if (__atomic_load_n(addr, memorder) != expected) {
> +               rte_sevl();
> +               do {
> +                       rte_wfe();
> +               } while (__atomic_load_n(addr, memorder) != expected);
> +       }
> +}

The while() should be with an exclusive load.
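
A sketch of what that would look like, reusing the patch's own
exclusive load helper:

static __rte_always_inline void
rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
int memorder)
{
	if (__atomic_load_n(addr, memorder) != expected) {
		rte_sevl();
		do {
			rte_wfe();
		/* The exclusive load in the condition re-arms the monitor,
		 * so the next wfe is woken by a store to *addr. */
		} while (rte_atomic_load_ex_32(addr, memorder) != expected);
	}
}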


I will submit a v12 with those comments addressed so that we move
forward for rc2.
But it won't make it in rc1, sorry.
  
Ananyev, Konstantin Oct. 27, 2019, 10:19 p.m. UTC | #2
> The rte_wait_until_equal_xx APIs abstract the functionality of
> 'polling for a memory location to become equal to a given value'.
> 
> Add the RTE_ARM_USE_WFE configuration entry for aarch64, disabled
> by default. When it is enabled, the above APIs will call WFE instruction
> to save CPU cycles and power.
> 
> From a VM, when calling this API on aarch64, it may trap in and out to
> release vCPUs whereas cause high exit latency. Since kernel 4.18.20 an
> adaptive trapping mechanism is introduced to balance the latency and
> workload.
> 
> Signed-off-by: Gavin Hu <gavin.hu@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Steve Capper <steve.capper@arm.com>
> Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Reviewed-by: Phil Yang <phil.yang@arm.com>
> Acked-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> Acked-by: Jerin Jacob <jerinj@marvell.com>
> ---
>  config/arm/meson.build                             |   1 +
>  config/common_base                                 |   5 +
>  .../common/include/arch/arm/rte_pause_64.h         | 188 +++++++++++++++++++++
>  lib/librte_eal/common/include/generic/rte_pause.h  |  99 +++++++++++
>  4 files changed, 293 insertions(+)
> 
> diff --git a/config/arm/meson.build b/config/arm/meson.build
> index 979018e..b4b4cac 100644
> --- a/config/arm/meson.build
> +++ b/config/arm/meson.build
> @@ -26,6 +26,7 @@ flags_common_default = [
>  	['RTE_LIBRTE_AVP_PMD', false],
> 
>  	['RTE_SCHED_VECTOR', false],
> +	['RTE_ARM_USE_WFE', false],
>  ]
> 
>  flags_generic = [
> diff --git a/config/common_base b/config/common_base
> index e843a21..c812156 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -111,6 +111,11 @@ CONFIG_RTE_MAX_VFIO_CONTAINERS=64
>  CONFIG_RTE_MALLOC_DEBUG=n
>  CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
>  CONFIG_RTE_USE_LIBBSD=n
> +# Use WFE instructions to implement the rte_wait_for_equal_xxx APIs,
> +# calling these APIs put the cores in low power state while waiting
> +# for the memory address to become equal to the expected value.
> +# This is supported only by aarch64.
> +CONFIG_RTE_ARM_USE_WFE=n
> 
>  #
>  # Recognize/ignore the AVX/AVX512 CPU flags for performance/power testing.
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> index 93895d3..1680d7a 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> @@ -1,5 +1,6 @@
>  /* SPDX-License-Identifier: BSD-3-Clause
>   * Copyright(c) 2017 Cavium, Inc
> + * Copyright(c) 2019 Arm Limited
>   */
> 
>  #ifndef _RTE_PAUSE_ARM64_H_
> @@ -10,6 +11,11 @@ extern "C" {
>  #endif
> 
>  #include <rte_common.h>
> +
> +#ifdef RTE_ARM_USE_WFE
> +#define RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED
> +#endif
> +
>  #include "generic/rte_pause.h"
> 
>  static inline void rte_pause(void)
> @@ -17,6 +23,188 @@ static inline void rte_pause(void)
>  	asm volatile("yield" ::: "memory");
>  }
> 
> +/**
> + * Send an event to quit WFE.
> + */
> +static inline void rte_sevl(void);
> +
> +/**
> + * Put processor into low power WFE(Wait For Event) state
> + */
> +static inline void rte_wfe(void);
> +
> +#ifdef RTE_ARM_USE_WFE
> +static inline void rte_sevl(void)
> +{
> +	asm volatile("sevl" : : : "memory");
> +}
> +
> +static inline void rte_wfe(void)
> +{
> +	asm volatile("wfe" : : : "memory");
> +}
> +#else
> +static inline void rte_sevl(void)
> +{
> +}
> +static inline void rte_wfe(void)
> +{
> +	rte_pause();
> +}
> +#endif
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
> + *
> + * Atomic exclusive load from addr, it returns the 16-bit content of *addr
> + * while making it 'monitored',when it is written by someone else, the
> + * 'monitored' state is cleared and a event is generated implicitly to exit
> + * WFE.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param memorder
> + *  The valid memory order variants are __ATOMIC_ACQUIRE and __ATOMIC_RELAXED.
> + *  These map to C++11 memory orders with the same names, see the C++11 standard
> + *  the GCC wiki on atomic synchronization for detailed definitions.
> + */
> +static __rte_always_inline uint16_t
> +rte_atomic_load_ex_16(volatile uint16_t *addr, int memorder);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
> + *
> + * Atomic exclusive load from addr, it returns the 32-bit content of *addr
> + * while making it 'monitored',when it is written by someone else, the
> + * 'monitored' state is cleared and a event is generated implicitly to exit
> + * WFE.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param memorder
> + *  The valid memory order variants are __ATOMIC_ACQUIRE and __ATOMIC_RELAXED.
> + *  These map to C++11 memory orders with the same names, see the C++11 standard
> + *  the GCC wiki on atomic synchronization for detailed definitions.
> + */
> +static __rte_always_inline uint32_t
> +rte_atomic_load_ex_32(volatile uint32_t *addr, int memorder);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
> + *
> + * Atomic exclusive load from addr, it returns the 64-bit content of *addr
> + * while making it 'monitored',when it is written by someone else, the
> + * 'monitored' state is cleared and a event is generated implicitly to exit
> + * WFE.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param memorder
> + *  The valid memory order variants are __ATOMIC_ACQUIRE and __ATOMIC_RELAXED.
> + *  These map to C++11 memory orders with the same names, see the C++11 standard
> + *  the GCC wiki on atomic synchronization for detailed definitions.
> + */
> +static __rte_always_inline uint64_t
> +rte_atomic_load_ex_64(volatile uint64_t *addr, int memorder);
> +
> +static __rte_always_inline uint16_t
> +rte_atomic_load_ex_16(volatile uint16_t *addr, int memorder)
> +{
> +	uint16_t tmp;
> +	assert((memorder == __ATOMIC_ACQUIRE)
> +			|| (memorder == __ATOMIC_RELAXED));
> +	if (memorder == __ATOMIC_ACQUIRE)
> +		asm volatile("ldaxrh %w[tmp], [%x[addr]]"
> +			: [tmp] "=&r" (tmp)
> +			: [addr] "r"(addr)
> +			: "memory");
> +	else if (memorder == __ATOMIC_RELAXED)
> +		asm volatile("ldxrh %w[tmp], [%x[addr]]"
> +			: [tmp] "=&r" (tmp)
> +			: [addr] "r"(addr)
> +			: "memory");
> +	return tmp;
> +}
> +
> +static __rte_always_inline uint32_t
> +rte_atomic_load_ex_32(volatile uint32_t *addr, int memorder)
> +{
> +	uint32_t tmp;
> +	assert((memorder == __ATOMIC_ACQUIRE)
> +			|| (memorder == __ATOMIC_RELAXED));
> +	if (memorder == __ATOMIC_ACQUIRE)
> +		asm volatile("ldaxr %w[tmp], [%x[addr]]"
> +			: [tmp] "=&r" (tmp)
> +			: [addr] "r"(addr)
> +			: "memory");
> +	else if (memorder == __ATOMIC_RELAXED)
> +		asm volatile("ldxr %w[tmp], [%x[addr]]"
> +			: [tmp] "=&r" (tmp)
> +			: [addr] "r"(addr)
> +			: "memory");
> +	return tmp;
> +}
> +
> +static __rte_always_inline uint64_t
> +rte_atomic_load_ex_64(volatile uint64_t *addr, int memorder)
> +{
> +	uint64_t tmp;
> +	assert((memorder == __ATOMIC_ACQUIRE)
> +			|| (memorder == __ATOMIC_RELAXED));
> +	if (memorder == __ATOMIC_ACQUIRE)
> +		asm volatile("ldaxr %x[tmp], [%x[addr]]"
> +			: [tmp] "=&r" (tmp)
> +			: [addr] "r"(addr)
> +			: "memory");
> +	else if (memorder == __ATOMIC_RELAXED)
> +		asm volatile("ldxr %x[tmp], [%x[addr]]"
> +			: [tmp] "=&r" (tmp)
> +			: [addr] "r"(addr)
> +			: "memory");
> +	return tmp;
> +}
> +
> +#ifdef RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED
> +static __rte_always_inline void
> +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> +int memorder)
> +{
> +	if (__atomic_load_n(addr, memorder) != expected) {
> +		rte_sevl();
> +		do {
> +			rte_wfe();
> +		} while (rte_atomic_load_ex_16(addr, memorder) != expected);
> +	}
> +}
> +
> +static __rte_always_inline void
> +rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
> +int memorder)
> +{
> +	if (__atomic_load_n(addr, memorder) != expected) {
> +		rte_sevl();
> +		do {
> +			rte_wfe();
> +		} while (__atomic_load_n(addr, memorder) != expected);

Here and in _64, shouldn't it be:
rte_atomic_load_ex_..
?

> +	}
> +}
> +
> +static __rte_always_inline void
> +rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
> +int memorder)
> +{
> +	if (__atomic_load_n(addr, memorder) != expected) {
> +		rte_sevl();
> +		do {
> +			rte_wfe();
> +		} while (__atomic_load_n(addr, memorder) != expected);
> +	}
> +}
> +#endif
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/common/include/generic/rte_pause.h b/lib/librte_eal/common/include/generic/rte_pause.h
> index 52bd4db..9d42e32 100644
> --- a/lib/librte_eal/common/include/generic/rte_pause.h
> +++ b/lib/librte_eal/common/include/generic/rte_pause.h
> @@ -1,5 +1,6 @@
>  /* SPDX-License-Identifier: BSD-3-Clause
>   * Copyright(c) 2017 Cavium, Inc
> + * Copyright(c) 2019 Arm Limited
>   */
> 
>  #ifndef _RTE_PAUSE_H_
> @@ -12,6 +13,12 @@
>   *
>   */
> 
> +#include <stdint.h>
> +#include <rte_common.h>
> +#include <rte_atomic.h>
> +#include <rte_compat.h>
> +#include <assert.h>
> +
>  /**
>   * Pause CPU execution for a short while
>   *
> @@ -20,4 +27,96 @@
>   */
>  static inline void rte_pause(void);
> 
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
> + *
> + * Wait for *addr to be updated with a 16-bit expected value, with a relaxed
> + * memory ordering model meaning the loads around this API can be reordered.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param expected
> + *  A 16-bit expected value to be in the memory location.
> + * @param memorder
> + *  Two different memory orders that can be specified:
> + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> + *  C++11 memory orders with the same names, see the C++11 standard or
> + *  the GCC wiki on atomic synchronization for detailed definition.
> + */
> +__rte_experimental
> +static __rte_always_inline void
> +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> +int memorder);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
> + *
> + * Wait for *addr to be updated with a 32-bit expected value, with a relaxed
> + * memory ordering model meaning the loads around this API can be reordered.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param expected
> + *  A 32-bit expected value to be in the memory location.
> + * @param memorder
> + *  Two different memory orders that can be specified:
> + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> + *  C++11 memory orders with the same names, see the C++11 standard or
> + *  the GCC wiki on atomic synchronization for detailed definition.
> + */
> +__rte_experimental
> +static __rte_always_inline void
> +rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
> +int memorder);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
> + *
> + * Wait for *addr to be updated with a 64-bit expected value, with a relaxed
> + * memory ordering model meaning the loads around this API can be reordered.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param expected
> + *  A 64-bit expected value to be in the memory location.
> + * @param memorder
> + *  Two different memory orders that can be specified:
> + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> + *  C++11 memory orders with the same names, see the C++11 standard or
> + *  the GCC wiki on atomic synchronization for detailed definition.
> + */
> +__rte_experimental
> +static __rte_always_inline void
> +rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
> +int memorder);
> +
> +#ifndef RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED
> +static __rte_always_inline void
> +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> +int memorder)
> +{
> +	while (__atomic_load_n(addr, memorder) != expected)
> +		rte_pause();
> +}
> +
> +static __rte_always_inline void
> +rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
> +int memorder)
> +{
> +	while (__atomic_load_n(addr, memorder) != expected)
> +		rte_pause();
> +}
> +
> +static __rte_always_inline void
> +rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
> +int memorder)
> +{
> +	while (__atomic_load_n(addr, memorder) != expected)
> +		rte_pause();
> +}
> +#endif
> +
>  #endif /* _RTE_PAUSE_H_ */
> --
> 2.7.4
  
Gavin Hu Oct. 28, 2019, 5:04 a.m. UTC | #3
Hi Konstantin,

> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Monday, October 28, 2019 6:20 AM
> To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; dev@dpdk.org
> Cc: nd <nd@arm.com>; david.marchand@redhat.com; thomas@monjalon.net;
> stephen@networkplumber.org; hemant.agrawal@nxp.com;
> jerinj@marvell.com; pbhagavatula@marvell.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Ruifeng Wang (Arm Technology China)
> <Ruifeng.Wang@arm.com>; Phil Yang (Arm Technology China)
> <Phil.Yang@arm.com>; Steve Capper <Steve.Capper@arm.com>
> Subject: RE: [PATCH v11 2/5] eal: add the APIs to wait until equal
> 
> 
> > The rte_wait_until_equal_xx APIs abstract the functionality of
> > 'polling for a memory location to become equal to a given value'.
> >
> > Add the RTE_ARM_USE_WFE configuration entry for aarch64, disabled
> > by default. When it is enabled, the above APIs will call WFE instruction
> > to save CPU cycles and power.
> >
> > From a VM, when calling this API on aarch64, it may trap in and out to
> > release vCPUs whereas cause high exit latency. Since kernel 4.18.20 an
> > adaptive trapping mechanism is introduced to balance the latency and
> > workload.
> >
> > Signed-off-by: Gavin Hu <gavin.hu@arm.com>
> > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > Reviewed-by: Steve Capper <steve.capper@arm.com>
> > Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > Acked-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> > Acked-by: Jerin Jacob <jerinj@marvell.com>
> > ---
> >  config/arm/meson.build                             |   1 +
> >  config/common_base                                 |   5 +
> >  .../common/include/arch/arm/rte_pause_64.h         | 188
> +++++++++++++++++++++
> >  lib/librte_eal/common/include/generic/rte_pause.h  |  99 +++++++++++
> >  4 files changed, 293 insertions(+)
> >
> > diff --git a/config/arm/meson.build b/config/arm/meson.build
> > index 979018e..b4b4cac 100644
> > --- a/config/arm/meson.build
> > +++ b/config/arm/meson.build
> > @@ -26,6 +26,7 @@ flags_common_default = [
> >  	['RTE_LIBRTE_AVP_PMD', false],
> >
> >  	['RTE_SCHED_VECTOR', false],
> > +	['RTE_ARM_USE_WFE', false],
> >  ]
> >
> >  flags_generic = [
> > diff --git a/config/common_base b/config/common_base
> > index e843a21..c812156 100644
> > --- a/config/common_base
> > +++ b/config/common_base
> > @@ -111,6 +111,11 @@ CONFIG_RTE_MAX_VFIO_CONTAINERS=64
> >  CONFIG_RTE_MALLOC_DEBUG=n
> >  CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
> >  CONFIG_RTE_USE_LIBBSD=n
> > +# Use WFE instructions to implement the rte_wait_for_equal_xxx APIs,
> > +# calling these APIs put the cores in low power state while waiting
> > +# for the memory address to become equal to the expected value.
> > +# This is supported only by aarch64.
> > +CONFIG_RTE_ARM_USE_WFE=n
> >
> >  #
> >  # Recognize/ignore the AVX/AVX512 CPU flags for performance/power
> testing.
> > diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > index 93895d3..1680d7a 100644
> > --- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > +++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > @@ -1,5 +1,6 @@
> >  /* SPDX-License-Identifier: BSD-3-Clause
> >   * Copyright(c) 2017 Cavium, Inc
> > + * Copyright(c) 2019 Arm Limited
> >   */
> >
> >  #ifndef _RTE_PAUSE_ARM64_H_
> > @@ -10,6 +11,11 @@ extern "C" {
> >  #endif
> >
> >  #include <rte_common.h>
> > +
> > +#ifdef RTE_ARM_USE_WFE
> > +#define RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED
> > +#endif
> > +
> >  #include "generic/rte_pause.h"
> >
> >  static inline void rte_pause(void)
> > @@ -17,6 +23,188 @@ static inline void rte_pause(void)
> >  	asm volatile("yield" ::: "memory");
> >  }
> >
> > +/**
> > + * Send an event to quit WFE.
> > + */
> > +static inline void rte_sevl(void);
> > +
> > +/**
> > + * Put processor into low power WFE(Wait For Event) state
> > + */
> > +static inline void rte_wfe(void);
> > +
> > +#ifdef RTE_ARM_USE_WFE
> > +static inline void rte_sevl(void)
> > +{
> > +	asm volatile("sevl" : : : "memory");
> > +}
> > +
> > +static inline void rte_wfe(void)
> > +{
> > +	asm volatile("wfe" : : : "memory");
> > +}
> > +#else
> > +static inline void rte_sevl(void)
> > +{
> > +}
> > +static inline void rte_wfe(void)
> > +{
> > +	rte_pause();
> > +}
> > +#endif
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change, or be removed, without prior
> notice
> > + *
> > + * Atomic exclusive load from addr, it returns the 16-bit content of *addr
> > + * while making it 'monitored',when it is written by someone else, the
> > + * 'monitored' state is cleared and a event is generated implicitly to exit
> > + * WFE.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param memorder
> > + *  The valid memory order variants are __ATOMIC_ACQUIRE and
> __ATOMIC_RELAXED.
> > + *  These map to C++11 memory orders with the same names, see the
> C++11 standard
> > + *  the GCC wiki on atomic synchronization for detailed definitions.
> > + */
> > +static __rte_always_inline uint16_t
> > +rte_atomic_load_ex_16(volatile uint16_t *addr, int memorder);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change, or be removed, without prior
> notice
> > + *
> > + * Atomic exclusive load from addr, it returns the 32-bit content of *addr
> > + * while making it 'monitored',when it is written by someone else, the
> > + * 'monitored' state is cleared and a event is generated implicitly to exit
> > + * WFE.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param memorder
> > + *  The valid memory order variants are __ATOMIC_ACQUIRE and
> __ATOMIC_RELAXED.
> > + *  These map to C++11 memory orders with the same names, see the
> C++11 standard
> > + *  the GCC wiki on atomic synchronization for detailed definitions.
> > + */
> > +static __rte_always_inline uint32_t
> > +rte_atomic_load_ex_32(volatile uint32_t *addr, int memorder);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change, or be removed, without prior
> notice
> > + *
> > + * Atomic exclusive load from addr, it returns the 64-bit content of *addr
> > + * while making it 'monitored',when it is written by someone else, the
> > + * 'monitored' state is cleared and a event is generated implicitly to exit
> > + * WFE.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param memorder
> > + *  The valid memory order variants are __ATOMIC_ACQUIRE and
> __ATOMIC_RELAXED.
> > + *  These map to C++11 memory orders with the same names, see the
> C++11 standard
> > + *  the GCC wiki on atomic synchronization for detailed definitions.
> > + */
> > +static __rte_always_inline uint64_t
> > +rte_atomic_load_ex_64(volatile uint64_t *addr, int memorder);
> > +
> > +static __rte_always_inline uint16_t
> > +rte_atomic_load_ex_16(volatile uint16_t *addr, int memorder)
> > +{
> > +	uint16_t tmp;
> > +	assert((memorder == __ATOMIC_ACQUIRE)
> > +			|| (memorder == __ATOMIC_RELAXED));
> > +	if (memorder == __ATOMIC_ACQUIRE)
> > +		asm volatile("ldaxrh %w[tmp], [%x[addr]]"
> > +			: [tmp] "=&r" (tmp)
> > +			: [addr] "r"(addr)
> > +			: "memory");
> > +	else if (memorder == __ATOMIC_RELAXED)
> > +		asm volatile("ldxrh %w[tmp], [%x[addr]]"
> > +			: [tmp] "=&r" (tmp)
> > +			: [addr] "r"(addr)
> > +			: "memory");
> > +	return tmp;
> > +}
> > +
> > +static __rte_always_inline uint32_t
> > +rte_atomic_load_ex_32(volatile uint32_t *addr, int memorder)
> > +{
> > +	uint32_t tmp;
> > +	assert((memorder == __ATOMIC_ACQUIRE)
> > +			|| (memorder == __ATOMIC_RELAXED));
> > +	if (memorder == __ATOMIC_ACQUIRE)
> > +		asm volatile("ldaxr %w[tmp], [%x[addr]]"
> > +			: [tmp] "=&r" (tmp)
> > +			: [addr] "r"(addr)
> > +			: "memory");
> > +	else if (memorder == __ATOMIC_RELAXED)
> > +		asm volatile("ldxr %w[tmp], [%x[addr]]"
> > +			: [tmp] "=&r" (tmp)
> > +			: [addr] "r"(addr)
> > +			: "memory");
> > +	return tmp;
> > +}
> > +
> > +static __rte_always_inline uint64_t
> > +rte_atomic_load_ex_64(volatile uint64_t *addr, int memorder)
> > +{
> > +	uint64_t tmp;
> > +	assert((memorder == __ATOMIC_ACQUIRE)
> > +			|| (memorder == __ATOMIC_RELAXED));
> > +	if (memorder == __ATOMIC_ACQUIRE)
> > +		asm volatile("ldaxr %x[tmp], [%x[addr]]"
> > +			: [tmp] "=&r" (tmp)
> > +			: [addr] "r"(addr)
> > +			: "memory");
> > +	else if (memorder == __ATOMIC_RELAXED)
> > +		asm volatile("ldxr %x[tmp], [%x[addr]]"
> > +			: [tmp] "=&r" (tmp)
> > +			: [addr] "r"(addr)
> > +			: "memory");
> > +	return tmp;
> > +}
> > +
> > +#ifdef RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED
> > +static __rte_always_inline void
> > +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> > +int memorder)
> > +{
> > +	if (__atomic_load_n(addr, memorder) != expected) {
> > +		rte_sevl();
> > +		do {
> > +			rte_wfe();
> > +		} while (rte_atomic_load_ex_16(addr, memorder) !=
> expected);
> > +	}
> > +}
> > +
> > +static __rte_always_inline void
> > +rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
> > +int memorder)
> > +{
> > +	if (__atomic_load_n(addr, memorder) != expected) {
> > +		rte_sevl();
> > +		do {
> > +			rte_wfe();
> > +		} while (__atomic_load_n(addr, memorder) != expected);
> 
> Here and in _64, shouldn't it be:
> rte_atomic_load_ex_..
Thanks for spotting this error, David also spotted it. Sorry for that. 
> 
> > +	}
> > +}
> > +
> > +static __rte_always_inline void
> > +rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
> > +int memorder)
> > +{
> > +	if (__atomic_load_n(addr, memorder) != expected) {
> > +		rte_sevl();
> > +		do {
> > +			rte_wfe();
> > +		} while (__atomic_load_n(addr, memorder) != expected);
> > +	}
> > +}
> > +#endif
> > +
> >  #ifdef __cplusplus
> >  }
> >  #endif
> > diff --git a/lib/librte_eal/common/include/generic/rte_pause.h
> b/lib/librte_eal/common/include/generic/rte_pause.h
> > index 52bd4db..9d42e32 100644
> > --- a/lib/librte_eal/common/include/generic/rte_pause.h
> > +++ b/lib/librte_eal/common/include/generic/rte_pause.h
> > @@ -1,5 +1,6 @@
> >  /* SPDX-License-Identifier: BSD-3-Clause
> >   * Copyright(c) 2017 Cavium, Inc
> > + * Copyright(c) 2019 Arm Limited
> >   */
> >
> >  #ifndef _RTE_PAUSE_H_
> > @@ -12,6 +13,12 @@
> >   *
> >   */
> >
> > +#include <stdint.h>
> > +#include <rte_common.h>
> > +#include <rte_atomic.h>
> > +#include <rte_compat.h>
> > +#include <assert.h>
> > +
> >  /**
> >   * Pause CPU execution for a short while
> >   *
> > @@ -20,4 +27,96 @@
> >   */
> >  static inline void rte_pause(void);
> >
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change, or be removed, without prior
> notice
> > + *
> > + * Wait for *addr to be updated with a 16-bit expected value, with a relaxed
> > + * memory ordering model meaning the loads around this API can be
> reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 16-bit expected value to be in the memory location.
> > + * @param memorder
> > + *  Two different memory orders that can be specified:
> > + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> > + *  C++11 memory orders with the same names, see the C++11 standard or
> > + *  the GCC wiki on atomic synchronization for detailed definition.
> > + */
> > +__rte_experimental
> > +static __rte_always_inline void
> > +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> > +int memorder);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change, or be removed, without prior
> notice
> > + *
> > + * Wait for *addr to be updated with a 32-bit expected value, with a relaxed
> > + * memory ordering model meaning the loads around this API can be
> reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 32-bit expected value to be in the memory location.
> > + * @param memorder
> > + *  Two different memory orders that can be specified:
> > + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> > + *  C++11 memory orders with the same names, see the C++11 standard or
> > + *  the GCC wiki on atomic synchronization for detailed definition.
> > + */
> > +__rte_experimental
> > +static __rte_always_inline void
> > +rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
> > +int memorder);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change, or be removed, without prior
> notice
> > + *
> > + * Wait for *addr to be updated with a 64-bit expected value, with a relaxed
> > + * memory ordering model meaning the loads around this API can be
> reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 64-bit expected value to be in the memory location.
> > + * @param memorder
> > + *  Two different memory orders that can be specified:
> > + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> > + *  C++11 memory orders with the same names, see the C++11 standard or
> > + *  the GCC wiki on atomic synchronization for detailed definition.
> > + */
> > +__rte_experimental
> > +static __rte_always_inline void
> > +rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
> > +int memorder);
> > +
> > +#ifndef RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED
> > +static __rte_always_inline void
> > +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> > +int memorder)
> > +{
> > +	while (__atomic_load_n(addr, memorder) != expected)
> > +		rte_pause();
> > +}
> > +
> > +static __rte_always_inline void
> > +rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
> > +int memorder)
> > +{
> > +	while (__atomic_load_n(addr, memorder) != expected)
> > +		rte_pause();
> > +}
> > +
> > +static __rte_always_inline void
> > +rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
> > +int memorder)
> > +{
> > +	while (__atomic_load_n(addr, memorder) != expected)
> > +		rte_pause();
> > +}
> > +#endif
> > +
> >  #endif /* _RTE_PAUSE_H_ */
> > --
> > 2.7.4
  
Gavin Hu Oct. 28, 2019, 5:08 a.m. UTC | #4
Hi david,
> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: Monday, October 28, 2019 4:50 AM
> To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>
> Cc: dev <dev@dpdk.org>; nd <nd@arm.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; thomas@monjalon.net; Stephen
> Hemminger <stephen@networkplumber.org>; hemant.agrawal@nxp.com;
> jerinj@marvell.com; Pavan Nikhilesh <pbhagavatula@marvell.com>;
> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Ruifeng Wang
> (Arm Technology China) <Ruifeng.Wang@arm.com>; Phil Yang (Arm
> Technology China) <Phil.Yang@arm.com>; Steve Capper
> <Steve.Capper@arm.com>
> Subject: Re: [PATCH v11 2/5] eal: add the APIs to wait until equal
> 
> On Sun, Oct 27, 2019 at 1:53 PM Gavin Hu <gavin.hu@arm.com> wrote:
> 
> [snip]
> 
> > diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > index 93895d3..1680d7a 100644
> > --- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > +++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> 
> [snip]
> 
> > @@ -17,6 +23,188 @@ static inline void rte_pause(void)
> >         asm volatile("yield" ::: "memory");
> >  }
> >
> > +/**
> > + * Send an event to quit WFE.
> > + */
> > +static inline void rte_sevl(void);
> > +
> > +/**
> > + * Put processor into low power WFE(Wait For Event) state
> > + */
> > +static inline void rte_wfe(void);
> > +
> > +#ifdef RTE_ARM_USE_WFE
> > +static inline void rte_sevl(void)
> > +{
> > +       asm volatile("sevl" : : : "memory");
> > +}
> > +
> > +static inline void rte_wfe(void)
> > +{
> > +       asm volatile("wfe" : : : "memory");
> > +}
> > +#else
> > +static inline void rte_sevl(void)
> > +{
> > +}
> > +static inline void rte_wfe(void)
> > +{
> > +       rte_pause();
> > +}
> > +#endif
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change, or be removed, without prior
> notice
> 
> experimental?
> Just complaining on the principle, you missed the __rte_experimental
> in such a case.
> But this API is a no go for me, see below.
Got it, thanks!
> 
> > + *
> > + * Atomic exclusive load from addr, it returns the 16-bit content of *addr
> > + * while making it 'monitored',when it is written by someone else, the
> > + * 'monitored' state is cleared and a event is generated implicitly to exit
> > + * WFE.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param memorder
> > + *  The valid memory order variants are __ATOMIC_ACQUIRE and
> __ATOMIC_RELAXED.
> > + *  These map to C++11 memory orders with the same names, see the
> C++11 standard
> > + *  the GCC wiki on atomic synchronization for detailed definitions.
> > + */
> > +static __rte_always_inline uint16_t
> > +rte_atomic_load_ex_16(volatile uint16_t *addr, int memorder);
> 
> This API does not make sense for anything but arm, so this prefix is not good.
Yes, we can change back to __atomic_load_ex_16?
> 
> On arm, when RTE_ARM_USE_WFE is undefined, why would you need it?
> A non exclusive load is enough since you don't want to use wfe.
We can move it inside #ifdef RTE_ARM_USE_WFE .. #endif.
> [snip]
> 
> > +
> > +static __rte_always_inline uint16_t
> > +rte_atomic_load_ex_16(volatile uint16_t *addr, int memorder)
> > +{
> > +       uint16_t tmp;
> > +       assert((memorder == __ATOMIC_ACQUIRE)
> > +                       || (memorder == __ATOMIC_RELAXED));
> > +       if (memorder == __ATOMIC_ACQUIRE)
> > +               asm volatile("ldaxrh %w[tmp], [%x[addr]]"
> > +                       : [tmp] "=&r" (tmp)
> > +                       : [addr] "r"(addr)
> > +                       : "memory");
> > +       else if (memorder == __ATOMIC_RELAXED)
> > +               asm volatile("ldxrh %w[tmp], [%x[addr]]"
> > +                       : [tmp] "=&r" (tmp)
> > +                       : [addr] "r"(addr)
> > +                       : "memory");
> > +       return tmp;
> > +}
> > +
> > +static __rte_always_inline uint32_t
> > +rte_atomic_load_ex_32(volatile uint32_t *addr, int memorder)
> > +{
> > +       uint32_t tmp;
> > +       assert((memorder == __ATOMIC_ACQUIRE)
> > +                       || (memorder == __ATOMIC_RELAXED));
> > +       if (memorder == __ATOMIC_ACQUIRE)
> > +               asm volatile("ldaxr %w[tmp], [%x[addr]]"
> > +                       : [tmp] "=&r" (tmp)
> > +                       : [addr] "r"(addr)
> > +                       : "memory");
> > +       else if (memorder == __ATOMIC_RELAXED)
> > +               asm volatile("ldxr %w[tmp], [%x[addr]]"
> > +                       : [tmp] "=&r" (tmp)
> > +                       : [addr] "r"(addr)
> > +                       : "memory");
> > +       return tmp;
> > +}
> > +
> > +static __rte_always_inline uint64_t
> > +rte_atomic_load_ex_64(volatile uint64_t *addr, int memorder)
> > +{
> > +       uint64_t tmp;
> > +       assert((memorder == __ATOMIC_ACQUIRE)
> > +                       || (memorder == __ATOMIC_RELAXED));
> > +       if (memorder == __ATOMIC_ACQUIRE)
> > +               asm volatile("ldaxr %x[tmp], [%x[addr]]"
> > +                       : [tmp] "=&r" (tmp)
> > +                       : [addr] "r"(addr)
> > +                       : "memory");
> > +       else if (memorder == __ATOMIC_RELAXED)
> > +               asm volatile("ldxr %x[tmp], [%x[addr]]"
> > +                       : [tmp] "=&r" (tmp)
> > +                       : [addr] "r"(addr)
> > +                       : "memory");
> > +       return tmp;
> > +}
> > +
> > +#ifdef RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED
> > +static __rte_always_inline void
> > +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> > +int memorder)
> > +{
> > +       if (__atomic_load_n(addr, memorder) != expected) {
> > +               rte_sevl();
> > +               do {
> > +                       rte_wfe();
> 
> 
> We are in the RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED case.
> rte_wfe() is always asm volatile("wfe" : : : "memory");
> 
> 
> > +               } while (rte_atomic_load_ex_16(addr, memorder) != expected);
> > +       }
> > +}
> > +
> > +static __rte_always_inline void
> > +rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
> > +int memorder)
> > +{
> > +       if (__atomic_load_n(addr, memorder) != expected) {
> > +               rte_sevl();
> > +               do {
> > +                       rte_wfe();
> > +               } while (__atomic_load_n(addr, memorder) != expected);
> > +       }
> > +}
> 
> The while() should be with an exclusive load.
Sorry for this explicit error. 
> 
> 
> I will submit a v12 with those comments addressed so that we move
> forward for rc2.
> But it won't make it in rc1, sorry.
I will do it if you prefer, otherwise thanks!
> 
> 
> --
> David Marchand
  

Patch

diff --git a/config/arm/meson.build b/config/arm/meson.build
index 979018e..b4b4cac 100644
--- a/config/arm/meson.build
+++ b/config/arm/meson.build
@@ -26,6 +26,7 @@  flags_common_default = [
 	['RTE_LIBRTE_AVP_PMD', false],
 
 	['RTE_SCHED_VECTOR', false],
+	['RTE_ARM_USE_WFE', false],
 ]
 
 flags_generic = [
diff --git a/config/common_base b/config/common_base
index e843a21..c812156 100644
--- a/config/common_base
+++ b/config/common_base
@@ -111,6 +111,11 @@  CONFIG_RTE_MAX_VFIO_CONTAINERS=64
 CONFIG_RTE_MALLOC_DEBUG=n
 CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
 CONFIG_RTE_USE_LIBBSD=n
+# Use WFE instructions to implement the rte_wait_until_equal_xxx APIs;
+# calling these APIs puts the cores in a low power state while waiting
+# for the memory address to become equal to the expected value.
+# This is supported only by aarch64.
+CONFIG_RTE_ARM_USE_WFE=n
 
 #
 # Recognize/ignore the AVX/AVX512 CPU flags for performance/power testing.
diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
index 93895d3..1680d7a 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
@@ -1,5 +1,6 @@ 
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2017 Cavium, Inc
+ * Copyright(c) 2019 Arm Limited
  */
 
 #ifndef _RTE_PAUSE_ARM64_H_
@@ -10,6 +11,11 @@  extern "C" {
 #endif
 
 #include <rte_common.h>
+
+#ifdef RTE_ARM_USE_WFE
+#define RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED
+#endif
+
 #include "generic/rte_pause.h"
 
 static inline void rte_pause(void)
@@ -17,6 +23,188 @@  static inline void rte_pause(void)
 	asm volatile("yield" ::: "memory");
 }
 
+/**
+ * Send an event to quit WFE.
+ */
+static inline void rte_sevl(void);
+
+/**
+ * Put processor into low power WFE(Wait For Event) state
+ */
+static inline void rte_wfe(void);
+
+#ifdef RTE_ARM_USE_WFE
+static inline void rte_sevl(void)
+{
+	asm volatile("sevl" : : : "memory");
+}
+
+static inline void rte_wfe(void)
+{
+	asm volatile("wfe" : : : "memory");
+}
+#else
+static inline void rte_sevl(void)
+{
+}
+static inline void rte_wfe(void)
+{
+	rte_pause();
+}
+#endif
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Atomic exclusive load from addr. It returns the 16-bit content of *addr
+ * while making it 'monitored'; when it is written by someone else, the
+ * 'monitored' state is cleared and an event is generated implicitly to exit
+ * WFE.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param memorder
+ *  The valid memory order variants are __ATOMIC_ACQUIRE and __ATOMIC_RELAXED.
+ *  These map to C++11 memory orders with the same names, see the C++11 standard
+ *  or the GCC wiki on atomic synchronization for detailed definitions.
+ */
+static __rte_always_inline uint16_t
+rte_atomic_load_ex_16(volatile uint16_t *addr, int memorder);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Atomic exclusive load from addr. It returns the 32-bit content of *addr
+ * while making it 'monitored'; when it is written by someone else, the
+ * 'monitored' state is cleared and an event is generated implicitly to exit
+ * WFE.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param memorder
+ *  The valid memory order variants are __ATOMIC_ACQUIRE and __ATOMIC_RELAXED.
+ *  These map to C++11 memory orders with the same names, see the C++11 standard
+ *  or the GCC wiki on atomic synchronization for detailed definitions.
+ */
+static __rte_always_inline uint32_t
+rte_atomic_load_ex_32(volatile uint32_t *addr, int memorder);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Atomic exclusive load from addr. It returns the 64-bit content of *addr
+ * while making it 'monitored'; when it is written by someone else, the
+ * 'monitored' state is cleared and an event is generated implicitly to exit
+ * WFE.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param memorder
+ *  The valid memory order variants are __ATOMIC_ACQUIRE and __ATOMIC_RELAXED.
+ *  These map to C++11 memory orders with the same names, see the C++11 standard
+ *  or the GCC wiki on atomic synchronization for detailed definitions.
+ */
+static __rte_always_inline uint64_t
+rte_atomic_load_ex_64(volatile uint64_t *addr, int memorder);
+
+static __rte_always_inline uint16_t
+rte_atomic_load_ex_16(volatile uint16_t *addr, int memorder)
+{
+	uint16_t tmp;
+	assert((memorder == __ATOMIC_ACQUIRE)
+			|| (memorder == __ATOMIC_RELAXED));
+	if (memorder == __ATOMIC_ACQUIRE)
+		asm volatile("ldaxrh %w[tmp], [%x[addr]]"
+			: [tmp] "=&r" (tmp)
+			: [addr] "r"(addr)
+			: "memory");
+	else if (memorder == __ATOMIC_RELAXED)
+		asm volatile("ldxrh %w[tmp], [%x[addr]]"
+			: [tmp] "=&r" (tmp)
+			: [addr] "r"(addr)
+			: "memory");
+	return tmp;
+}
+
+static __rte_always_inline uint32_t
+rte_atomic_load_ex_32(volatile uint32_t *addr, int memorder)
+{
+	uint32_t tmp;
+	assert((memorder == __ATOMIC_ACQUIRE)
+			|| (memorder == __ATOMIC_RELAXED));
+	if (memorder == __ATOMIC_ACQUIRE)
+		asm volatile("ldaxr %w[tmp], [%x[addr]]"
+			: [tmp] "=&r" (tmp)
+			: [addr] "r"(addr)
+			: "memory");
+	else if (memorder == __ATOMIC_RELAXED)
+		asm volatile("ldxr %w[tmp], [%x[addr]]"
+			: [tmp] "=&r" (tmp)
+			: [addr] "r"(addr)
+			: "memory");
+	return tmp;
+}
+
+static __rte_always_inline uint64_t
+rte_atomic_load_ex_64(volatile uint64_t *addr, int memorder)
+{
+	uint64_t tmp;
+	assert((memorder == __ATOMIC_ACQUIRE)
+			|| (memorder == __ATOMIC_RELAXED));
+	if (memorder == __ATOMIC_ACQUIRE)
+		asm volatile("ldaxr %x[tmp], [%x[addr]]"
+			: [tmp] "=&r" (tmp)
+			: [addr] "r"(addr)
+			: "memory");
+	else if (memorder == __ATOMIC_RELAXED)
+		asm volatile("ldxr %x[tmp], [%x[addr]]"
+			: [tmp] "=&r" (tmp)
+			: [addr] "r"(addr)
+			: "memory");
+	return tmp;
+}
+
+#ifdef RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED
+static __rte_always_inline void
+rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
+int memorder)
+{
+	if (__atomic_load_n(addr, memorder) != expected) {
+		rte_sevl();
+		do {
+			rte_wfe();
+		} while (rte_atomic_load_ex_16(addr, memorder) != expected);
+	}
+}
+
+static __rte_always_inline void
+rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
+int memorder)
+{
+	if (__atomic_load_n(addr, memorder) != expected) {
+		rte_sevl();
+		do {
+			rte_wfe();
+		} while (__atomic_load_n(addr, memorder) != expected);
+	}
+}
+
+static __rte_always_inline void
+rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
+int memorder)
+{
+	if (__atomic_load_n(addr, memorder) != expected) {
+		rte_sevl();
+		do {
+			rte_wfe();
+		} while (__atomic_load_n(addr, memorder) != expected);
+	}
+}
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/common/include/generic/rte_pause.h b/lib/librte_eal/common/include/generic/rte_pause.h
index 52bd4db..9d42e32 100644
--- a/lib/librte_eal/common/include/generic/rte_pause.h
+++ b/lib/librte_eal/common/include/generic/rte_pause.h
@@ -1,5 +1,6 @@ 
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2017 Cavium, Inc
+ * Copyright(c) 2019 Arm Limited
  */
 
 #ifndef _RTE_PAUSE_H_
@@ -12,6 +13,12 @@ 
  *
  */
 
+#include <stdint.h>
+#include <rte_common.h>
+#include <rte_atomic.h>
+#include <rte_compat.h>
+#include <assert.h>
+
 /**
  * Pause CPU execution for a short while
  *
@@ -20,4 +27,96 @@ 
  */
 static inline void rte_pause(void);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Wait for *addr to be updated with a 16-bit expected value, with a relaxed
+ * memory ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param expected
+ *  A 16-bit expected value to be in the memory location.
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+__rte_experimental
+static __rte_always_inline void
+rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
+int memorder);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Wait for *addr to be updated with a 32-bit expected value, with a relaxed
+ * memory ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param expected
+ *  A 32-bit expected value to be in the memory location.
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+__rte_experimental
+static __rte_always_inline void
+rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
+int memorder);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Wait for *addr to be updated with a 64-bit expected value, with a relaxed
+ * memory ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param expected
+ *  A 64-bit expected value to be in the memory location.
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+__rte_experimental
+static __rte_always_inline void
+rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
+int memorder);
+
+#ifndef RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED
+static __rte_always_inline void
+rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
+int memorder)
+{
+	while (__atomic_load_n(addr, memorder) != expected)
+		rte_pause();
+}
+
+static __rte_always_inline void
+rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
+int memorder)
+{
+	while (__atomic_load_n(addr, memorder) != expected)
+		rte_pause();
+}
+
+static __rte_always_inline void
+rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
+int memorder)
+{
+	while (__atomic_load_n(addr, memorder) != expected)
+		rte_pause();
+}
+#endif
+
 #endif /* _RTE_PAUSE_H_ */