[v8,2/6] eal: add the APIs to wait until equal

Message ID 1571651279-51857-3-git-send-email-gavin.hu@arm.com (mailing list archive)
State Superseded, archived
Delegated to: David Marchand
Series: use WFE for aarch64

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation success Compilation OK

Commit Message

Gavin Hu Oct. 21, 2019, 9:47 a.m. UTC
  The rte_wait_until_equal_xx APIs abstract the functionality of
'polling for a memory location to become equal to a given value'.

Add the RTE_ARM_USE_WFE configuration entry for aarch64, disabled
by default. When it is enabled, the above APIs use the WFE instruction
to save CPU cycles and power.

When calling these APIs from a VM on aarch64, WFE may trap in and out
to release vCPUs, which causes high exit latency. Since kernel 4.18.20,
an adaptive trapping mechanism has been introduced to balance latency
and workload.
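
For illustration, a typical caller of the new API would look like this
(sketch only; wait_for_flag() is a hypothetical helper, not part of the
patch):

  /* Block until another thread stores 1 to *flag; spins with rte_pause()
   * by default, or sleeps in WFE on aarch64 when RTE_ARM_USE_WFE is
   * enabled. */
  static void
  wait_for_flag(volatile uint32_t *flag)
  {
          rte_wait_until_equal_32(flag, 1, __ATOMIC_ACQUIRE);
  }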

Signed-off-by: Gavin Hu <gavin.hu@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Steve Capper <steve.capper@arm.com>
Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Reviewed-by: Phil Yang <phil.yang@arm.com>
Acked-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>
---
 config/arm/meson.build                             |  1 +
 config/common_base                                 |  5 ++
 .../common/include/arch/arm/rte_pause_64.h         | 26 ++++++
 lib/librte_eal/common/include/generic/rte_pause.h  | 93 ++++++++++++++++++++++
 4 files changed, 125 insertions(+)
  

Comments

David Marchand Oct. 21, 2019, 7:19 p.m. UTC | #1
On Mon, Oct 21, 2019 at 11:48 AM Gavin Hu <gavin.hu@arm.com> wrote:
[snip]
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> index 93895d3..eb8f73e 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> @@ -1,5 +1,6 @@
>  /* SPDX-License-Identifier: BSD-3-Clause
>   * Copyright(c) 2017 Cavium, Inc
> + * Copyright(c) 2019 Arm Limited
>   */

Before including generic/rte_pause.h, put a check like:

#ifdef RTE_ARM_USE_WFE
#define RTE_ARCH_HAS_WFE
#endif

#include "generic/rte_pause.h"

>
>  #ifndef _RTE_PAUSE_ARM64_H_
> @@ -17,6 +18,31 @@ static inline void rte_pause(void)
>         asm volatile("yield" ::: "memory");
>  }
>
> +#ifdef RTE_ARM_USE_WFE
> +#define sev()  { asm volatile("sev" : : : "memory") }
> +#define wfe()  { asm volatile("wfe" : : : "memory") }
> +
> +#define __WAIT_UNTIL_EQUAL(type, size, addr, expected, memorder) \
> +__rte_experimental                                             \

The experimental tag is unnecessary here.
We only need it in the function prototype (in the generic header).

> +static __rte_always_inline void                                        \
> +rte_wait_until_equal_##size(volatile type * addr, type expected,\
> +int memorder)                                                  \
> +{                                                              \
> +       if (__atomic_load_n(addr, memorder) != expected) {      \
> +               sev();                                                  \
> +               do {                                                    \
> +                       wfe();                                          \
> +               } while (__atomic_load_n(addr, memorder) != expected);  \
> +        }                                                              \
> +}
> +__WAIT_UNTIL_EQUAL(uint16_t, 16, addr, expected, memorder)
> +__WAIT_UNTIL_EQUAL(uint32_t, 32, addr, expected, memorder)
> +__WAIT_UNTIL_EQUAL(uint64_t, 64, addr, expected, memorder)
> +
> +#undef __WAIT_UNTIL_EQUAL

Missing #undef on sev and wfe macros.


[snip]

Here, change this check to:

#ifndef RTE_ARCH_HAS_WFE


> +#ifndef RTE_ARM_USE_WFE
[snip]

Thanks.

--
David Marchand
  
Ananyev, Konstantin Oct. 22, 2019, 9:36 a.m. UTC | #2
[snip]
> > +#ifdef RTE_ARM_USE_WFE
> > +#define sev()  { asm volatile("sev" : : : "memory") }
> > +#define wfe()  { asm volatile("wfe" : : : "memory") }
> > +
> > +#define __WAIT_UNTIL_EQUAL(type, size, addr, expected, memorder) \
> > +__rte_experimental                                             \
> 
> The experimental tag is unnecessary here.
> We only need it in the function prototype (in the generic header).
> 
> > +static __rte_always_inline void                                        \
> > +rte_wait_until_equal_##size(volatile type * addr, type expected,\
> > +int memorder)                                                  \
> > +{                                                              \
> > +       if (__atomic_load_n(addr, memorder) != expected) {      \
> > +               sev();                                                  \
> > +               do {                                                    \
> > +                       wfe();                                          \
> > +               } while (__atomic_load_n(addr, memorder) != expected);  \
> > +        }                                                              \
> > +}
> > +__WAIT_UNTIL_EQUAL(uint16_t, 16, addr, expected, memorder)
> > +__WAIT_UNTIL_EQUAL(uint32_t, 32, addr, expected, memorder)
> > +__WAIT_UNTIL_EQUAL(uint64_t, 64, addr, expected, memorder)
> > +
> > +#undef __WAIT_UNTIL_EQUAL

Might be instead of defining/undefining these macros, just
define explicitly these 3 functions?
Now they are really small, so I think it would be an easier/cleaner way.
Yes, a bit of code duplication, but as I said they are really small now.
Same thought about generic version.

> 
> Missing #undef on sev and wfe macros.

Actually should we undefine them?
Or should we add rte_ prefix (which is needed anyway I suppose)
and have them always defined?
Might be you can reuse them in other arm specific places too (spinlock, rwlock, etc.)
Actually it is probably possible to make them emit either a proper instruction or a NOP,
then you'll need RTE_ARM_USE_WFE only around these macros.

I.E

#ifdef RTE_ARM_USE_WFE
#define rte_sev()  { asm volatile("sev" : : : "memory"); }
#define rte_wfe()  { asm volatile("wfe" : : : "memory"); }
#else
static inline void rte_sev(void)
{
}
static inline void rte_wfe(void)
{
	rte_pause();
}
#endif

And then just one common version of the _wait_ functions:

static __rte_always_inline void
rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
int memorder)
{
	if (__atomic_load_n(addr, memorder) != expected) {
		/* sev primes the event register, so the first wfe returns
		 * immediately and re-checks instead of stalling */
		rte_sev();
		do {
			rte_wfe();
		} while (__atomic_load_n(addr, memorder) != expected);
	}
}
 

[snip]
> Here, change this check to:
> 
> #ifndef RTE_ARCH_HAS_WFE
> 
> 
> > +#ifndef RTE_ARM_USE_WFE

Might be something arch neutral in name here?
RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED or so?

  
David Marchand Oct. 22, 2019, 10:17 a.m. UTC | #3
On Tue, Oct 22, 2019 at 11:37 AM Ananyev, Konstantin
<konstantin.ananyev@intel.com> wrote:
[snip]
> Might be instead of defining/undefining these macros, just
> define explicitly these 3 functions?
> Now they are really small, so I think it would be an easier/cleaner way.
> Yes, a bit of code duplication, but as I said they are really small now.
> Same thought about generic version.

I don't really like those macros defining inlines either.
I am fine with this little duplication, so +1 from me.


>
> >
> > Missing #undef on sev and wfe macros.
>
> Actually should we undefine them?
> Or should we add rte_ prefix (which is needed anyway I suppose)
> and have them always defined?
> Might be you can reuse them in other arm specific places too (spinlock, rwlock, etc.)
> Actually it is probably possible to make them emit either a proper instruction or a NOP,
> then you'll need RTE_ARM_USE_WFE only around these macros.

Interesting idea, but only if it gets used in this series.
I don't want to see stuff that will not be used later.

[snip]

> > Here, change this check to:
> >
> > #ifndef RTE_ARCH_HAS_WFE
> >
> >
> > > +#ifndef RTE_ARM_USE_WFE
>
> Might be something arch neutral in name here?
> RTE_WAIT_UNTIL_EQUAL_ARCH_DEFINED or so?

Yes, better than my suggestion.


Gavin, I noticed that you added a #ifndef ARM in the generic
rte_spinlock.h header later in this series.
Please, can you think of a way to avoid this?
Maybe applying the same pattern to the spinlock?
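Something like the below, maybe (rough, untested sketch; the guard name
is just a placeholder), in the generic rte_spinlock.h:

#ifndef RTE_SPINLOCK_ARCH_DEFINED
static inline void
rte_spinlock_lock(rte_spinlock_t *sl)
{
	int exp = 0;

	while (!__atomic_compare_exchange_n(&sl->locked, &exp, 1, 0,
				__ATOMIC_ACQUIRE, __ATOMIC_RELAXED)) {
		/* wait (WFE on aarch64) until the lock looks free,
		 * then retry the CAS */
		rte_wait_until_equal_32((volatile uint32_t *)&sl->locked,
				0, __ATOMIC_RELAXED);
		exp = 0;
	}
}
#endif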


Thanks.

--
David Marchand
  
Gavin Hu Oct. 22, 2019, 4:03 p.m. UTC | #4
> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Tuesday, October 22, 2019 5:37 PM
> Subject: RE: [dpdk-dev] [PATCH v8 2/6] eal: add the APIs to wait until equal
> 
[snip]
> Might be instead of defining/undefining these macros, just
> define explicitly these 3 functions?
> Now they are really small, so I think it would be an easier/cleaner way.
> Yes, a bit of code duplication, but as I said they are really small now.
> Same thought about generic version.
Ok, no problem. An alternative way to reduce duplication is to use _Generic (available since gcc-4.9).
As you said, they are really small now, so I will take your suggestion.
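For reference, the _Generic alternative would have looked roughly like
this (just a sketch of what I meant; requires C11, gcc >= 4.9):

#define rte_wait_until_equal(addr, expected, memorder)		\
	_Generic((addr),					\
		volatile uint16_t *: rte_wait_until_equal_16,	\
		volatile uint32_t *: rte_wait_until_equal_32,	\
		volatile uint64_t *: rte_wait_until_equal_64	\
	)(addr, expected, memorder)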
> >
> > Missing #undef on sev and wfe macros.
> 
> Actually should we undefine them?
> Or should we add rte_ prefix (which is needed anyway I suppose)
> and have them always defined?
> Might be you can reuse them in other arm specific places too (spinlock,
> rwlock, etc.)
> Actually it is probably possible to make them emit either a proper
> instruction or a NOP,
> then you'll need RTE_ARM_USE_WFE only around these macros.
> 
[snip]
> And then just one common version of the _wait_ functions:
Good suggestion, will fix in v9.
  
Gavin Hu Oct. 22, 2019, 4:05 p.m. UTC | #5
> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: Tuesday, October 22, 2019 6:17 PM
> Subject: Re: [dpdk-dev] [PATCH v8 2/6] eal: add the APIs to wait until equal
> 
[snip]
> Gavin, I noticed that you added a #ifndef ARM in the generic
> rte_spinlock.h header later in this series.
> Please, can you think of a way to avoid this?
> Maybe applying the same pattern to the spinlock?
David, I will look into this and the other comments and fix them in v9. Thanks for all your comments!
/Gavin
  

Patch

diff --git a/config/arm/meson.build b/config/arm/meson.build
index 979018e..b4b4cac 100644
--- a/config/arm/meson.build
+++ b/config/arm/meson.build
@@ -26,6 +26,7 @@  flags_common_default = [
 	['RTE_LIBRTE_AVP_PMD', false],
 
 	['RTE_SCHED_VECTOR', false],
+	['RTE_ARM_USE_WFE', false],
 ]
 
 flags_generic = [
diff --git a/config/common_base b/config/common_base
index e843a21..c812156 100644
--- a/config/common_base
+++ b/config/common_base
@@ -111,6 +111,11 @@  CONFIG_RTE_MAX_VFIO_CONTAINERS=64
 CONFIG_RTE_MALLOC_DEBUG=n
 CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
 CONFIG_RTE_USE_LIBBSD=n
+# Use WFE instructions to implement the rte_wait_for_equal_xxx APIs,
+# calling these APIs put the cores in low power state while waiting
+# for the memory address to become equal to the expected value.
+# This is supported only by aarch64.
+CONFIG_RTE_ARM_USE_WFE=n
 
 #
 # Recognize/ignore the AVX/AVX512 CPU flags for performance/power testing.
diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
index 93895d3..eb8f73e 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
@@ -1,5 +1,6 @@ 
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2017 Cavium, Inc
+ * Copyright(c) 2019 Arm Limited
  */
 
 #ifndef _RTE_PAUSE_ARM64_H_
@@ -17,6 +18,31 @@  static inline void rte_pause(void)
 	asm volatile("yield" ::: "memory");
 }
 
+#ifdef RTE_ARM_USE_WFE
+#define sev()	{ asm volatile("sev" : : : "memory") }
+#define wfe()	{ asm volatile("wfe" : : : "memory") }
+
+#define __WAIT_UNTIL_EQUAL(type, size, addr, expected, memorder) \
+__rte_experimental						\
+static __rte_always_inline void					\
+rte_wait_until_equal_##size(volatile type * addr, type expected,\
+int memorder)							\
+{								\
+	if (__atomic_load_n(addr, memorder) != expected) {	\
+		sev();							\
+		do {							\
+			wfe();						\
+		} while (__atomic_load_n(addr, memorder) != expected);	\
+	 }								\
+}
+__WAIT_UNTIL_EQUAL(uint16_t, 16, addr, expected, memorder)
+__WAIT_UNTIL_EQUAL(uint32_t, 32, addr, expected, memorder)
+__WAIT_UNTIL_EQUAL(uint64_t, 64, addr, expected, memorder)
+
+#undef __WAIT_UNTIL_EQUAL
+
+#endif /* RTE_ARM_USE_WFE */
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/common/include/generic/rte_pause.h b/lib/librte_eal/common/include/generic/rte_pause.h
index 52bd4db..80597a9 100644
--- a/lib/librte_eal/common/include/generic/rte_pause.h
+++ b/lib/librte_eal/common/include/generic/rte_pause.h
@@ -1,5 +1,6 @@ 
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2017 Cavium, Inc
+ * Copyright(c) 2019 Arm Limited
  */
 
 #ifndef _RTE_PAUSE_H_
@@ -12,6 +13,11 @@ 
  *
  */
 
+#include <stdint.h>
+#include <rte_common.h>
+#include <rte_atomic.h>
+#include <rte_compat.h>
+
 /**
  * Pause CPU execution for a short while
  *
@@ -20,4 +26,91 @@ 
  */
 static inline void rte_pause(void);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Wait for *addr to be updated with a 16-bit expected value, with a relaxed
+ * memory ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param expected
+ *  A 16-bit expected value to be in the memory location.
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+__rte_experimental
+static __rte_always_inline void
+rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
+int memorder);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Wait for *addr to be updated with a 32-bit expected value, with a relaxed
+ * memory ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param expected
+ *  A 32-bit expected value to be in the memory location.
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+__rte_experimental
+static __rte_always_inline void
+rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
+int memorder);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice
+ *
+ * Wait for *addr to be updated with a 64-bit expected value, with a relaxed
+ * memory ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param expected
+ *  A 64-bit expected value to be in the memory location.
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+__rte_experimental
+static __rte_always_inline void
+rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
+int memorder);
+
+#ifndef RTE_ARM_USE_WFE
+#define __WAIT_UNTIL_EQUAL(type, size, addr, expected, memorder) \
+__rte_experimental \
+static __rte_always_inline void					\
+rte_wait_until_equal_##size(volatile type * addr, type expected,\
+int memorder)							\
+{								\
+	if (__atomic_load_n(addr, memorder) != expected) {	\
+		do {							\
+			rte_pause();					\
+		} while (__atomic_load_n(addr, memorder) != expected);	\
+	}								\
+}
+__WAIT_UNTIL_EQUAL(uint16_t, 16, addr, expected, memorder)
+__WAIT_UNTIL_EQUAL(uint32_t, 32, addr, expected, memorder)
+__WAIT_UNTIL_EQUAL(uint64_t, 64, addr, expected, memorder)
+
+#undef __WAIT_UNTIL_EQUAL
+
+#endif /* RTE_ARM_USE_WFE */
+
 #endif /* _RTE_PAUSE_H_ */