[v7,2/7] eal: add the APIs to wait until equal

Message ID 1569562904-43950-3-git-send-email-gavin.hu@arm.com (mailing list archive)
State Superseded, archived
Delegated to: David Marchand
Headers
Series use WFE for aarch64 |

Checks

Context Check Description
ci/Intel-compilation success Compilation OK
ci/checkpatch success coding style OK

Commit Message

Gavin Hu Sept. 27, 2019, 5:41 a.m. UTC
  The rte_wait_until_equal_xx APIs abstract the functionality of
'polling for a memory location to become equal to a given value'.

Add the RTE_ARM_USE_WFE configuration entry for aarch64, disabled
by default. When it is enabled, the above APIs will call WFE instruction
to save CPU cycles and power.

Signed-off-by: Gavin Hu <gavin.hu@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Steve Capper <steve.capper@arm.com>
Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Reviewed-by: Phil Yang <phil.yang@arm.com>
Acked-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 config/arm/meson.build                             |   1 +
 config/common_base                                 |   5 +
 .../common/include/arch/arm/rte_pause_64.h         |  30 ++++++
 lib/librte_eal/common/include/generic/rte_pause.h  | 106 +++++++++++++++++++++
 4 files changed, 142 insertions(+)
  

Comments

Jerin Jacob Sept. 27, 2019, 11:03 a.m. UTC | #1
On Fri, Sep 27, 2019 at 11:12 AM Gavin Hu <gavin.hu@arm.com> wrote:
>
> The rte_wait_until_equal_xx APIs abstract the functionality of
> 'polling for a memory location to become equal to a given value'.
>
> Add the RTE_ARM_USE_WFE configuration entry for aarch64, disabled
> by default. When it is enabled, the above APIs will call WFE instruction
> to save CPU cycles and power.
>
> Signed-off-by: Gavin Hu <gavin.hu@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Steve Capper <steve.capper@arm.com>
> Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Reviewed-by: Phil Yang <phil.yang@arm.com>
> Acked-by: Pavan Nikhilesh <pbhagavatula@marvell.com>

Acked-by: Jerin Jacob <jerinj@marvell.com>
  
Ananyev, Konstantin Oct. 17, 2019, 1:14 p.m. UTC | #2
Hi Gavin,

> 
> The rte_wait_until_equal_xx APIs abstract the functionality of
> 'polling for a memory location to become equal to a given value'.
> 
> Add the RTE_ARM_USE_WFE configuration entry for aarch64, disabled
> by default. When it is enabled, the above APIs will call WFE instruction
> to save CPU cycles and power.
> 
> Signed-off-by: Gavin Hu <gavin.hu@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Steve Capper <steve.capper@arm.com>
> Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Reviewed-by: Phil Yang <phil.yang@arm.com>
> Acked-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
>  config/arm/meson.build                             |   1 +
>  config/common_base                                 |   5 +
>  .../common/include/arch/arm/rte_pause_64.h         |  30 ++++++
>  lib/librte_eal/common/include/generic/rte_pause.h  | 106 +++++++++++++++++++++
>  4 files changed, 142 insertions(+)
> 
> diff --git a/config/arm/meson.build b/config/arm/meson.build
> index 979018e..b4b4cac 100644
> --- a/config/arm/meson.build
> +++ b/config/arm/meson.build
> @@ -26,6 +26,7 @@ flags_common_default = [
>  	['RTE_LIBRTE_AVP_PMD', false],
> 
>  	['RTE_SCHED_VECTOR', false],
> +	['RTE_ARM_USE_WFE', false],
>  ]
> 
>  flags_generic = [
> diff --git a/config/common_base b/config/common_base
> index 8ef75c2..8861713 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -111,6 +111,11 @@ CONFIG_RTE_MAX_VFIO_CONTAINERS=64
>  CONFIG_RTE_MALLOC_DEBUG=n
>  CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
>  CONFIG_RTE_USE_LIBBSD=n
> +# Use WFE instructions to implement the rte_wait_for_equal_xxx APIs,
> +# calling these APIs put the cores in low power state while waiting
> +# for the memory address to become equal to the expected value.
> +# This is supported only by aarch64.
> +CONFIG_RTE_ARM_USE_WFE=n
> 
>  #
>  # Recognize/ignore the AVX/AVX512 CPU flags for performance/power testing.
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> index 93895d3..dabde17 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> @@ -1,5 +1,6 @@
>  /* SPDX-License-Identifier: BSD-3-Clause
>   * Copyright(c) 2017 Cavium, Inc
> + * Copyright(c) 2019 Arm Limited
>   */
> 
>  #ifndef _RTE_PAUSE_ARM64_H_
> @@ -17,6 +18,35 @@ static inline void rte_pause(void)
>  	asm volatile("yield" ::: "memory");
>  }
> 
> +#ifdef RTE_ARM_USE_WFE
> +#define __WAIT_UNTIL_EQUAL(name, asm_op, wide, type) \
> +static __rte_always_inline void \
> +rte_wait_until_equal_##name(volatile type * addr, type expected) \
> +{ \
> +	type tmp; \
> +	asm volatile( \
> +		#asm_op " %" #wide "[tmp], %[addr]\n" \
> +		"cmp	%" #wide "[tmp], %" #wide "[expected]\n" \
> +		"b.eq	2f\n" \
> +		"sevl\n" \
> +		"1:	wfe\n" \
> +		#asm_op " %" #wide "[tmp], %[addr]\n" \
> +		"cmp	%" #wide "[tmp], %" #wide "[expected]\n" \
> +		"bne	1b\n" \
> +		"2:\n" \
> +		: [tmp] "=&r" (tmp) \
> +		: [addr] "Q"(*addr), [expected] "r"(expected) \
> +		: "cc", "memory"); \
> +}
> +/* Wait for *addr to be updated with expected value */
> +__WAIT_UNTIL_EQUAL(relaxed_16, ldxrh, w, uint16_t)
> +__WAIT_UNTIL_EQUAL(acquire_16, ldaxrh, w, uint16_t)
> +__WAIT_UNTIL_EQUAL(relaxed_32, ldxr, w, uint32_t)
> +__WAIT_UNTIL_EQUAL(acquire_32, ldaxr, w, uint32_t)
> +__WAIT_UNTIL_EQUAL(relaxed_64, ldxr, x, uint64_t)
> +__WAIT_UNTIL_EQUAL(acquire_64, ldaxr, x, uint64_t)
> +#endif
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/common/include/generic/rte_pause.h b/lib/librte_eal/common/include/generic/rte_pause.h
> index 52bd4db..8906473 100644
> --- a/lib/librte_eal/common/include/generic/rte_pause.h
> +++ b/lib/librte_eal/common/include/generic/rte_pause.h
> @@ -1,5 +1,6 @@
>  /* SPDX-License-Identifier: BSD-3-Clause
>   * Copyright(c) 2017 Cavium, Inc
> + * Copyright(c) 2019 Arm Limited
>   */
> 
>  #ifndef _RTE_PAUSE_H_
> @@ -12,6 +13,10 @@
>   *
>   */
> 
> +#include <stdint.h>
> +#include <rte_common.h>
> +#include <rte_atomic.h>
> +
>  /**
>   * Pause CPU execution for a short while
>   *
> @@ -20,4 +25,105 @@
>   */
>  static inline void rte_pause(void);
> 
> +/**
> + * Wait for *addr to be updated with a 16-bit expected value, with a relaxed
> + * memory ordering model meaning the loads around this API can be reordered.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param expected
> + *  A 16-bit expected value to be in the memory location.
> + */
> +__rte_always_inline
> +static void
> +rte_wait_until_equal_relaxed_16(volatile uint16_t *addr, uint16_t expected);
> +
> +/**
> + * Wait for *addr to be updated with a 32-bit expected value, with a relaxed
> + * memory ordering model meaning the loads around this API can be reordered.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param expected
> + *  A 32-bit expected value to be in the memory location.
> + */
> +__rte_always_inline
> +static void
> +rte_wait_until_equal_relaxed_32(volatile uint32_t *addr, uint32_t expected);
> +
> +/**
> + * Wait for *addr to be updated with a 64-bit expected value, with a relaxed
> + * memory ordering model meaning the loads around this API can be reordered.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param expected
> + *  A 64-bit expected value to be in the memory location.
> + */
> +__rte_always_inline
> +static void
> +rte_wait_until_equal_relaxed_64(volatile uint64_t *addr, uint64_t expected);
> +
> +/**
> + * Wait for *addr to be updated with a 16-bit expected value, with an acquire
> + * memory ordering model meaning the loads after this API can't be observed
> + * before this API.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param expected
> + *  A 16-bit expected value to be in the memory location.
> + */
> +__rte_always_inline
> +static void
> +rte_wait_until_equal_acquire_16(volatile uint16_t *addr, uint16_t expected);
> +
> +/**
> + * Wait for *addr to be updated with a 32-bit expected value, with an acquire
> + * memory ordering model meaning the loads after this API can't be observed
> + * before this API.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param expected
> + *  A 32-bit expected value to be in the memory location.
> + */
> +__rte_always_inline
> +static void
> +rte_wait_until_equal_acquire_32(volatile uint32_t *addr, uint32_t expected);

LGTM in general.
One stylish thing: wouldn't it be better to have an API like that:
rte_wait_until_equal_acquire_X(addr, expected, memory_order)
?

I.E. - pass memorder as parameter, not to incorporate it into function name?
Less functions, plus user can specify order himself.
Plus looks similar to C11 atomic instrincts.


> +
> +/**
> + * Wait for *addr to be updated with a 64-bit expected value, with an acquire
> + * memory ordering model meaning the loads after this API can't be observed
> + * before this API.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param expected
> + *  A 64-bit expected value to be in the memory location.
> + */
> +__rte_always_inline
> +static void
> +rte_wait_until_equal_acquire_64(volatile uint64_t *addr, uint64_t expected);
> +
> +#if !defined(RTE_ARM_USE_WFE)
> +#define __WAIT_UNTIL_EQUAL(op_name, size, type, memorder) \
> +__rte_always_inline \
> +static void	\
> +rte_wait_until_equal_##op_name##_##size(volatile type *addr, \
> +	type expected) \
> +{ \
> +	while (__atomic_load_n(addr, memorder) != expected) \
> +		rte_pause(); \
> +}
> +
> +/* Wait for *addr to be updated with expected value */
> +__WAIT_UNTIL_EQUAL(relaxed, 16, uint16_t, __ATOMIC_RELAXED)
> +__WAIT_UNTIL_EQUAL(acquire, 16, uint16_t, __ATOMIC_ACQUIRE)
> +__WAIT_UNTIL_EQUAL(relaxed, 32, uint32_t, __ATOMIC_RELAXED)
> +__WAIT_UNTIL_EQUAL(acquire, 32, uint32_t, __ATOMIC_ACQUIRE)
> +__WAIT_UNTIL_EQUAL(relaxed, 64, uint64_t, __ATOMIC_RELAXED)
> +__WAIT_UNTIL_EQUAL(acquire, 64, uint64_t, __ATOMIC_ACQUIRE)
> +#endif /* RTE_ARM_USE_WFE */
> +
>  #endif /* _RTE_PAUSE_H_ */
> --
> 2.7.4
  
David Marchand Oct. 17, 2019, 3:45 p.m. UTC | #3
On Fri, Sep 27, 2019 at 7:42 AM Gavin Hu <gavin.hu@arm.com> wrote:
>
> The rte_wait_until_equal_xx APIs abstract the functionality of
> 'polling for a memory location to become equal to a given value'.
>
> Add the RTE_ARM_USE_WFE configuration entry for aarch64, disabled
> by default. When it is enabled, the above APIs will call WFE instruction
> to save CPU cycles and power.

- As discussed on irc, I would prefer we have two stages for this:
* first stage, an architecture announces it has its own implementation
of this api, in such a case it defines RTE_ARCH_HAS_WFE in its
arch/xxx/rte_pause.h header before including generic/rte_pause.h
  The default implementation with C11 is then skipped in the generic header.
* second stage, in the arm64 header, if RTE_ARM_USE_WFE is set in the
configuration, then define RTE_ARCH_HAS_WFE

- Can you add a little description on the limitation of using WFE
instruction in the commit log?

- This is a new api, should be marked experimental, even if inlined.

Small comments inline.


>
> Signed-off-by: Gavin Hu <gavin.hu@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Steve Capper <steve.capper@arm.com>
> Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Reviewed-by: Phil Yang <phil.yang@arm.com>
> Acked-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
>  config/arm/meson.build                             |   1 +
>  config/common_base                                 |   5 +
>  .../common/include/arch/arm/rte_pause_64.h         |  30 ++++++
>  lib/librte_eal/common/include/generic/rte_pause.h  | 106 +++++++++++++++++++++
>  4 files changed, 142 insertions(+)
>
> diff --git a/config/arm/meson.build b/config/arm/meson.build
> index 979018e..b4b4cac 100644
> --- a/config/arm/meson.build
> +++ b/config/arm/meson.build
> @@ -26,6 +26,7 @@ flags_common_default = [
>         ['RTE_LIBRTE_AVP_PMD', false],
>
>         ['RTE_SCHED_VECTOR', false],
> +       ['RTE_ARM_USE_WFE', false],
>  ]
>
>  flags_generic = [
> diff --git a/config/common_base b/config/common_base
> index 8ef75c2..8861713 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -111,6 +111,11 @@ CONFIG_RTE_MAX_VFIO_CONTAINERS=64
>  CONFIG_RTE_MALLOC_DEBUG=n
>  CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
>  CONFIG_RTE_USE_LIBBSD=n
> +# Use WFE instructions to implement the rte_wait_for_equal_xxx APIs,
> +# calling these APIs put the cores in low power state while waiting
> +# for the memory address to become equal to the expected value.
> +# This is supported only by aarch64.
> +CONFIG_RTE_ARM_USE_WFE=n
>
>  #
>  # Recognize/ignore the AVX/AVX512 CPU flags for performance/power testing.
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> index 93895d3..dabde17 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> @@ -1,5 +1,6 @@
>  /* SPDX-License-Identifier: BSD-3-Clause
>   * Copyright(c) 2017 Cavium, Inc
> + * Copyright(c) 2019 Arm Limited
>   */
>
>  #ifndef _RTE_PAUSE_ARM64_H_
> @@ -17,6 +18,35 @@ static inline void rte_pause(void)
>         asm volatile("yield" ::: "memory");
>  }
>
> +#ifdef RTE_ARM_USE_WFE
> +#define __WAIT_UNTIL_EQUAL(name, asm_op, wide, type) \
> +static __rte_always_inline void \
> +rte_wait_until_equal_##name(volatile type * addr, type expected) \
> +{ \
> +       type tmp; \
> +       asm volatile( \
> +               #asm_op " %" #wide "[tmp], %[addr]\n" \
> +               "cmp    %" #wide "[tmp], %" #wide "[expected]\n" \
> +               "b.eq   2f\n" \
> +               "sevl\n" \
> +               "1:     wfe\n" \
> +               #asm_op " %" #wide "[tmp], %[addr]\n" \
> +               "cmp    %" #wide "[tmp], %" #wide "[expected]\n" \
> +               "bne    1b\n" \
> +               "2:\n" \
> +               : [tmp] "=&r" (tmp) \
> +               : [addr] "Q"(*addr), [expected] "r"(expected) \
> +               : "cc", "memory"); \
> +}
> +/* Wait for *addr to be updated with expected value */
> +__WAIT_UNTIL_EQUAL(relaxed_16, ldxrh, w, uint16_t)
> +__WAIT_UNTIL_EQUAL(acquire_16, ldaxrh, w, uint16_t)
> +__WAIT_UNTIL_EQUAL(relaxed_32, ldxr, w, uint32_t)
> +__WAIT_UNTIL_EQUAL(acquire_32, ldaxr, w, uint32_t)
> +__WAIT_UNTIL_EQUAL(relaxed_64, ldxr, x, uint64_t)
> +__WAIT_UNTIL_EQUAL(acquire_64, ldaxr, x, uint64_t)

Missing #undef __WAIT_UNTIL_EQUAL

> +#endif
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/common/include/generic/rte_pause.h b/lib/librte_eal/common/include/generic/rte_pause.h
> index 52bd4db..8906473 100644
> --- a/lib/librte_eal/common/include/generic/rte_pause.h
> +++ b/lib/librte_eal/common/include/generic/rte_pause.h
> @@ -1,5 +1,6 @@
>  /* SPDX-License-Identifier: BSD-3-Clause
>   * Copyright(c) 2017 Cavium, Inc
> + * Copyright(c) 2019 Arm Limited
>   */
>
>  #ifndef _RTE_PAUSE_H_
> @@ -12,6 +13,10 @@
>   *
>   */
>
> +#include <stdint.h>
> +#include <rte_common.h>
> +#include <rte_atomic.h>
> +
>  /**
>   * Pause CPU execution for a short while
>   *
> @@ -20,4 +25,105 @@
>   */
>  static inline void rte_pause(void);
>
> +/**

Missing warning on experimental api.

> + * Wait for *addr to be updated with a 16-bit expected value, with a relaxed
> + * memory ordering model meaning the loads around this API can be reordered.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param expected
> + *  A 16-bit expected value to be in the memory location.
> + */

Missing experimental tag.
Saying this only once, please update declarations below, plus doxygen header.


> +__rte_always_inline
> +static void
> +rte_wait_until_equal_relaxed_16(volatile uint16_t *addr, uint16_t expected);
> +
> +/**
> + * Wait for *addr to be updated with a 32-bit expected value, with a relaxed
> + * memory ordering model meaning the loads around this API can be reordered.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param expected
> + *  A 32-bit expected value to be in the memory location.
> + */
> +__rte_always_inline
> +static void
> +rte_wait_until_equal_relaxed_32(volatile uint32_t *addr, uint32_t expected);
> +
> +/**
> + * Wait for *addr to be updated with a 64-bit expected value, with a relaxed
> + * memory ordering model meaning the loads around this API can be reordered.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param expected
> + *  A 64-bit expected value to be in the memory location.
> + */
> +__rte_always_inline
> +static void
> +rte_wait_until_equal_relaxed_64(volatile uint64_t *addr, uint64_t expected);
> +
> +/**
> + * Wait for *addr to be updated with a 16-bit expected value, with an acquire
> + * memory ordering model meaning the loads after this API can't be observed
> + * before this API.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param expected
> + *  A 16-bit expected value to be in the memory location.
> + */
> +__rte_always_inline
> +static void
> +rte_wait_until_equal_acquire_16(volatile uint16_t *addr, uint16_t expected);
> +
> +/**
> + * Wait for *addr to be updated with a 32-bit expected value, with an acquire
> + * memory ordering model meaning the loads after this API can't be observed
> + * before this API.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param expected
> + *  A 32-bit expected value to be in the memory location.
> + */
> +__rte_always_inline
> +static void
> +rte_wait_until_equal_acquire_32(volatile uint32_t *addr, uint32_t expected);
> +
> +/**
> + * Wait for *addr to be updated with a 64-bit expected value, with an acquire
> + * memory ordering model meaning the loads after this API can't be observed
> + * before this API.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param expected
> + *  A 64-bit expected value to be in the memory location.
> + */
> +__rte_always_inline
> +static void
> +rte_wait_until_equal_acquire_64(volatile uint64_t *addr, uint64_t expected);
> +
> +#if !defined(RTE_ARM_USE_WFE)
> +#define __WAIT_UNTIL_EQUAL(op_name, size, type, memorder) \
> +__rte_always_inline \
> +static void    \
> +rte_wait_until_equal_##op_name##_##size(volatile type *addr, \
> +       type expected) \
> +{ \
> +       while (__atomic_load_n(addr, memorder) != expected) \
> +               rte_pause(); \
> +}
> +
> +/* Wait for *addr to be updated with expected value */
> +__WAIT_UNTIL_EQUAL(relaxed, 16, uint16_t, __ATOMIC_RELAXED)
> +__WAIT_UNTIL_EQUAL(acquire, 16, uint16_t, __ATOMIC_ACQUIRE)
> +__WAIT_UNTIL_EQUAL(relaxed, 32, uint32_t, __ATOMIC_RELAXED)
> +__WAIT_UNTIL_EQUAL(acquire, 32, uint32_t, __ATOMIC_ACQUIRE)
> +__WAIT_UNTIL_EQUAL(relaxed, 64, uint64_t, __ATOMIC_RELAXED)
> +__WAIT_UNTIL_EQUAL(acquire, 64, uint64_t, __ATOMIC_ACQUIRE)

#undef __WAIT_UNTIL_EQUAL

> +#endif /* RTE_ARM_USE_WFE */
> +
>  #endif /* _RTE_PAUSE_H_ */
> --
> 2.7.4
>
  
Ananyev, Konstantin Oct. 17, 2019, 4:44 p.m. UTC | #4
> 
> Hi Gavin,
> 
> >
> > The rte_wait_until_equal_xx APIs abstract the functionality of
> > 'polling for a memory location to become equal to a given value'.
> >
> > Add the RTE_ARM_USE_WFE configuration entry for aarch64, disabled
> > by default. When it is enabled, the above APIs will call WFE instruction
> > to save CPU cycles and power.
> >
> > Signed-off-by: Gavin Hu <gavin.hu@arm.com>
> > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > Reviewed-by: Steve Capper <steve.capper@arm.com>
> > Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > Acked-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> > ---
> >  config/arm/meson.build                             |   1 +
> >  config/common_base                                 |   5 +
> >  .../common/include/arch/arm/rte_pause_64.h         |  30 ++++++
> >  lib/librte_eal/common/include/generic/rte_pause.h  | 106 +++++++++++++++++++++
> >  4 files changed, 142 insertions(+)
> >
> > diff --git a/config/arm/meson.build b/config/arm/meson.build
> > index 979018e..b4b4cac 100644
> > --- a/config/arm/meson.build
> > +++ b/config/arm/meson.build
> > @@ -26,6 +26,7 @@ flags_common_default = [
> >  	['RTE_LIBRTE_AVP_PMD', false],
> >
> >  	['RTE_SCHED_VECTOR', false],
> > +	['RTE_ARM_USE_WFE', false],
> >  ]
> >
> >  flags_generic = [
> > diff --git a/config/common_base b/config/common_base
> > index 8ef75c2..8861713 100644
> > --- a/config/common_base
> > +++ b/config/common_base
> > @@ -111,6 +111,11 @@ CONFIG_RTE_MAX_VFIO_CONTAINERS=64
> >  CONFIG_RTE_MALLOC_DEBUG=n
> >  CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
> >  CONFIG_RTE_USE_LIBBSD=n
> > +# Use WFE instructions to implement the rte_wait_for_equal_xxx APIs,
> > +# calling these APIs put the cores in low power state while waiting
> > +# for the memory address to become equal to the expected value.
> > +# This is supported only by aarch64.
> > +CONFIG_RTE_ARM_USE_WFE=n
> >
> >  #
> >  # Recognize/ignore the AVX/AVX512 CPU flags for performance/power testing.
> > diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > index 93895d3..dabde17 100644
> > --- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > +++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > @@ -1,5 +1,6 @@
> >  /* SPDX-License-Identifier: BSD-3-Clause
> >   * Copyright(c) 2017 Cavium, Inc
> > + * Copyright(c) 2019 Arm Limited
> >   */
> >
> >  #ifndef _RTE_PAUSE_ARM64_H_
> > @@ -17,6 +18,35 @@ static inline void rte_pause(void)
> >  	asm volatile("yield" ::: "memory");
> >  }
> >
> > +#ifdef RTE_ARM_USE_WFE
> > +#define __WAIT_UNTIL_EQUAL(name, asm_op, wide, type) \
> > +static __rte_always_inline void \
> > +rte_wait_until_equal_##name(volatile type * addr, type expected) \
> > +{ \
> > +	type tmp; \
> > +	asm volatile( \
> > +		#asm_op " %" #wide "[tmp], %[addr]\n" \
> > +		"cmp	%" #wide "[tmp], %" #wide "[expected]\n" \
> > +		"b.eq	2f\n" \
> > +		"sevl\n" \
> > +		"1:	wfe\n" \
> > +		#asm_op " %" #wide "[tmp], %[addr]\n" \
> > +		"cmp	%" #wide "[tmp], %" #wide "[expected]\n" \
> > +		"bne	1b\n" \
> > +		"2:\n" \
> > +		: [tmp] "=&r" (tmp) \
> > +		: [addr] "Q"(*addr), [expected] "r"(expected) \
> > +		: "cc", "memory"); \
> > +}

One more thought:
Why do you need to write asm code for the whole procedure?
Why not to do like linux kernel:
define wfe() and sev() macros and use them inside normal C code?
 
#define sev()		asm volatile("sev" : : : "memory")
#define wfe()		asm volatile("wfe" : : : "memory")

Then:
rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected, int memorder)
{
     if (__atomic_load_n(addr, memorder) != expected) {
         sev();
         do {
             wfe();
         } while ((__atomic_load_n(addr, memorder) != expected);
     }
}

?

> > +/* Wait for *addr to be updated with expected value */
> > +__WAIT_UNTIL_EQUAL(relaxed_16, ldxrh, w, uint16_t)
> > +__WAIT_UNTIL_EQUAL(acquire_16, ldaxrh, w, uint16_t)
> > +__WAIT_UNTIL_EQUAL(relaxed_32, ldxr, w, uint32_t)
> > +__WAIT_UNTIL_EQUAL(acquire_32, ldaxr, w, uint32_t)
> > +__WAIT_UNTIL_EQUAL(relaxed_64, ldxr, x, uint64_t)
> > +__WAIT_UNTIL_EQUAL(acquire_64, ldaxr, x, uint64_t)
> > +#endif
> > +
> >  #ifdef __cplusplus
> >  }
> >  #endif
> > diff --git a/lib/librte_eal/common/include/generic/rte_pause.h b/lib/librte_eal/common/include/generic/rte_pause.h
> > index 52bd4db..8906473 100644
> > --- a/lib/librte_eal/common/include/generic/rte_pause.h
> > +++ b/lib/librte_eal/common/include/generic/rte_pause.h
> > @@ -1,5 +1,6 @@
> >  /* SPDX-License-Identifier: BSD-3-Clause
> >   * Copyright(c) 2017 Cavium, Inc
> > + * Copyright(c) 2019 Arm Limited
> >   */
> >
> >  #ifndef _RTE_PAUSE_H_
> > @@ -12,6 +13,10 @@
> >   *
> >   */
> >
> > +#include <stdint.h>
> > +#include <rte_common.h>
> > +#include <rte_atomic.h>
> > +
> >  /**
> >   * Pause CPU execution for a short while
> >   *
> > @@ -20,4 +25,105 @@
> >   */
> >  static inline void rte_pause(void);
> >
> > +/**
> > + * Wait for *addr to be updated with a 16-bit expected value, with a relaxed
> > + * memory ordering model meaning the loads around this API can be reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 16-bit expected value to be in the memory location.
> > + */
> > +__rte_always_inline
> > +static void
> > +rte_wait_until_equal_relaxed_16(volatile uint16_t *addr, uint16_t expected);
> > +
> > +/**
> > + * Wait for *addr to be updated with a 32-bit expected value, with a relaxed
> > + * memory ordering model meaning the loads around this API can be reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 32-bit expected value to be in the memory location.
> > + */
> > +__rte_always_inline
> > +static void
> > +rte_wait_until_equal_relaxed_32(volatile uint32_t *addr, uint32_t expected);
> > +
> > +/**
> > + * Wait for *addr to be updated with a 64-bit expected value, with a relaxed
> > + * memory ordering model meaning the loads around this API can be reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 64-bit expected value to be in the memory location.
> > + */
> > +__rte_always_inline
> > +static void
> > +rte_wait_until_equal_relaxed_64(volatile uint64_t *addr, uint64_t expected);
> > +
> > +/**
> > + * Wait for *addr to be updated with a 16-bit expected value, with an acquire
> > + * memory ordering model meaning the loads after this API can't be observed
> > + * before this API.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 16-bit expected value to be in the memory location.
> > + */
> > +__rte_always_inline
> > +static void
> > +rte_wait_until_equal_acquire_16(volatile uint16_t *addr, uint16_t expected);
> > +
> > +/**
> > + * Wait for *addr to be updated with a 32-bit expected value, with an acquire
> > + * memory ordering model meaning the loads after this API can't be observed
> > + * before this API.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 32-bit expected value to be in the memory location.
> > + */
> > +__rte_always_inline
> > +static void
> > +rte_wait_until_equal_acquire_32(volatile uint32_t *addr, uint32_t expected);
> 
> LGTM in general.
> One stylish thing: wouldn't it be better to have an API like that:
> rte_wait_until_equal_acquire_X(addr, expected, memory_order)
> ?
> 
> I.E. - pass memorder as parameter, not to incorporate it into function name?
> Less functions, plus user can specify order himself.
> Plus looks similar to C11 atomic instrincts.
> 
> 
> > +
> > +/**
> > + * Wait for *addr to be updated with a 64-bit expected value, with an acquire
> > + * memory ordering model meaning the loads after this API can't be observed
> > + * before this API.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 64-bit expected value to be in the memory location.
> > + */
> > +__rte_always_inline
> > +static void
> > +rte_wait_until_equal_acquire_64(volatile uint64_t *addr, uint64_t expected);
> > +
> > +#if !defined(RTE_ARM_USE_WFE)
> > +#define __WAIT_UNTIL_EQUAL(op_name, size, type, memorder) \
> > +__rte_always_inline \
> > +static void	\
> > +rte_wait_until_equal_##op_name##_##size(volatile type *addr, \
> > +	type expected) \
> > +{ \
> > +	while (__atomic_load_n(addr, memorder) != expected) \
> > +		rte_pause(); \
> > +}
> > +
> > +/* Wait for *addr to be updated with expected value */
> > +__WAIT_UNTIL_EQUAL(relaxed, 16, uint16_t, __ATOMIC_RELAXED)
> > +__WAIT_UNTIL_EQUAL(acquire, 16, uint16_t, __ATOMIC_ACQUIRE)
> > +__WAIT_UNTIL_EQUAL(relaxed, 32, uint32_t, __ATOMIC_RELAXED)
> > +__WAIT_UNTIL_EQUAL(acquire, 32, uint32_t, __ATOMIC_ACQUIRE)
> > +__WAIT_UNTIL_EQUAL(relaxed, 64, uint64_t, __ATOMIC_RELAXED)
> > +__WAIT_UNTIL_EQUAL(acquire, 64, uint64_t, __ATOMIC_ACQUIRE)
> > +#endif /* RTE_ARM_USE_WFE */
> > +
> >  #endif /* _RTE_PAUSE_H_ */
> > --
> > 2.7.4
  
Gavin Hu Oct. 21, 2019, 7:21 a.m. UTC | #5
> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Thursday, October 17, 2019 9:15 PM
> To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; dev@dpdk.org
> Cc: nd <nd@arm.com>; thomas@monjalon.net;
> stephen@networkplumber.org; hemant.agrawal@nxp.com;
> jerinj@marvell.com; pbhagavatula@marvell.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Ruifeng Wang (Arm Technology China)
> <Ruifeng.Wang@arm.com>; Phil Yang (Arm Technology China)
> <Phil.Yang@arm.com>; Steve Capper <Steve.Capper@arm.com>
> Subject: RE: [dpdk-dev] [PATCH v7 2/7] eal: add the APIs to wait until equal
> 
> Hi Gavin,
> 
> >
> > The rte_wait_until_equal_xx APIs abstract the functionality of
> > 'polling for a memory location to become equal to a given value'.
> >
> > Add the RTE_ARM_USE_WFE configuration entry for aarch64, disabled
> > by default. When it is enabled, the above APIs will call WFE instruction
> > to save CPU cycles and power.
> >
> > Signed-off-by: Gavin Hu <gavin.hu@arm.com>
> > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > Reviewed-by: Steve Capper <steve.capper@arm.com>
> > Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > Acked-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> > ---
> >  config/arm/meson.build                             |   1 +
> >  config/common_base                                 |   5 +
> >  .../common/include/arch/arm/rte_pause_64.h         |  30 ++++++
> >  lib/librte_eal/common/include/generic/rte_pause.h  | 106
> +++++++++++++++++++++
> >  4 files changed, 142 insertions(+)
> >
> > diff --git a/config/arm/meson.build b/config/arm/meson.build
> > index 979018e..b4b4cac 100644
> > --- a/config/arm/meson.build
> > +++ b/config/arm/meson.build
> > @@ -26,6 +26,7 @@ flags_common_default = [
> >  	['RTE_LIBRTE_AVP_PMD', false],
> >
> >  	['RTE_SCHED_VECTOR', false],
> > +	['RTE_ARM_USE_WFE', false],
> >  ]
> >
> >  flags_generic = [
> > diff --git a/config/common_base b/config/common_base
> > index 8ef75c2..8861713 100644
> > --- a/config/common_base
> > +++ b/config/common_base
> > @@ -111,6 +111,11 @@ CONFIG_RTE_MAX_VFIO_CONTAINERS=64
> >  CONFIG_RTE_MALLOC_DEBUG=n
> >  CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
> >  CONFIG_RTE_USE_LIBBSD=n
> > +# Use WFE instructions to implement the rte_wait_for_equal_xxx APIs,
> > +# calling these APIs put the cores in low power state while waiting
> > +# for the memory address to become equal to the expected value.
> > +# This is supported only by aarch64.
> > +CONFIG_RTE_ARM_USE_WFE=n
> >
> >  #
> >  # Recognize/ignore the AVX/AVX512 CPU flags for performance/power
> testing.
> > diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > index 93895d3..dabde17 100644
> > --- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > +++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > @@ -1,5 +1,6 @@
> >  /* SPDX-License-Identifier: BSD-3-Clause
> >   * Copyright(c) 2017 Cavium, Inc
> > + * Copyright(c) 2019 Arm Limited
> >   */
> >
> >  #ifndef _RTE_PAUSE_ARM64_H_
> > @@ -17,6 +18,35 @@ static inline void rte_pause(void)
> >  	asm volatile("yield" ::: "memory");
> >  }
> >
> > +#ifdef RTE_ARM_USE_WFE
> > +#define __WAIT_UNTIL_EQUAL(name, asm_op, wide, type) \
> > +static __rte_always_inline void \
> > +rte_wait_until_equal_##name(volatile type * addr, type expected) \
> > +{ \
> > +	type tmp; \
> > +	asm volatile( \
> > +		#asm_op " %" #wide "[tmp], %[addr]\n" \
> > +		"cmp	%" #wide "[tmp], %" #wide "[expected]\n" \
> > +		"b.eq	2f\n" \
> > +		"sevl\n" \
> > +		"1:	wfe\n" \
> > +		#asm_op " %" #wide "[tmp], %[addr]\n" \
> > +		"cmp	%" #wide "[tmp], %" #wide "[expected]\n" \
> > +		"bne	1b\n" \
> > +		"2:\n" \
> > +		: [tmp] "=&r" (tmp) \
> > +		: [addr] "Q"(*addr), [expected] "r"(expected) \
> > +		: "cc", "memory"); \
> > +}
> > +/* Wait for *addr to be updated with expected value */
> > +__WAIT_UNTIL_EQUAL(relaxed_16, ldxrh, w, uint16_t)
> > +__WAIT_UNTIL_EQUAL(acquire_16, ldaxrh, w, uint16_t)
> > +__WAIT_UNTIL_EQUAL(relaxed_32, ldxr, w, uint32_t)
> > +__WAIT_UNTIL_EQUAL(acquire_32, ldaxr, w, uint32_t)
> > +__WAIT_UNTIL_EQUAL(relaxed_64, ldxr, x, uint64_t)
> > +__WAIT_UNTIL_EQUAL(acquire_64, ldaxr, x, uint64_t)
> > +#endif
> > +
> >  #ifdef __cplusplus
> >  }
> >  #endif
> > diff --git a/lib/librte_eal/common/include/generic/rte_pause.h
> b/lib/librte_eal/common/include/generic/rte_pause.h
> > index 52bd4db..8906473 100644
> > --- a/lib/librte_eal/common/include/generic/rte_pause.h
> > +++ b/lib/librte_eal/common/include/generic/rte_pause.h
> > @@ -1,5 +1,6 @@
> >  /* SPDX-License-Identifier: BSD-3-Clause
> >   * Copyright(c) 2017 Cavium, Inc
> > + * Copyright(c) 2019 Arm Limited
> >   */
> >
> >  #ifndef _RTE_PAUSE_H_
> > @@ -12,6 +13,10 @@
> >   *
> >   */
> >
> > +#include <stdint.h>
> > +#include <rte_common.h>
> > +#include <rte_atomic.h>
> > +
> >  /**
> >   * Pause CPU execution for a short while
> >   *
> > @@ -20,4 +25,105 @@
> >   */
> >  static inline void rte_pause(void);
> >
> > +/**
> > + * Wait for *addr to be updated with a 16-bit expected value, with a
> relaxed
> > + * memory ordering model meaning the loads around this API can be
> reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 16-bit expected value to be in the memory location.
> > + */
> > +__rte_always_inline
> > +static void
> > +rte_wait_until_equal_relaxed_16(volatile uint16_t *addr, uint16_t
> expected);
> > +
> > +/**
> > + * Wait for *addr to be updated with a 32-bit expected value, with a
> relaxed
> > + * memory ordering model meaning the loads around this API can be
> reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 32-bit expected value to be in the memory location.
> > + */
> > +__rte_always_inline
> > +static void
> > +rte_wait_until_equal_relaxed_32(volatile uint32_t *addr, uint32_t
> expected);
> > +
> > +/**
> > + * Wait for *addr to be updated with a 64-bit expected value, with a
> relaxed
> > + * memory ordering model meaning the loads around this API can be
> reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 64-bit expected value to be in the memory location.
> > + */
> > +__rte_always_inline
> > +static void
> > +rte_wait_until_equal_relaxed_64(volatile uint64_t *addr, uint64_t
> expected);
> > +
> > +/**
> > + * Wait for *addr to be updated with a 16-bit expected value, with an
> acquire
> > + * memory ordering model meaning the loads after this API can't be
> observed
> > + * before this API.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 16-bit expected value to be in the memory location.
> > + */
> > +__rte_always_inline
> > +static void
> > +rte_wait_until_equal_acquire_16(volatile uint16_t *addr, uint16_t
> expected);
> > +
> > +/**
> > + * Wait for *addr to be updated with a 32-bit expected value, with an
> acquire
> > + * memory ordering model meaning the loads after this API can't be
> observed
> > + * before this API.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 32-bit expected value to be in the memory location.
> > + */
> > +__rte_always_inline
> > +static void
> > +rte_wait_until_equal_acquire_32(volatile uint32_t *addr, uint32_t
> expected);
> 
> LGTM in general.
> One stylish thing: wouldn't it be better to have an API like that:
> rte_wait_until_equal_acquire_X(addr, expected, memory_order)
> ?
> 
> I.E. - pass memorder as parameter, not to incorporate it into function name?
> Less functions, plus user can specify order himself.
> Plus looks similar to C11 atomic instrincts.
> 
Thanks for your comment, will fix this in v8.
> > +
> > +/**
> > + * Wait for *addr to be updated with a 64-bit expected value, with an
> acquire
> > + * memory ordering model meaning the loads after this API can't be
> observed
> > + * before this API.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 64-bit expected value to be in the memory location.
> > + */
> > +__rte_always_inline
> > +static void
> > +rte_wait_until_equal_acquire_64(volatile uint64_t *addr, uint64_t
> expected);
> > +
> > +#if !defined(RTE_ARM_USE_WFE)
> > +#define __WAIT_UNTIL_EQUAL(op_name, size, type, memorder) \
> > +__rte_always_inline \
> > +static void	\
> > +rte_wait_until_equal_##op_name##_##size(volatile type *addr, \
> > +	type expected) \
> > +{ \
> > +	while (__atomic_load_n(addr, memorder) != expected) \
> > +		rte_pause(); \
> > +}
> > +
> > +/* Wait for *addr to be updated with expected value */
> > +__WAIT_UNTIL_EQUAL(relaxed, 16, uint16_t, __ATOMIC_RELAXED)
> > +__WAIT_UNTIL_EQUAL(acquire, 16, uint16_t, __ATOMIC_ACQUIRE)
> > +__WAIT_UNTIL_EQUAL(relaxed, 32, uint32_t, __ATOMIC_RELAXED)
> > +__WAIT_UNTIL_EQUAL(acquire, 32, uint32_t, __ATOMIC_ACQUIRE)
> > +__WAIT_UNTIL_EQUAL(relaxed, 64, uint64_t, __ATOMIC_RELAXED)
> > +__WAIT_UNTIL_EQUAL(acquire, 64, uint64_t, __ATOMIC_ACQUIRE)
> > +#endif /* RTE_ARM_USE_WFE */
> > +
> >  #endif /* _RTE_PAUSE_H_ */
> > --
> > 2.7.4
  
Gavin Hu Oct. 21, 2019, 7:38 a.m. UTC | #6
Hi David, 

One comment about the experimental tag for the API inlined. 
/Gavin
> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: Thursday, October 17, 2019 11:45 PM
> To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>
> Cc: dev <dev@dpdk.org>; nd <nd@arm.com>; thomas@monjalon.net;
> Stephen Hemminger <stephen@networkplumber.org>;
> hemant.agrawal@nxp.com; jerinj@marvell.com; Pavan Nikhilesh
> <pbhagavatula@marvell.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Ruifeng Wang (Arm Technology China)
> <Ruifeng.Wang@arm.com>; Phil Yang (Arm Technology China)
> <Phil.Yang@arm.com>; Steve Capper <Steve.Capper@arm.com>
> Subject: Re: [dpdk-dev] [PATCH v7 2/7] eal: add the APIs to wait until equal
> 
> On Fri, Sep 27, 2019 at 7:42 AM Gavin Hu <gavin.hu@arm.com> wrote:
> >
> > The rte_wait_until_equal_xx APIs abstract the functionality of
> > 'polling for a memory location to become equal to a given value'.
> >
> > Add the RTE_ARM_USE_WFE configuration entry for aarch64, disabled
> > by default. When it is enabled, the above APIs will call WFE instruction
> > to save CPU cycles and power.
> 
> - As discussed on irc, I would prefer we have two stages for this:
> * first stage, an architecture announces it has its own implementation
> of this api, in such a case it defines RTE_ARCH_HAS_WFE in its
> arch/xxx/rte_pause.h header before including generic/rte_pause.h
>   The default implementation with C11 is then skipped in the generic
> header.
> * second stage, in the arm64 header, if RTE_ARM_USE_WFE is set in the
> configuration, then define RTE_ARCH_HAS_WFE
Will fix in v8
> - Can you add a little description on the limitation of using WFE
> instruction in the commit log?
Will fix in v8
> 
> - This is a new api, should be marked experimental, even if inlined.
It is ok to add for the patches except the rte_ring, which called the API in the .h file, other than the .c file. 
For the .h file is included by a lot of components, which require adding 'allowing_experimenal_apis = true' to the meson.build and makefile.
I am worried adding too many of these changes is confusing. I may leave this patch out of the series if there is no decorous solutions. 
/Gavin
> Small comments inline.
> 
> 
> >
> > Signed-off-by: Gavin Hu <gavin.hu@arm.com>
> > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > Reviewed-by: Steve Capper <steve.capper@arm.com>
> > Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > Acked-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> > ---
> >  config/arm/meson.build                             |   1 +
> >  config/common_base                                 |   5 +
> >  .../common/include/arch/arm/rte_pause_64.h         |  30 ++++++
> >  lib/librte_eal/common/include/generic/rte_pause.h  | 106
> +++++++++++++++++++++
> >  4 files changed, 142 insertions(+)
> >
> > diff --git a/config/arm/meson.build b/config/arm/meson.build
> > index 979018e..b4b4cac 100644
> > --- a/config/arm/meson.build
> > +++ b/config/arm/meson.build
> > @@ -26,6 +26,7 @@ flags_common_default = [
> >         ['RTE_LIBRTE_AVP_PMD', false],
> >
> >         ['RTE_SCHED_VECTOR', false],
> > +       ['RTE_ARM_USE_WFE', false],
> >  ]
> >
> >  flags_generic = [
> > diff --git a/config/common_base b/config/common_base
> > index 8ef75c2..8861713 100644
> > --- a/config/common_base
> > +++ b/config/common_base
> > @@ -111,6 +111,11 @@ CONFIG_RTE_MAX_VFIO_CONTAINERS=64
> >  CONFIG_RTE_MALLOC_DEBUG=n
> >  CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
> >  CONFIG_RTE_USE_LIBBSD=n
> > +# Use WFE instructions to implement the rte_wait_for_equal_xxx APIs,
> > +# calling these APIs put the cores in low power state while waiting
> > +# for the memory address to become equal to the expected value.
> > +# This is supported only by aarch64.
> > +CONFIG_RTE_ARM_USE_WFE=n
> >
> >  #
> >  # Recognize/ignore the AVX/AVX512 CPU flags for performance/power
> testing.
> > diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > index 93895d3..dabde17 100644
> > --- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > +++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > @@ -1,5 +1,6 @@
> >  /* SPDX-License-Identifier: BSD-3-Clause
> >   * Copyright(c) 2017 Cavium, Inc
> > + * Copyright(c) 2019 Arm Limited
> >   */
> >
> >  #ifndef _RTE_PAUSE_ARM64_H_
> > @@ -17,6 +18,35 @@ static inline void rte_pause(void)
> >         asm volatile("yield" ::: "memory");
> >  }
> >
> > +#ifdef RTE_ARM_USE_WFE
> > +#define __WAIT_UNTIL_EQUAL(name, asm_op, wide, type) \
> > +static __rte_always_inline void \
> > +rte_wait_until_equal_##name(volatile type * addr, type expected) \
> > +{ \
> > +       type tmp; \
> > +       asm volatile( \
> > +               #asm_op " %" #wide "[tmp], %[addr]\n" \
> > +               "cmp    %" #wide "[tmp], %" #wide "[expected]\n" \
> > +               "b.eq   2f\n" \
> > +               "sevl\n" \
> > +               "1:     wfe\n" \
> > +               #asm_op " %" #wide "[tmp], %[addr]\n" \
> > +               "cmp    %" #wide "[tmp], %" #wide "[expected]\n" \
> > +               "bne    1b\n" \
> > +               "2:\n" \
> > +               : [tmp] "=&r" (tmp) \
> > +               : [addr] "Q"(*addr), [expected] "r"(expected) \
> > +               : "cc", "memory"); \
> > +}
> > +/* Wait for *addr to be updated with expected value */
> > +__WAIT_UNTIL_EQUAL(relaxed_16, ldxrh, w, uint16_t)
> > +__WAIT_UNTIL_EQUAL(acquire_16, ldaxrh, w, uint16_t)
> > +__WAIT_UNTIL_EQUAL(relaxed_32, ldxr, w, uint32_t)
> > +__WAIT_UNTIL_EQUAL(acquire_32, ldaxr, w, uint32_t)
> > +__WAIT_UNTIL_EQUAL(relaxed_64, ldxr, x, uint64_t)
> > +__WAIT_UNTIL_EQUAL(acquire_64, ldaxr, x, uint64_t)
> 
> Missing #undef __WAIT_UNTIL_EQUAL
Will fix in v8
> 
> > +#endif
> > +
> >  #ifdef __cplusplus
> >  }
> >  #endif
> > diff --git a/lib/librte_eal/common/include/generic/rte_pause.h
> b/lib/librte_eal/common/include/generic/rte_pause.h
> > index 52bd4db..8906473 100644
> > --- a/lib/librte_eal/common/include/generic/rte_pause.h
> > +++ b/lib/librte_eal/common/include/generic/rte_pause.h
> > @@ -1,5 +1,6 @@
> >  /* SPDX-License-Identifier: BSD-3-Clause
> >   * Copyright(c) 2017 Cavium, Inc
> > + * Copyright(c) 2019 Arm Limited
> >   */
> >
> >  #ifndef _RTE_PAUSE_H_
> > @@ -12,6 +13,10 @@
> >   *
> >   */
> >
> > +#include <stdint.h>
> > +#include <rte_common.h>
> > +#include <rte_atomic.h>
> > +
> >  /**
> >   * Pause CPU execution for a short while
> >   *
> > @@ -20,4 +25,105 @@
> >   */
> >  static inline void rte_pause(void);
> >
> > +/**
> 
> Missing warning on experimental api.
Will add it in v8.
> 
> > + * Wait for *addr to be updated with a 16-bit expected value, with a
> relaxed
> > + * memory ordering model meaning the loads around this API can be
> reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 16-bit expected value to be in the memory location.
> > + */
> 
> Missing experimental tag.
> Saying this only once, please update declarations below, plus doxygen
> header.
Will fix all in v8.
> 
> > +__rte_always_inline
> > +static void
> > +rte_wait_until_equal_relaxed_16(volatile uint16_t *addr, uint16_t
> expected);
> > +
> > +/**
> > + * Wait for *addr to be updated with a 32-bit expected value, with a
> relaxed
> > + * memory ordering model meaning the loads around this API can be
> reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 32-bit expected value to be in the memory location.
> > + */
> > +__rte_always_inline
> > +static void
> > +rte_wait_until_equal_relaxed_32(volatile uint32_t *addr, uint32_t
> expected);
> > +
> > +/**
> > + * Wait for *addr to be updated with a 64-bit expected value, with a
> relaxed
> > + * memory ordering model meaning the loads around this API can be
> reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 64-bit expected value to be in the memory location.
> > + */
> > +__rte_always_inline
> > +static void
> > +rte_wait_until_equal_relaxed_64(volatile uint64_t *addr, uint64_t
> expected);
> > +
> > +/**
> > + * Wait for *addr to be updated with a 16-bit expected value, with an
> acquire
> > + * memory ordering model meaning the loads after this API can't be
> observed
> > + * before this API.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 16-bit expected value to be in the memory location.
> > + */
> > +__rte_always_inline
> > +static void
> > +rte_wait_until_equal_acquire_16(volatile uint16_t *addr, uint16_t
> expected);
> > +
> > +/**
> > + * Wait for *addr to be updated with a 32-bit expected value, with an
> acquire
> > + * memory ordering model meaning the loads after this API can't be
> observed
> > + * before this API.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 32-bit expected value to be in the memory location.
> > + */
> > +__rte_always_inline
> > +static void
> > +rte_wait_until_equal_acquire_32(volatile uint32_t *addr, uint32_t
> expected);
> > +
> > +/**
> > + * Wait for *addr to be updated with a 64-bit expected value, with an
> acquire
> > + * memory ordering model meaning the loads after this API can't be
> observed
> > + * before this API.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param expected
> > + *  A 64-bit expected value to be in the memory location.
> > + */
> > +__rte_always_inline
> > +static void
> > +rte_wait_until_equal_acquire_64(volatile uint64_t *addr, uint64_t
> expected);
> > +
> > +#if !defined(RTE_ARM_USE_WFE)
> > +#define __WAIT_UNTIL_EQUAL(op_name, size, type, memorder) \
> > +__rte_always_inline \
> > +static void    \
> > +rte_wait_until_equal_##op_name##_##size(volatile type *addr, \
> > +       type expected) \
> > +{ \
> > +       while (__atomic_load_n(addr, memorder) != expected) \
> > +               rte_pause(); \
> > +}
> > +
> > +/* Wait for *addr to be updated with expected value */
> > +__WAIT_UNTIL_EQUAL(relaxed, 16, uint16_t, __ATOMIC_RELAXED)
> > +__WAIT_UNTIL_EQUAL(acquire, 16, uint16_t, __ATOMIC_ACQUIRE)
> > +__WAIT_UNTIL_EQUAL(relaxed, 32, uint32_t, __ATOMIC_RELAXED)
> > +__WAIT_UNTIL_EQUAL(acquire, 32, uint32_t, __ATOMIC_ACQUIRE)
> > +__WAIT_UNTIL_EQUAL(relaxed, 64, uint64_t, __ATOMIC_RELAXED)
> > +__WAIT_UNTIL_EQUAL(acquire, 64, uint64_t, __ATOMIC_ACQUIRE)
> 
> #undef __WAIT_UNTIL_EQUAL
> 
> > +#endif /* RTE_ARM_USE_WFE */
> > +
> >  #endif /* _RTE_PAUSE_H_ */
> > --
> > 2.7.4
> >
> 
> 
> --
> David Marchand
  
David Marchand Oct. 21, 2019, 7:17 p.m. UTC | #7
On Mon, Oct 21, 2019 at 9:39 AM Gavin Hu (Arm Technology China)
<Gavin.Hu@arm.com> wrote:
> > -----Original Message-----
> > From: David Marchand <david.marchand@redhat.com>
> > - This is a new api, should be marked experimental, even if inlined.
> It is ok to add for the patches except the rte_ring, which called the API in the .h file, other than the .c file.
> For the .h file is included by a lot of components, which require adding 'allowing_experimenal_apis = true' to the meson.build and makefile.
> I am worried adding too many of these changes is confusing. I may leave this patch out of the series if there is no decorous solutions.

You can still keep the current code in the ring headers under a
#ifndef ALLOW_EXPERIMENTAL_API banner and put the call to your new
experimental api in the #else part of it.
Something like:

#ifndef ALLOW_EXPERIMENTAL_API
                while (unlikely(ht->tail != old_val))
                        rte_pause();
#else
                rte_wait_until_equal_relaxed_32(&ht->tail, old_val);

#endif

This way, if the application enables the experimental api, then the
ring code will benefit from it, else it will rely on the current
stable code.
  
Gavin Hu Oct. 23, 2019, 4:20 p.m. UTC | #8
Hi Konstantin,

> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Friday, October 18, 2019 12:44 AM
> To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>;
> dev@dpdk.org
> Cc: nd <nd@arm.com>; thomas@monjalon.net;
> stephen@networkplumber.org; hemant.agrawal@nxp.com;
> jerinj@marvell.com; pbhagavatula@marvell.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Ruifeng Wang (Arm Technology China)
> <Ruifeng.Wang@arm.com>; Phil Yang (Arm Technology China)
> <Phil.Yang@arm.com>; Steve Capper <Steve.Capper@arm.com>
> Subject: RE: [dpdk-dev] [PATCH v7 2/7] eal: add the APIs to wait until equal
> 
> 
> >
> > Hi Gavin,
> >
> > >
> > > The rte_wait_until_equal_xx APIs abstract the functionality of
> > > 'polling for a memory location to become equal to a given value'.
> > >
> > > Add the RTE_ARM_USE_WFE configuration entry for aarch64, disabled
> > > by default. When it is enabled, the above APIs will call WFE instruction
> > > to save CPU cycles and power.
> > >
> > > Signed-off-by: Gavin Hu <gavin.hu@arm.com>
> > > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > > Reviewed-by: Steve Capper <steve.capper@arm.com>
> > > Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
> > > Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > > Acked-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> > > ---
> > >  config/arm/meson.build                             |   1 +
> > >  config/common_base                                 |   5 +
> > >  .../common/include/arch/arm/rte_pause_64.h         |  30 ++++++
> > >  lib/librte_eal/common/include/generic/rte_pause.h  | 106
> +++++++++++++++++++++
> > >  4 files changed, 142 insertions(+)
> > >
> > > diff --git a/config/arm/meson.build b/config/arm/meson.build
> > > index 979018e..b4b4cac 100644
> > > --- a/config/arm/meson.build
> > > +++ b/config/arm/meson.build
> > > @@ -26,6 +26,7 @@ flags_common_default = [
> > >  	['RTE_LIBRTE_AVP_PMD', false],
> > >
> > >  	['RTE_SCHED_VECTOR', false],
> > > +	['RTE_ARM_USE_WFE', false],
> > >  ]
> > >
> > >  flags_generic = [
> > > diff --git a/config/common_base b/config/common_base
> > > index 8ef75c2..8861713 100644
> > > --- a/config/common_base
> > > +++ b/config/common_base
> > > @@ -111,6 +111,11 @@ CONFIG_RTE_MAX_VFIO_CONTAINERS=64
> > >  CONFIG_RTE_MALLOC_DEBUG=n
> > >  CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
> > >  CONFIG_RTE_USE_LIBBSD=n
> > > +# Use WFE instructions to implement the rte_wait_for_equal_xxx APIs,
> > > +# calling these APIs put the cores in low power state while waiting
> > > +# for the memory address to become equal to the expected value.
> > > +# This is supported only by aarch64.
> > > +CONFIG_RTE_ARM_USE_WFE=n
> > >
> > >  #
> > >  # Recognize/ignore the AVX/AVX512 CPU flags for performance/power
> testing.
> > > diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > > index 93895d3..dabde17 100644
> > > --- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > > +++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > > @@ -1,5 +1,6 @@
> > >  /* SPDX-License-Identifier: BSD-3-Clause
> > >   * Copyright(c) 2017 Cavium, Inc
> > > + * Copyright(c) 2019 Arm Limited
> > >   */
> > >
> > >  #ifndef _RTE_PAUSE_ARM64_H_
> > > @@ -17,6 +18,35 @@ static inline void rte_pause(void)
> > >  	asm volatile("yield" ::: "memory");
> > >  }
> > >
> > > +#ifdef RTE_ARM_USE_WFE
> > > +#define __WAIT_UNTIL_EQUAL(name, asm_op, wide, type) \
> > > +static __rte_always_inline void \
> > > +rte_wait_until_equal_##name(volatile type * addr, type expected) \
> > > +{ \
> > > +	type tmp; \
> > > +	asm volatile( \
> > > +		#asm_op " %" #wide "[tmp], %[addr]\n" \
> > > +		"cmp	%" #wide "[tmp], %" #wide "[expected]\n" \
> > > +		"b.eq	2f\n" \
> > > +		"sevl\n" \
> > > +		"1:	wfe\n" \
> > > +		#asm_op " %" #wide "[tmp], %[addr]\n" \
> > > +		"cmp	%" #wide "[tmp], %" #wide "[expected]\n" \
> > > +		"bne	1b\n" \
> > > +		"2:\n" \
> > > +		: [tmp] "=&r" (tmp) \
> > > +		: [addr] "Q"(*addr), [expected] "r"(expected) \
> > > +		: "cc", "memory"); \
> > > +}
> 
> One more thought:
> Why do you need to write asm code for the whole procedure?
> Why not to do like linux kernel:
> define wfe() and sev() macros and use them inside normal C code?
> 
> #define sev()		asm volatile("sev" : : : "memory")
> #define wfe()		asm volatile("wfe" : : : "memory")
> 
> Then:
> rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected, int
> memorder)
> {
>      if (__atomic_load_n(addr, memorder) != expected) {
>          sev();
>          do {
>              wfe();
>          } while ((__atomic_load_n(addr, memorder) != expected);
>      }
> }
> 
> ?
A really good suggestion, I made corresponding changes to v8 already, but it missed a armv8 specific feature after internal discussion.
We call wfe to wait/sleep on the 'monitored' address, it will be waken up upon someone write to the monitor address, so before wfe, we have to call load-exclusive instruction to 'monitor'. 
__atomic_load_n - disassembled to "ldr" does not do so. We have to use "ldxrh" for relaxed mem ordering and "ldaxrh" for acquire ordering, in example of 16-bit.

Let me re-think coming back to the full assembly procedure or implementing a 'load-exclusive' function. What do you think? 
/Gavin

> > > +/* Wait for *addr to be updated with expected value */
> > > +__WAIT_UNTIL_EQUAL(relaxed_16, ldxrh, w, uint16_t)
> > > +__WAIT_UNTIL_EQUAL(acquire_16, ldaxrh, w, uint16_t)
> > > +__WAIT_UNTIL_EQUAL(relaxed_32, ldxr, w, uint32_t)
> > > +__WAIT_UNTIL_EQUAL(acquire_32, ldaxr, w, uint32_t)
> > > +__WAIT_UNTIL_EQUAL(relaxed_64, ldxr, x, uint64_t)
> > > +__WAIT_UNTIL_EQUAL(acquire_64, ldaxr, x, uint64_t)
> > > +#endif
> > > +
> > >  #ifdef __cplusplus
> > >  }
> > >  #endif
> > > diff --git a/lib/librte_eal/common/include/generic/rte_pause.h
> b/lib/librte_eal/common/include/generic/rte_pause.h
> > > index 52bd4db..8906473 100644
> > > --- a/lib/librte_eal/common/include/generic/rte_pause.h
> > > +++ b/lib/librte_eal/common/include/generic/rte_pause.h
> > > @@ -1,5 +1,6 @@
> > >  /* SPDX-License-Identifier: BSD-3-Clause
> > >   * Copyright(c) 2017 Cavium, Inc
> > > + * Copyright(c) 2019 Arm Limited
> > >   */
> > >
> > >  #ifndef _RTE_PAUSE_H_
> > > @@ -12,6 +13,10 @@
> > >   *
> > >   */
> > >
> > > +#include <stdint.h>
> > > +#include <rte_common.h>
> > > +#include <rte_atomic.h>
> > > +
> > >  /**
> > >   * Pause CPU execution for a short while
> > >   *
> > > @@ -20,4 +25,105 @@
> > >   */
> > >  static inline void rte_pause(void);
> > >
> > > +/**
> > > + * Wait for *addr to be updated with a 16-bit expected value, with a
> relaxed
> > > + * memory ordering model meaning the loads around this API can be
> reordered.
> > > + *
> > > + * @param addr
> > > + *  A pointer to the memory location.
> > > + * @param expected
> > > + *  A 16-bit expected value to be in the memory location.
> > > + */
> > > +__rte_always_inline
> > > +static void
> > > +rte_wait_until_equal_relaxed_16(volatile uint16_t *addr, uint16_t
> expected);
> > > +
> > > +/**
> > > + * Wait for *addr to be updated with a 32-bit expected value, with a
> relaxed
> > > + * memory ordering model meaning the loads around this API can be
> reordered.
> > > + *
> > > + * @param addr
> > > + *  A pointer to the memory location.
> > > + * @param expected
> > > + *  A 32-bit expected value to be in the memory location.
> > > + */
> > > +__rte_always_inline
> > > +static void
> > > +rte_wait_until_equal_relaxed_32(volatile uint32_t *addr, uint32_t
> expected);
> > > +
> > > +/**
> > > + * Wait for *addr to be updated with a 64-bit expected value, with a
> relaxed
> > > + * memory ordering model meaning the loads around this API can be
> reordered.
> > > + *
> > > + * @param addr
> > > + *  A pointer to the memory location.
> > > + * @param expected
> > > + *  A 64-bit expected value to be in the memory location.
> > > + */
> > > +__rte_always_inline
> > > +static void
> > > +rte_wait_until_equal_relaxed_64(volatile uint64_t *addr, uint64_t
> expected);
> > > +
> > > +/**
> > > + * Wait for *addr to be updated with a 16-bit expected value, with an
> acquire
> > > + * memory ordering model meaning the loads after this API can't be
> observed
> > > + * before this API.
> > > + *
> > > + * @param addr
> > > + *  A pointer to the memory location.
> > > + * @param expected
> > > + *  A 16-bit expected value to be in the memory location.
> > > + */
> > > +__rte_always_inline
> > > +static void
> > > +rte_wait_until_equal_acquire_16(volatile uint16_t *addr, uint16_t
> expected);
> > > +
> > > +/**
> > > + * Wait for *addr to be updated with a 32-bit expected value, with an
> acquire
> > > + * memory ordering model meaning the loads after this API can't be
> observed
> > > + * before this API.
> > > + *
> > > + * @param addr
> > > + *  A pointer to the memory location.
> > > + * @param expected
> > > + *  A 32-bit expected value to be in the memory location.
> > > + */
> > > +__rte_always_inline
> > > +static void
> > > +rte_wait_until_equal_acquire_32(volatile uint32_t *addr, uint32_t
> expected);
> >
> > LGTM in general.
> > One stylish thing: wouldn't it be better to have an API like that:
> > rte_wait_until_equal_acquire_X(addr, expected, memory_order)
> > ?
> >
> > I.E. - pass memorder as parameter, not to incorporate it into function
> name?
> > Less functions, plus user can specify order himself.
> > Plus looks similar to C11 atomic instrincts.
> >
> >
> > > +
> > > +/**
> > > + * Wait for *addr to be updated with a 64-bit expected value, with an
> acquire
> > > + * memory ordering model meaning the loads after this API can't be
> observed
> > > + * before this API.
> > > + *
> > > + * @param addr
> > > + *  A pointer to the memory location.
> > > + * @param expected
> > > + *  A 64-bit expected value to be in the memory location.
> > > + */
> > > +__rte_always_inline
> > > +static void
> > > +rte_wait_until_equal_acquire_64(volatile uint64_t *addr, uint64_t
> expected);
> > > +
> > > +#if !defined(RTE_ARM_USE_WFE)
> > > +#define __WAIT_UNTIL_EQUAL(op_name, size, type, memorder) \
> > > +__rte_always_inline \
> > > +static void	\
> > > +rte_wait_until_equal_##op_name##_##size(volatile type *addr, \
> > > +	type expected) \
> > > +{ \
> > > +	while (__atomic_load_n(addr, memorder) != expected) \
> > > +		rte_pause(); \
> > > +}
> > > +
> > > +/* Wait for *addr to be updated with expected value */
> > > +__WAIT_UNTIL_EQUAL(relaxed, 16, uint16_t, __ATOMIC_RELAXED)
> > > +__WAIT_UNTIL_EQUAL(acquire, 16, uint16_t, __ATOMIC_ACQUIRE)
> > > +__WAIT_UNTIL_EQUAL(relaxed, 32, uint32_t, __ATOMIC_RELAXED)
> > > +__WAIT_UNTIL_EQUAL(acquire, 32, uint32_t, __ATOMIC_ACQUIRE)
> > > +__WAIT_UNTIL_EQUAL(relaxed, 64, uint64_t, __ATOMIC_RELAXED)
> > > +__WAIT_UNTIL_EQUAL(acquire, 64, uint64_t, __ATOMIC_ACQUIRE)
> > > +#endif /* RTE_ARM_USE_WFE */
> > > +
> > >  #endif /* _RTE_PAUSE_H_ */
> > > --
> > > 2.7.4
  
Gavin Hu Oct. 23, 2019, 4:29 p.m. UTC | #9
Hi Konstantin,

> -----Original Message-----
> From: Gavin Hu (Arm Technology China)
> Sent: Thursday, October 24, 2019 12:20 AM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org
> Cc: nd <nd@arm.com>; thomas@monjalon.net;
> stephen@networkplumber.org; hemant.agrawal@nxp.com;
> jerinj@marvell.com; pbhagavatula@marvell.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Ruifeng Wang (Arm Technology China)
> <Ruifeng.Wang@arm.com>; Phil Yang (Arm Technology China)
> <Phil.Yang@arm.com>; Steve Capper <Steve.Capper@arm.com>; nd
> <nd@arm.com>
> Subject: RE: [dpdk-dev] [PATCH v7 2/7] eal: add the APIs to wait until equal
> 
> Hi Konstantin,
> 
> > -----Original Message-----
> > From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > Sent: Friday, October 18, 2019 12:44 AM
> > To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>;
> > dev@dpdk.org
> > Cc: nd <nd@arm.com>; thomas@monjalon.net;
> > stephen@networkplumber.org; hemant.agrawal@nxp.com;
> > jerinj@marvell.com; pbhagavatula@marvell.com; Honnappa Nagarahalli
> > <Honnappa.Nagarahalli@arm.com>; Ruifeng Wang (Arm Technology
> China)
> > <Ruifeng.Wang@arm.com>; Phil Yang (Arm Technology China)
> > <Phil.Yang@arm.com>; Steve Capper <Steve.Capper@arm.com>
> > Subject: RE: [dpdk-dev] [PATCH v7 2/7] eal: add the APIs to wait until equal
> >
> >
> > >
> > > Hi Gavin,
> > >
> > > >
> > > > The rte_wait_until_equal_xx APIs abstract the functionality of
> > > > 'polling for a memory location to become equal to a given value'.
> > > >
> > > > Add the RTE_ARM_USE_WFE configuration entry for aarch64, disabled
> > > > by default. When it is enabled, the above APIs will call WFE instruction
> > > > to save CPU cycles and power.
> > > >
> > > > Signed-off-by: Gavin Hu <gavin.hu@arm.com>
> > > > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > > > Reviewed-by: Steve Capper <steve.capper@arm.com>
> > > > Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
> > > > Reviewed-by: Honnappa Nagarahalli
> <honnappa.nagarahalli@arm.com>
> > > > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > > > Acked-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> > > > ---
> > > >  config/arm/meson.build                             |   1 +
> > > >  config/common_base                                 |   5 +
> > > >  .../common/include/arch/arm/rte_pause_64.h         |  30 ++++++
> > > >  lib/librte_eal/common/include/generic/rte_pause.h  | 106
> > +++++++++++++++++++++
> > > >  4 files changed, 142 insertions(+)
> > > >
> > > > diff --git a/config/arm/meson.build b/config/arm/meson.build
> > > > index 979018e..b4b4cac 100644
> > > > --- a/config/arm/meson.build
> > > > +++ b/config/arm/meson.build
> > > > @@ -26,6 +26,7 @@ flags_common_default = [
> > > >  	['RTE_LIBRTE_AVP_PMD', false],
> > > >
> > > >  	['RTE_SCHED_VECTOR', false],
> > > > +	['RTE_ARM_USE_WFE', false],
> > > >  ]
> > > >
> > > >  flags_generic = [
> > > > diff --git a/config/common_base b/config/common_base
> > > > index 8ef75c2..8861713 100644
> > > > --- a/config/common_base
> > > > +++ b/config/common_base
> > > > @@ -111,6 +111,11 @@ CONFIG_RTE_MAX_VFIO_CONTAINERS=64
> > > >  CONFIG_RTE_MALLOC_DEBUG=n
> > > >  CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
> > > >  CONFIG_RTE_USE_LIBBSD=n
> > > > +# Use WFE instructions to implement the rte_wait_for_equal_xxx APIs,
> > > > +# calling these APIs put the cores in low power state while waiting
> > > > +# for the memory address to become equal to the expected value.
> > > > +# This is supported only by aarch64.
> > > > +CONFIG_RTE_ARM_USE_WFE=n
> > > >
> > > >  #
> > > >  # Recognize/ignore the AVX/AVX512 CPU flags for performance/power
> > testing.
> > > > diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > > > index 93895d3..dabde17 100644
> > > > --- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > > > +++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > > > @@ -1,5 +1,6 @@
> > > >  /* SPDX-License-Identifier: BSD-3-Clause
> > > >   * Copyright(c) 2017 Cavium, Inc
> > > > + * Copyright(c) 2019 Arm Limited
> > > >   */
> > > >
> > > >  #ifndef _RTE_PAUSE_ARM64_H_
> > > > @@ -17,6 +18,35 @@ static inline void rte_pause(void)
> > > >  	asm volatile("yield" ::: "memory");
> > > >  }
> > > >
> > > > +#ifdef RTE_ARM_USE_WFE
> > > > +#define __WAIT_UNTIL_EQUAL(name, asm_op, wide, type) \
> > > > +static __rte_always_inline void \
> > > > +rte_wait_until_equal_##name(volatile type * addr, type expected) \
> > > > +{ \
> > > > +	type tmp; \
> > > > +	asm volatile( \
> > > > +		#asm_op " %" #wide "[tmp], %[addr]\n" \
> > > > +		"cmp	%" #wide "[tmp], %" #wide "[expected]\n" \
> > > > +		"b.eq	2f\n" \
> > > > +		"sevl\n" \
> > > > +		"1:	wfe\n" \
> > > > +		#asm_op " %" #wide "[tmp], %[addr]\n" \
> > > > +		"cmp	%" #wide "[tmp], %" #wide "[expected]\n" \
> > > > +		"bne	1b\n" \
> > > > +		"2:\n" \
> > > > +		: [tmp] "=&r" (tmp) \
> > > > +		: [addr] "Q"(*addr), [expected] "r"(expected) \
> > > > +		: "cc", "memory"); \
> > > > +}
> >
> > One more thought:
> > Why do you need to write asm code for the whole procedure?
> > Why not to do like linux kernel:
> > define wfe() and sev() macros and use them inside normal C code?
> >
> > #define sev()		asm volatile("sev" : : : "memory")
> > #define wfe()		asm volatile("wfe" : : : "memory")
> >
> > Then:
> > rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected, int
> > memorder)
> > {
> >      if (__atomic_load_n(addr, memorder) != expected) {
> >          sev();
> >          do {
> >              wfe();
> >          } while ((__atomic_load_n(addr, memorder) != expected);
> >      }
> > }
> >
> > ?
> A really good suggestion, I made corresponding changes to v8 already, but it
> missed a armv8 specific feature after internal discussion.
> We call wfe to wait/sleep on the 'monitored' address, it will be waken up
> upon someone write to the monitor address, so before wfe, we have to call
> load-exclusive instruction to 'monitor'.
> __atomic_load_n - disassembled to "ldr" does not do so. We have to use
> "ldxrh" for relaxed mem ordering and "ldaxrh" for acquire ordering, in
> example of 16-bit.
> 
> Let me re-think coming back to the full assembly procedure or implementing
> a 'load-exclusive' function. What do you think?
> /Gavin
Forgot to mention, kernel uses wfe() without preceding load-exclusive instructions because:
1) it replies on the timer, to wake up, i.e. __delay()
2) explicit calling sev to send wake events, for all kinds of locks
3) IPI instructions.

Our patches can't count on these events, due to of lack of these events or performance  impact. 
/Gavin
> 
> > > > +/* Wait for *addr to be updated with expected value */
> > > > +__WAIT_UNTIL_EQUAL(relaxed_16, ldxrh, w, uint16_t)
> > > > +__WAIT_UNTIL_EQUAL(acquire_16, ldaxrh, w, uint16_t)
> > > > +__WAIT_UNTIL_EQUAL(relaxed_32, ldxr, w, uint32_t)
> > > > +__WAIT_UNTIL_EQUAL(acquire_32, ldaxr, w, uint32_t)
> > > > +__WAIT_UNTIL_EQUAL(relaxed_64, ldxr, x, uint64_t)
> > > > +__WAIT_UNTIL_EQUAL(acquire_64, ldaxr, x, uint64_t)
> > > > +#endif
> > > > +
> > > >  #ifdef __cplusplus
> > > >  }
> > > >  #endif
> > > > diff --git a/lib/librte_eal/common/include/generic/rte_pause.h
> > b/lib/librte_eal/common/include/generic/rte_pause.h
> > > > index 52bd4db..8906473 100644
> > > > --- a/lib/librte_eal/common/include/generic/rte_pause.h
> > > > +++ b/lib/librte_eal/common/include/generic/rte_pause.h
> > > > @@ -1,5 +1,6 @@
> > > >  /* SPDX-License-Identifier: BSD-3-Clause
> > > >   * Copyright(c) 2017 Cavium, Inc
> > > > + * Copyright(c) 2019 Arm Limited
> > > >   */
> > > >
> > > >  #ifndef _RTE_PAUSE_H_
> > > > @@ -12,6 +13,10 @@
> > > >   *
> > > >   */
> > > >
> > > > +#include <stdint.h>
> > > > +#include <rte_common.h>
> > > > +#include <rte_atomic.h>
> > > > +
> > > >  /**
> > > >   * Pause CPU execution for a short while
> > > >   *
> > > > @@ -20,4 +25,105 @@
> > > >   */
> > > >  static inline void rte_pause(void);
> > > >
> > > > +/**
> > > > + * Wait for *addr to be updated with a 16-bit expected value, with a
> > relaxed
> > > > + * memory ordering model meaning the loads around this API can be
> > reordered.
> > > > + *
> > > > + * @param addr
> > > > + *  A pointer to the memory location.
> > > > + * @param expected
> > > > + *  A 16-bit expected value to be in the memory location.
> > > > + */
> > > > +__rte_always_inline
> > > > +static void
> > > > +rte_wait_until_equal_relaxed_16(volatile uint16_t *addr, uint16_t
> > expected);
> > > > +
> > > > +/**
> > > > + * Wait for *addr to be updated with a 32-bit expected value, with a
> > relaxed
> > > > + * memory ordering model meaning the loads around this API can be
> > reordered.
> > > > + *
> > > > + * @param addr
> > > > + *  A pointer to the memory location.
> > > > + * @param expected
> > > > + *  A 32-bit expected value to be in the memory location.
> > > > + */
> > > > +__rte_always_inline
> > > > +static void
> > > > +rte_wait_until_equal_relaxed_32(volatile uint32_t *addr, uint32_t
> > expected);
> > > > +
> > > > +/**
> > > > + * Wait for *addr to be updated with a 64-bit expected value, with a
> > relaxed
> > > > + * memory ordering model meaning the loads around this API can be
> > reordered.
> > > > + *
> > > > + * @param addr
> > > > + *  A pointer to the memory location.
> > > > + * @param expected
> > > > + *  A 64-bit expected value to be in the memory location.
> > > > + */
> > > > +__rte_always_inline
> > > > +static void
> > > > +rte_wait_until_equal_relaxed_64(volatile uint64_t *addr, uint64_t
> > expected);
> > > > +
> > > > +/**
> > > > + * Wait for *addr to be updated with a 16-bit expected value, with an
> > acquire
> > > > + * memory ordering model meaning the loads after this API can't be
> > observed
> > > > + * before this API.
> > > > + *
> > > > + * @param addr
> > > > + *  A pointer to the memory location.
> > > > + * @param expected
> > > > + *  A 16-bit expected value to be in the memory location.
> > > > + */
> > > > +__rte_always_inline
> > > > +static void
> > > > +rte_wait_until_equal_acquire_16(volatile uint16_t *addr, uint16_t
> > expected);
> > > > +
> > > > +/**
> > > > + * Wait for *addr to be updated with a 32-bit expected value, with an
> > acquire
> > > > + * memory ordering model meaning the loads after this API can't be
> > observed
> > > > + * before this API.
> > > > + *
> > > > + * @param addr
> > > > + *  A pointer to the memory location.
> > > > + * @param expected
> > > > + *  A 32-bit expected value to be in the memory location.
> > > > + */
> > > > +__rte_always_inline
> > > > +static void
> > > > +rte_wait_until_equal_acquire_32(volatile uint32_t *addr, uint32_t
> > expected);
> > >
> > > LGTM in general.
> > > One stylish thing: wouldn't it be better to have an API like that:
> > > rte_wait_until_equal_acquire_X(addr, expected, memory_order)
> > > ?
> > >
> > > I.E. - pass memorder as parameter, not to incorporate it into function
> > name?
> > > Less functions, plus user can specify order himself.
> > > Plus looks similar to C11 atomic instrincts.
> > >
> > >
> > > > +
> > > > +/**
> > > > + * Wait for *addr to be updated with a 64-bit expected value, with an
> > acquire
> > > > + * memory ordering model meaning the loads after this API can't be
> > observed
> > > > + * before this API.
> > > > + *
> > > > + * @param addr
> > > > + *  A pointer to the memory location.
> > > > + * @param expected
> > > > + *  A 64-bit expected value to be in the memory location.
> > > > + */
> > > > +__rte_always_inline
> > > > +static void
> > > > +rte_wait_until_equal_acquire_64(volatile uint64_t *addr, uint64_t
> > expected);
> > > > +
> > > > +#if !defined(RTE_ARM_USE_WFE)
> > > > +#define __WAIT_UNTIL_EQUAL(op_name, size, type, memorder) \
> > > > +__rte_always_inline \
> > > > +static void	\
> > > > +rte_wait_until_equal_##op_name##_##size(volatile type *addr, \
> > > > +	type expected) \
> > > > +{ \
> > > > +	while (__atomic_load_n(addr, memorder) != expected) \
> > > > +		rte_pause(); \
> > > > +}
> > > > +
> > > > +/* Wait for *addr to be updated with expected value */
> > > > +__WAIT_UNTIL_EQUAL(relaxed, 16, uint16_t, __ATOMIC_RELAXED)
> > > > +__WAIT_UNTIL_EQUAL(acquire, 16, uint16_t, __ATOMIC_ACQUIRE)
> > > > +__WAIT_UNTIL_EQUAL(relaxed, 32, uint32_t, __ATOMIC_RELAXED)
> > > > +__WAIT_UNTIL_EQUAL(acquire, 32, uint32_t, __ATOMIC_ACQUIRE)
> > > > +__WAIT_UNTIL_EQUAL(relaxed, 64, uint64_t, __ATOMIC_RELAXED)
> > > > +__WAIT_UNTIL_EQUAL(acquire, 64, uint64_t, __ATOMIC_ACQUIRE)
> > > > +#endif /* RTE_ARM_USE_WFE */
> > > > +
> > > >  #endif /* _RTE_PAUSE_H_ */
> > > > --
> > > > 2.7.4
  
Ananyev, Konstantin Oct. 24, 2019, 10:21 a.m. UTC | #10
Hi Gavin,
> > > > > The rte_wait_until_equal_xx APIs abstract the functionality of
> > > > > 'polling for a memory location to become equal to a given value'.
> > > > >
> > > > > Add the RTE_ARM_USE_WFE configuration entry for aarch64, disabled
> > > > > by default. When it is enabled, the above APIs will call WFE instruction
> > > > > to save CPU cycles and power.
> > > > >
> > > > > Signed-off-by: Gavin Hu <gavin.hu@arm.com>
> > > > > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > > > > Reviewed-by: Steve Capper <steve.capper@arm.com>
> > > > > Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
> > > > > Reviewed-by: Honnappa Nagarahalli
> > <honnappa.nagarahalli@arm.com>
> > > > > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > > > > Acked-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> > > > > ---
> > > > >  config/arm/meson.build                             |   1 +
> > > > >  config/common_base                                 |   5 +
> > > > >  .../common/include/arch/arm/rte_pause_64.h         |  30 ++++++
> > > > >  lib/librte_eal/common/include/generic/rte_pause.h  | 106
> > > +++++++++++++++++++++
> > > > >  4 files changed, 142 insertions(+)
> > > > >
> > > > > diff --git a/config/arm/meson.build b/config/arm/meson.build
> > > > > index 979018e..b4b4cac 100644
> > > > > --- a/config/arm/meson.build
> > > > > +++ b/config/arm/meson.build
> > > > > @@ -26,6 +26,7 @@ flags_common_default = [
> > > > >  	['RTE_LIBRTE_AVP_PMD', false],
> > > > >
> > > > >  	['RTE_SCHED_VECTOR', false],
> > > > > +	['RTE_ARM_USE_WFE', false],
> > > > >  ]
> > > > >
> > > > >  flags_generic = [
> > > > > diff --git a/config/common_base b/config/common_base
> > > > > index 8ef75c2..8861713 100644
> > > > > --- a/config/common_base
> > > > > +++ b/config/common_base
> > > > > @@ -111,6 +111,11 @@ CONFIG_RTE_MAX_VFIO_CONTAINERS=64
> > > > >  CONFIG_RTE_MALLOC_DEBUG=n
> > > > >  CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
> > > > >  CONFIG_RTE_USE_LIBBSD=n
> > > > > +# Use WFE instructions to implement the rte_wait_for_equal_xxx APIs,
> > > > > +# calling these APIs put the cores in low power state while waiting
> > > > > +# for the memory address to become equal to the expected value.
> > > > > +# This is supported only by aarch64.
> > > > > +CONFIG_RTE_ARM_USE_WFE=n
> > > > >
> > > > >  #
> > > > >  # Recognize/ignore the AVX/AVX512 CPU flags for performance/power
> > > testing.
> > > > > diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > > b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > > > > index 93895d3..dabde17 100644
> > > > > --- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > > > > +++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > > > > @@ -1,5 +1,6 @@
> > > > >  /* SPDX-License-Identifier: BSD-3-Clause
> > > > >   * Copyright(c) 2017 Cavium, Inc
> > > > > + * Copyright(c) 2019 Arm Limited
> > > > >   */
> > > > >
> > > > >  #ifndef _RTE_PAUSE_ARM64_H_
> > > > > @@ -17,6 +18,35 @@ static inline void rte_pause(void)
> > > > >  	asm volatile("yield" ::: "memory");
> > > > >  }
> > > > >
> > > > > +#ifdef RTE_ARM_USE_WFE
> > > > > +#define __WAIT_UNTIL_EQUAL(name, asm_op, wide, type) \
> > > > > +static __rte_always_inline void \
> > > > > +rte_wait_until_equal_##name(volatile type * addr, type expected) \
> > > > > +{ \
> > > > > +	type tmp; \
> > > > > +	asm volatile( \
> > > > > +		#asm_op " %" #wide "[tmp], %[addr]\n" \
> > > > > +		"cmp	%" #wide "[tmp], %" #wide "[expected]\n" \
> > > > > +		"b.eq	2f\n" \
> > > > > +		"sevl\n" \
> > > > > +		"1:	wfe\n" \
> > > > > +		#asm_op " %" #wide "[tmp], %[addr]\n" \
> > > > > +		"cmp	%" #wide "[tmp], %" #wide "[expected]\n" \
> > > > > +		"bne	1b\n" \
> > > > > +		"2:\n" \
> > > > > +		: [tmp] "=&r" (tmp) \
> > > > > +		: [addr] "Q"(*addr), [expected] "r"(expected) \
> > > > > +		: "cc", "memory"); \
> > > > > +}
> > >
> > > One more thought:
> > > Why do you need to write asm code for the whole procedure?
> > > Why not to do like linux kernel:
> > > define wfe() and sev() macros and use them inside normal C code?
> > >
> > > #define sev()		asm volatile("sev" : : : "memory")
> > > #define wfe()		asm volatile("wfe" : : : "memory")
> > >
> > > Then:
> > > rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected, int
> > > memorder)
> > > {
> > >      if (__atomic_load_n(addr, memorder) != expected) {
> > >          sev();
> > >          do {
> > >              wfe();
> > >          } while ((__atomic_load_n(addr, memorder) != expected);
> > >      }
> > > }
> > >
> > > ?
> > A really good suggestion, I made corresponding changes to v8 already, but it
> > missed a armv8 specific feature after internal discussion.
> > We call wfe to wait/sleep on the 'monitored' address, it will be waken up
> > upon someone write to the monitor address, so before wfe, we have to call
> > load-exclusive instruction to 'monitor'.
> > __atomic_load_n - disassembled to "ldr" does not do so. We have to use
> > "ldxrh" for relaxed mem ordering and "ldaxrh" for acquire ordering, in
> > example of 16-bit.

Didn't realize that, sorry for confusion caused...

> >
> > Let me re-think coming back to the full assembly procedure or implementing
> > a 'load-exclusive' function. What do you think?

After some thought I am leaning towards 'load-exclusive' function -
Hopefully it would help you avoid ras asm here and in other places.
What do you think?
Konstantin

> > /Gavin
> Forgot to mention, kernel uses wfe() without preceding load-exclusive instructions because:
> 1) it replies on the timer, to wake up, i.e. __delay()
> 2) explicit calling sev to send wake events, for all kinds of locks
> 3) IPI instructions.
> 
> Our patches can't count on these events, due to of lack of these events or performance  impact.
> /Gavin
> >
> > > > > +/* Wait for *addr to be updated with expected value */
> > > > > +__WAIT_UNTIL_EQUAL(relaxed_16, ldxrh, w, uint16_t)
> > > > > +__WAIT_UNTIL_EQUAL(acquire_16, ldaxrh, w, uint16_t)
> > > > > +__WAIT_UNTIL_EQUAL(relaxed_32, ldxr, w, uint32_t)
> > > > > +__WAIT_UNTIL_EQUAL(acquire_32, ldaxr, w, uint32_t)
> > > > > +__WAIT_UNTIL_EQUAL(relaxed_64, ldxr, x, uint64_t)
> > > > > +__WAIT_UNTIL_EQUAL(acquire_64, ldaxr, x, uint64_t)
> > > > > +#endif
> > > > > +
> > > > >  #ifdef __cplusplus
> > > > >  }
> > > > >  #endif
  
Gavin Hu Oct. 24, 2019, 10:52 a.m. UTC | #11
Hi Konstantin, 

> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Thursday, October 24, 2019 6:21 PM
> To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>;
> dev@dpdk.org
> Cc: nd <nd@arm.com>; thomas@monjalon.net;
> stephen@networkplumber.org; hemant.agrawal@nxp.com;
> jerinj@marvell.com; pbhagavatula@marvell.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Ruifeng Wang (Arm Technology China)
> <Ruifeng.Wang@arm.com>; Phil Yang (Arm Technology China)
> <Phil.Yang@arm.com>; Steve Capper <Steve.Capper@arm.com>; nd
> <nd@arm.com>; nd <nd@arm.com>
> Subject: RE: [dpdk-dev] [PATCH v7 2/7] eal: add the APIs to wait until equal
> 
> 
> 
> Hi Gavin,
> > > > > > The rte_wait_until_equal_xx APIs abstract the functionality of
> > > > > > 'polling for a memory location to become equal to a given value'.
> > > > > >
> > > > > > Add the RTE_ARM_USE_WFE configuration entry for aarch64,
> disabled
> > > > > > by default. When it is enabled, the above APIs will call WFE
> instruction
> > > > > > to save CPU cycles and power.
> > > > > >
> > > > > > Signed-off-by: Gavin Hu <gavin.hu@arm.com>
> > > > > > Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > > > > > Reviewed-by: Steve Capper <steve.capper@arm.com>
> > > > > > Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
> > > > > > Reviewed-by: Honnappa Nagarahalli
> > > <honnappa.nagarahalli@arm.com>
> > > > > > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > > > > > Acked-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> > > > > > ---
> > > > > >  config/arm/meson.build                             |   1 +
> > > > > >  config/common_base                                 |   5 +
> > > > > >  .../common/include/arch/arm/rte_pause_64.h         |  30 ++++++
> > > > > >  lib/librte_eal/common/include/generic/rte_pause.h  | 106
> > > > +++++++++++++++++++++
> > > > > >  4 files changed, 142 insertions(+)
> > > > > >
> > > > > > diff --git a/config/arm/meson.build b/config/arm/meson.build
> > > > > > index 979018e..b4b4cac 100644
> > > > > > --- a/config/arm/meson.build
> > > > > > +++ b/config/arm/meson.build
> > > > > > @@ -26,6 +26,7 @@ flags_common_default = [
> > > > > >  	['RTE_LIBRTE_AVP_PMD', false],
> > > > > >
> > > > > >  	['RTE_SCHED_VECTOR', false],
> > > > > > +	['RTE_ARM_USE_WFE', false],
> > > > > >  ]
> > > > > >
> > > > > >  flags_generic = [
> > > > > > diff --git a/config/common_base b/config/common_base
> > > > > > index 8ef75c2..8861713 100644
> > > > > > --- a/config/common_base
> > > > > > +++ b/config/common_base
> > > > > > @@ -111,6 +111,11 @@
> CONFIG_RTE_MAX_VFIO_CONTAINERS=64
> > > > > >  CONFIG_RTE_MALLOC_DEBUG=n
> > > > > >  CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
> > > > > >  CONFIG_RTE_USE_LIBBSD=n
> > > > > > +# Use WFE instructions to implement the rte_wait_for_equal_xxx
> APIs,
> > > > > > +# calling these APIs put the cores in low power state while waiting
> > > > > > +# for the memory address to become equal to the expected
> value.
> > > > > > +# This is supported only by aarch64.
> > > > > > +CONFIG_RTE_ARM_USE_WFE=n
> > > > > >
> > > > > >  #
> > > > > >  # Recognize/ignore the AVX/AVX512 CPU flags for
> performance/power
> > > > testing.
> > > > > > diff --git
> a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > > > b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > > > > > index 93895d3..dabde17 100644
> > > > > > --- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > > > > > +++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> > > > > > @@ -1,5 +1,6 @@
> > > > > >  /* SPDX-License-Identifier: BSD-3-Clause
> > > > > >   * Copyright(c) 2017 Cavium, Inc
> > > > > > + * Copyright(c) 2019 Arm Limited
> > > > > >   */
> > > > > >
> > > > > >  #ifndef _RTE_PAUSE_ARM64_H_
> > > > > > @@ -17,6 +18,35 @@ static inline void rte_pause(void)
> > > > > >  	asm volatile("yield" ::: "memory");
> > > > > >  }
> > > > > >
> > > > > > +#ifdef RTE_ARM_USE_WFE
> > > > > > +#define __WAIT_UNTIL_EQUAL(name, asm_op, wide, type) \
> > > > > > +static __rte_always_inline void \
> > > > > > +rte_wait_until_equal_##name(volatile type * addr, type
> expected) \
> > > > > > +{ \
> > > > > > +	type tmp; \
> > > > > > +	asm volatile( \
> > > > > > +		#asm_op " %" #wide "[tmp], %[addr]\n" \
> > > > > > +		"cmp	%" #wide "[tmp], %" #wide "[expected]\n"
> \
> > > > > > +		"b.eq	2f\n" \
> > > > > > +		"sevl\n" \
> > > > > > +		"1:	wfe\n" \
> > > > > > +		#asm_op " %" #wide "[tmp], %[addr]\n" \
> > > > > > +		"cmp	%" #wide "[tmp], %" #wide "[expected]\n"
> \
> > > > > > +		"bne	1b\n" \
> > > > > > +		"2:\n" \
> > > > > > +		: [tmp] "=&r" (tmp) \
> > > > > > +		: [addr] "Q"(*addr), [expected] "r"(expected) \
> > > > > > +		: "cc", "memory"); \
> > > > > > +}
> > > >
> > > > One more thought:
> > > > Why do you need to write asm code for the whole procedure?
> > > > Why not to do like linux kernel:
> > > > define wfe() and sev() macros and use them inside normal C code?
> > > >
> > > > #define sev()		asm volatile("sev" : : : "memory")
> > > > #define wfe()		asm volatile("wfe" : : : "memory")
> > > >
> > > > Then:
> > > > rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
> int
> > > > memorder)
> > > > {
> > > >      if (__atomic_load_n(addr, memorder) != expected) {
> > > >          sev();
> > > >          do {
> > > >              wfe();
> > > >          } while ((__atomic_load_n(addr, memorder) != expected);
> > > >      }
> > > > }
> > > >
> > > > ?
> > > A really good suggestion, I made corresponding changes to v8 already,
> but it
> > > missed a armv8 specific feature after internal discussion.
> > > We call wfe to wait/sleep on the 'monitored' address, it will be waken up
> > > upon someone write to the monitor address, so before wfe, we have to
> call
> > > load-exclusive instruction to 'monitor'.
> > > __atomic_load_n - disassembled to "ldr" does not do so. We have to
> use
> > > "ldxrh" for relaxed mem ordering and "ldaxrh" for acquire ordering, in
> > > example of 16-bit.
> 
> Didn't realize that, sorry for confusion caused...
Your comments are really helpful! Although we missed this point, anyway it helped to make the patches in a better shape(I personally likes the new v9 more than v7 😊), really appreciate, thanks!
/Gavin
> 
> > >
> > > Let me re-think coming back to the full assembly procedure or
> implementing
> > > a 'load-exclusive' function. What do you think?
> 
> After some thought I am leaning towards 'load-exclusive' function -
> Hopefully it would help you avoid ras asm here and in other places.
> What do you think?
> Konstantin
Yes, I implemented 'load-exclusive' function in v9, please have a review, thanks!
Currently I did not make it 'rte_' as it is not used in other places than the rte_wait_until_equal APIs. 
Any more comments are welcome!
/Gavin
> 
> > > /Gavin
> > Forgot to mention, kernel uses wfe() without preceding load-exclusive
> instructions because:
> > 1) it replies on the timer, to wake up, i.e. __delay()
> > 2) explicit calling sev to send wake events, for all kinds of locks
> > 3) IPI instructions.
> >
> > Our patches can't count on these events, due to of lack of these events or
> performance  impact.
> > /Gavin
> > >
> > > > > > +/* Wait for *addr to be updated with expected value */
> > > > > > +__WAIT_UNTIL_EQUAL(relaxed_16, ldxrh, w, uint16_t)
> > > > > > +__WAIT_UNTIL_EQUAL(acquire_16, ldaxrh, w, uint16_t)
> > > > > > +__WAIT_UNTIL_EQUAL(relaxed_32, ldxr, w, uint32_t)
> > > > > > +__WAIT_UNTIL_EQUAL(acquire_32, ldaxr, w, uint32_t)
> > > > > > +__WAIT_UNTIL_EQUAL(relaxed_64, ldxr, x, uint64_t)
> > > > > > +__WAIT_UNTIL_EQUAL(acquire_64, ldaxr, x, uint64_t)
> > > > > > +#endif
> > > > > > +
> > > > > >  #ifdef __cplusplus
> > > > > >  }
> > > > > >  #endif
  

Patch

diff --git a/config/arm/meson.build b/config/arm/meson.build
index 979018e..b4b4cac 100644
--- a/config/arm/meson.build
+++ b/config/arm/meson.build
@@ -26,6 +26,7 @@  flags_common_default = [
 	['RTE_LIBRTE_AVP_PMD', false],
 
 	['RTE_SCHED_VECTOR', false],
+	['RTE_ARM_USE_WFE', false],
 ]
 
 flags_generic = [
diff --git a/config/common_base b/config/common_base
index 8ef75c2..8861713 100644
--- a/config/common_base
+++ b/config/common_base
@@ -111,6 +111,11 @@  CONFIG_RTE_MAX_VFIO_CONTAINERS=64
 CONFIG_RTE_MALLOC_DEBUG=n
 CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
 CONFIG_RTE_USE_LIBBSD=n
+# Use WFE instructions to implement the rte_wait_for_equal_xxx APIs,
+# calling these APIs put the cores in low power state while waiting
+# for the memory address to become equal to the expected value.
+# This is supported only by aarch64.
+CONFIG_RTE_ARM_USE_WFE=n
 
 #
 # Recognize/ignore the AVX/AVX512 CPU flags for performance/power testing.
diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
index 93895d3..dabde17 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
@@ -1,5 +1,6 @@ 
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2017 Cavium, Inc
+ * Copyright(c) 2019 Arm Limited
  */
 
 #ifndef _RTE_PAUSE_ARM64_H_
@@ -17,6 +18,35 @@  static inline void rte_pause(void)
 	asm volatile("yield" ::: "memory");
 }
 
+#ifdef RTE_ARM_USE_WFE
+#define __WAIT_UNTIL_EQUAL(name, asm_op, wide, type) \
+static __rte_always_inline void \
+rte_wait_until_equal_##name(volatile type * addr, type expected) \
+{ \
+	type tmp; \
+	asm volatile( \
+		#asm_op " %" #wide "[tmp], %[addr]\n" \
+		"cmp	%" #wide "[tmp], %" #wide "[expected]\n" \
+		"b.eq	2f\n" \
+		"sevl\n" \
+		"1:	wfe\n" \
+		#asm_op " %" #wide "[tmp], %[addr]\n" \
+		"cmp	%" #wide "[tmp], %" #wide "[expected]\n" \
+		"bne	1b\n" \
+		"2:\n" \
+		: [tmp] "=&r" (tmp) \
+		: [addr] "Q"(*addr), [expected] "r"(expected) \
+		: "cc", "memory"); \
+}
+/* Wait for *addr to be updated with expected value */
+__WAIT_UNTIL_EQUAL(relaxed_16, ldxrh, w, uint16_t)
+__WAIT_UNTIL_EQUAL(acquire_16, ldaxrh, w, uint16_t)
+__WAIT_UNTIL_EQUAL(relaxed_32, ldxr, w, uint32_t)
+__WAIT_UNTIL_EQUAL(acquire_32, ldaxr, w, uint32_t)
+__WAIT_UNTIL_EQUAL(relaxed_64, ldxr, x, uint64_t)
+__WAIT_UNTIL_EQUAL(acquire_64, ldaxr, x, uint64_t)
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/common/include/generic/rte_pause.h b/lib/librte_eal/common/include/generic/rte_pause.h
index 52bd4db..8906473 100644
--- a/lib/librte_eal/common/include/generic/rte_pause.h
+++ b/lib/librte_eal/common/include/generic/rte_pause.h
@@ -1,5 +1,6 @@ 
 /* SPDX-License-Identifier: BSD-3-Clause
  * Copyright(c) 2017 Cavium, Inc
+ * Copyright(c) 2019 Arm Limited
  */
 
 #ifndef _RTE_PAUSE_H_
@@ -12,6 +13,10 @@ 
  *
  */
 
+#include <stdint.h>
+#include <rte_common.h>
+#include <rte_atomic.h>
+
 /**
  * Pause CPU execution for a short while
  *
@@ -20,4 +25,105 @@ 
  */
 static inline void rte_pause(void);
 
+/**
+ * Wait for *addr to be updated with a 16-bit expected value, with a relaxed
+ * memory ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param expected
+ *  A 16-bit expected value to be in the memory location.
+ */
+__rte_always_inline
+static void
+rte_wait_until_equal_relaxed_16(volatile uint16_t *addr, uint16_t expected);
+
+/**
+ * Wait for *addr to be updated with a 32-bit expected value, with a relaxed
+ * memory ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param expected
+ *  A 32-bit expected value to be in the memory location.
+ */
+__rte_always_inline
+static void
+rte_wait_until_equal_relaxed_32(volatile uint32_t *addr, uint32_t expected);
+
+/**
+ * Wait for *addr to be updated with a 64-bit expected value, with a relaxed
+ * memory ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param expected
+ *  A 64-bit expected value to be in the memory location.
+ */
+__rte_always_inline
+static void
+rte_wait_until_equal_relaxed_64(volatile uint64_t *addr, uint64_t expected);
+
+/**
+ * Wait for *addr to be updated with a 16-bit expected value, with an acquire
+ * memory ordering model meaning the loads after this API can't be observed
+ * before this API.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param expected
+ *  A 16-bit expected value to be in the memory location.
+ */
+__rte_always_inline
+static void
+rte_wait_until_equal_acquire_16(volatile uint16_t *addr, uint16_t expected);
+
+/**
+ * Wait for *addr to be updated with a 32-bit expected value, with an acquire
+ * memory ordering model meaning the loads after this API can't be observed
+ * before this API.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param expected
+ *  A 32-bit expected value to be in the memory location.
+ */
+__rte_always_inline
+static void
+rte_wait_until_equal_acquire_32(volatile uint32_t *addr, uint32_t expected);
+
+/**
+ * Wait for *addr to be updated with a 64-bit expected value, with an acquire
+ * memory ordering model meaning the loads after this API can't be observed
+ * before this API.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param expected
+ *  A 64-bit expected value to be in the memory location.
+ */
+__rte_always_inline
+static void
+rte_wait_until_equal_acquire_64(volatile uint64_t *addr, uint64_t expected);
+
+#if !defined(RTE_ARM_USE_WFE)
+#define __WAIT_UNTIL_EQUAL(op_name, size, type, memorder) \
+__rte_always_inline \
+static void	\
+rte_wait_until_equal_##op_name##_##size(volatile type *addr, \
+	type expected) \
+{ \
+	while (__atomic_load_n(addr, memorder) != expected) \
+		rte_pause(); \
+}
+
+/* Wait for *addr to be updated with expected value */
+__WAIT_UNTIL_EQUAL(relaxed, 16, uint16_t, __ATOMIC_RELAXED)
+__WAIT_UNTIL_EQUAL(acquire, 16, uint16_t, __ATOMIC_ACQUIRE)
+__WAIT_UNTIL_EQUAL(relaxed, 32, uint32_t, __ATOMIC_RELAXED)
+__WAIT_UNTIL_EQUAL(acquire, 32, uint32_t, __ATOMIC_ACQUIRE)
+__WAIT_UNTIL_EQUAL(relaxed, 64, uint64_t, __ATOMIC_RELAXED)
+__WAIT_UNTIL_EQUAL(acquire, 64, uint64_t, __ATOMIC_ACQUIRE)
+#endif /* RTE_ARM_USE_WFE */
+
 #endif /* _RTE_PAUSE_H_ */