[RFC,v3,1/5] eal: add new definitions for wait scheme

Message ID 20210926063302.1541193-2-feifei.wang2@arm.com (mailing list archive)
State Superseded, archived
Delegated to: David Marchand
Series: add new definitions for wait scheme

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Feifei Wang Sept. 26, 2021, 6:32 a.m. UTC
  Introduce macros as generic interface for address monitoring.

Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 lib/eal/arm/include/rte_pause_64.h  | 151 ++++++++++++++++++----------
 lib/eal/include/generic/rte_pause.h |  78 ++++++++++++++
 2 files changed, 175 insertions(+), 54 deletions(-)
  

Comments

Ananyev, Konstantin Oct. 7, 2021, 4:18 p.m. UTC | #1
> Introduce macros as generic interface for address monitoring.
> 
> Signed-off-by: Feifei Wang <feifei.wang2@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> ---
>  lib/eal/arm/include/rte_pause_64.h  | 151 ++++++++++++++++++----------
>  lib/eal/include/generic/rte_pause.h |  78 ++++++++++++++
>  2 files changed, 175 insertions(+), 54 deletions(-)
> 
> diff --git a/lib/eal/arm/include/rte_pause_64.h b/lib/eal/arm/include/rte_pause_64.h
> index e87d10b8cc..205510e044 100644
> --- a/lib/eal/arm/include/rte_pause_64.h
> +++ b/lib/eal/arm/include/rte_pause_64.h
> @@ -31,20 +31,12 @@ static inline void rte_pause(void)
>  /* Put processor into low power WFE(Wait For Event) state. */
>  #define __WFE() { asm volatile("wfe" : : : "memory"); }
> 
> -static __rte_always_inline void
> -rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> -		int memorder)
> -{
> -	uint16_t value;
> -
> -	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
> -
> -	/*
> -	 * Atomic exclusive load from addr, it returns the 16-bit content of
> -	 * *addr while making it 'monitored',when it is written by someone
> -	 * else, the 'monitored' state is cleared and a event is generated
> -	 * implicitly to exit WFE.
> -	 */
> +/*
> + * Atomic exclusive load from addr, it returns the 16-bit content of
> + * *addr while making it 'monitored', when it is written by someone
> + * else, the 'monitored' state is cleared and an event is generated
> + * implicitly to exit WFE.
> + */
>  #define __LOAD_EXC_16(src, dst, memorder) {               \
>  	if (memorder == __ATOMIC_RELAXED) {               \
>  		asm volatile("ldxrh %w[tmp], [%x[addr]]"  \
> @@ -58,6 +50,52 @@ rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
>  			: "memory");                      \
>  	} }
> 
> +/*
> + * Atomic exclusive load from addr, it returns the 32-bit content of
> + * *addr while making it 'monitored', when it is written by someone
> + * else, the 'monitored' state is cleared and an event is generated
> + * implicitly to exit WFE.
> + */
> +#define __LOAD_EXC_32(src, dst, memorder) {              \
> +	if (memorder == __ATOMIC_RELAXED) {              \
> +		asm volatile("ldxr %w[tmp], [%x[addr]]"  \
> +			: [tmp] "=&r" (dst)              \
> +			: [addr] "r"(src)                \
> +			: "memory");                     \
> +	} else {                                         \
> +		asm volatile("ldaxr %w[tmp], [%x[addr]]" \
> +			: [tmp] "=&r" (dst)              \
> +			: [addr] "r"(src)                \
> +			: "memory");                     \
> +	} }
> +
> +/*
> + * Atomic exclusive load from addr, it returns the 64-bit content of
> + * *addr while making it 'monitored', when it is written by someone
> + * else, the 'monitored' state is cleared and an event is generated
> + * implicitly to exit WFE.
> + */
> +#define __LOAD_EXC_64(src, dst, memorder) {              \
> +	if (memorder == __ATOMIC_RELAXED) {              \
> +		asm volatile("ldxr %x[tmp], [%x[addr]]"  \
> +			: [tmp] "=&r" (dst)              \
> +			: [addr] "r"(src)                \
> +			: "memory");                     \
> +	} else {                                         \
> +		asm volatile("ldaxr %x[tmp], [%x[addr]]" \
> +			: [tmp] "=&r" (dst)              \
> +			: [addr] "r"(src)                \
> +			: "memory");                     \
> +	} }
> +
> +static __rte_always_inline void
> +rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
> +		int memorder)
> +{
> +	uint16_t value;
> +
> +	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
> +
>  	__LOAD_EXC_16(addr, value, memorder)
>  	if (value != expected) {
>  		__SEVL()
> @@ -66,7 +104,6 @@ rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
>  			__LOAD_EXC_16(addr, value, memorder)
>  		} while (value != expected);
>  	}
> -#undef __LOAD_EXC_16
>  }
> 
>  static __rte_always_inline void
> @@ -77,25 +114,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
> 
>  	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
> 
> -	/*
> -	 * Atomic exclusive load from addr, it returns the 32-bit content of
> -	 * *addr while making it 'monitored',when it is written by someone
> -	 * else, the 'monitored' state is cleared and a event is generated
> -	 * implicitly to exit WFE.
> -	 */
> -#define __LOAD_EXC_32(src, dst, memorder) {              \
> -	if (memorder == __ATOMIC_RELAXED) {              \
> -		asm volatile("ldxr %w[tmp], [%x[addr]]"  \
> -			: [tmp] "=&r" (dst)              \
> -			: [addr] "r"(src)                \
> -			: "memory");                     \
> -	} else {                                         \
> -		asm volatile("ldaxr %w[tmp], [%x[addr]]" \
> -			: [tmp] "=&r" (dst)              \
> -			: [addr] "r"(src)                \
> -			: "memory");                     \
> -	} }
> -
>  	__LOAD_EXC_32(addr, value, memorder)
>  	if (value != expected) {
>  		__SEVL()
> @@ -104,7 +122,6 @@ rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
>  			__LOAD_EXC_32(addr, value, memorder)
>  		} while (value != expected);
>  	}
> -#undef __LOAD_EXC_32
>  }
> 
>  static __rte_always_inline void
> @@ -115,25 +132,6 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
> 
>  	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
> 
> -	/*
> -	 * Atomic exclusive load from addr, it returns the 64-bit content of
> -	 * *addr while making it 'monitored',when it is written by someone
> -	 * else, the 'monitored' state is cleared and a event is generated
> -	 * implicitly to exit WFE.
> -	 */
> -#define __LOAD_EXC_64(src, dst, memorder) {              \
> -	if (memorder == __ATOMIC_RELAXED) {              \
> -		asm volatile("ldxr %x[tmp], [%x[addr]]"  \
> -			: [tmp] "=&r" (dst)              \
> -			: [addr] "r"(src)                \
> -			: "memory");                     \
> -	} else {                                         \
> -		asm volatile("ldaxr %x[tmp], [%x[addr]]" \
> -			: [tmp] "=&r" (dst)              \
> -			: [addr] "r"(src)                \
> -			: "memory");                     \
> -	} }
> -
>  	__LOAD_EXC_64(addr, value, memorder)
>  	if (value != expected) {
>  		__SEVL()
> @@ -143,6 +141,51 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
>  		} while (value != expected);
>  	}
>  }
> +
> +#define rte_wait_event_16(addr, mask, expected, cond, memorder)                \
> +do {									       \
> +	uint16_t value;                                                        \
> +	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
> +	__LOAD_EXC_16(addr, value, memorder)				       \
> +	if ((value & mask) cond expected) {				       \
> +		__SEVL()						       \
> +		do {							       \
> +			__WFE()						       \
> +			__LOAD_EXC_16(addr, value, memorder)		       \
> +		} while ((value & mask) cond expected);			       \
> +	}								       \
> +} while (0)
> +
> +#define rte_wait_event_32(addr, mask, expected, cond, memorder)                \
> +do {                                                                           \
> +	uint32_t value;                                                        \
> +	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
> +	__LOAD_EXC_32(addr, value, memorder)                                   \
> +	if ((value & mask) cond expected) {                                    \
> +		__SEVL()                                                       \
> +		do {                                                           \
> +			__WFE()                                                \
> +			__LOAD_EXC_32(addr, value, memorder)                   \
> +		} while ((value & mask) cond expected);                        \
> +	}                                                                      \
> +} while (0)
> +
> +#define rte_wait_event_64(addr, mask, expected, cond, memorder)                \
> +do {                                                                           \
> +	uint64_t value;                                                        \
> +	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
> +	__LOAD_EXC_64(addr, value, memorder)                                   \
> +	if ((value & mask) cond expected) {                                    \
> +		__SEVL()                                                       \
> +		do {                                                           \
> +			__WFE()                                                \
> +			__LOAD_EXC_64(addr, value, memorder)                   \
> +		} while ((value & mask) cond expected);                        \
> +	}                                                                      \
> +} while (0)
> +
> +#undef __LOAD_EXC_16
> +#undef __LOAD_EXC_32
>  #undef __LOAD_EXC_64
> 
>  #undef __SEVL
> diff --git a/lib/eal/include/generic/rte_pause.h b/lib/eal/include/generic/rte_pause.h
> index 668ee4a184..4e32107eca 100644
> --- a/lib/eal/include/generic/rte_pause.h
> +++ b/lib/eal/include/generic/rte_pause.h
> @@ -111,6 +111,84 @@ rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
>  	while (__atomic_load_n(addr, memorder) != expected)
>  		rte_pause();
>  }
> +
> +/*
> + * Wait until a 16-bit *addr breaks the condition, with a relaxed memory
> + * ordering model meaning the loads around this API can be reordered.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param mask
> + *  A mask of value bits in interest
> + * @param expected
> + *  A 16-bit expected value to be in the memory location.
> + * @param cond
> + *  A symbol representing the condition (==, !=).
> + * @param memorder
> + *  Two different memory orders that can be specified:
> + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> + *  C++11 memory orders with the same names, see the C++11 standard or
> + *  the GCC wiki on atomic synchronization for detailed definition.
> + */

Hmm, so now we have 2 APIs doing a similar thing:
rte_wait_until_equal_n() and rte_wait_event_n().
Can we perhaps unite them somehow?
At least make rte_wait_until_equal_n() use rte_wait_event_n() underneath.

> +#define rte_wait_event_16(addr, mask, expected, cond, memorder)		       \
> +do {									       \
> +	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \

And why user is not allowed to use __ATOMIC_SEQ_CST here?
BTW, if we expect memorder to always be a constant, might be better BUILD_BUG_ON()?

> +									       \
> +	while ((__atomic_load_n(addr, memorder) & mask) cond expected)	       \
> +		rte_pause();						       \
> +} while (0)

Two thoughts on these macros:
1. It is good practice to put () around macro parameters in the macro body.
It will save you from a lot of unexpected trouble.
2. I think these 3 macros can be united into one.
Something like:

#define rte_wait_event(addr, mask, expected, cond, memorder) do {\
        typeof (*(addr)) val = __atomic_load_n((addr), (memorder)); \
        if ((val & (typeof(val))(mask)) cond (typeof(val))(expected)) \
                break; \
        rte_pause(); \
} while (1);
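
One detail worth noting about the sketch above: written this way, "cond" is the
exit condition (the loop spins until it holds), whereas in the v3
rte_wait_event_n() macros the loop keeps spinning while "(value & mask) cond
expected" holds. A hypothetical usage of the sketched form could look like the
following; dev_status and wait_for_ready() are assumed example names, not
anything from this patch:

#include <stdint.h>
/* rte_pause.h is assumed to provide the rte_wait_event() sketch above. */

/* Example state word; bit 0 is set by another thread when ready. */
static volatile uint32_t dev_status;

static inline void
wait_for_ready(void)
{
	/* Spin until (dev_status & 0x1) == 0x1, loading with acquire order. */
	rte_wait_event(&dev_status, 0x1, 0x1, ==, __ATOMIC_ACQUIRE);
}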


> +
> +/*
> + * Wait until a 32-bit *addr breaks the condition, with a relaxed memory
> + * ordering model meaning the loads around this API can be reordered.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param mask
> + *  A mask of value bits in interest.
> + * @param expected
> + *  A 32-bit expected value to be in the memory location.
> + * @param cond
> + *  A symbol representing the condition (==, !=).
> + * @param memorder
> + *  Two different memory orders that can be specified:
> + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> + *  C++11 memory orders with the same names, see the C++11 standard or
> + *  the GCC wiki on atomic synchronization for detailed definition.
> + */
> +#define rte_wait_event_32(addr, mask, expected, cond, memorder)		       \
> +do {									       \
> +	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
> +									       \
> +	while ((__atomic_load_n(addr, memorder) & mask) cond expected)	       \
> +		rte_pause();						       \
> +} while (0)
> +
> +/*
> + * Wait until a 64-bit *addr breaks the condition, with a relaxed memory
> + * ordering model meaning the loads around this API can be reordered.
> + *
> + * @param addr
> + *  A pointer to the memory location.
> + * @param mask
> + *  A mask of value bits in interest
> + * @param expected
> + *  A 64-bit expected value to be in the memory location.
> + * @param cond
> + *  A symbol representing the condition (==, !=).
> + * @param memorder
> + *  Two different memory orders that can be specified:
> + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> + *  C++11 memory orders with the same names, see the C++11 standard or
> + *  the GCC wiki on atomic synchronization for detailed definition.
> + */
> +#define rte_wait_event_64(addr, mask, expected, cond, memorder)		       \
> +do {									       \
> +	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
> +									       \
> +	while ((__atomic_load_n(addr, memorder) & mask) cond expected)	       \
> +		rte_pause();						       \
> +} while (0)
>  #endif
> 
>  #endif /* _RTE_PAUSE_H_ */
> --
> 2.25.1
  
Feifei Wang Oct. 12, 2021, 8:09 a.m. UTC | #2
> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Friday, October 8, 2021 12:19 AM
> To: Feifei Wang <Feifei.Wang2@arm.com>; Ruifeng Wang
> <Ruifeng.Wang@arm.com>
> Cc: dev@dpdk.org; nd <nd@arm.com>
> Subject: RE: [dpdk-dev] [RFC PATCH v3 1/5] eal: add new definitions for wait
> scheme

[snip]

> > diff --git a/lib/eal/include/generic/rte_pause.h
> > b/lib/eal/include/generic/rte_pause.h
> > index 668ee4a184..4e32107eca 100644
> > --- a/lib/eal/include/generic/rte_pause.h
> > +++ b/lib/eal/include/generic/rte_pause.h
> > @@ -111,6 +111,84 @@ rte_wait_until_equal_64(volatile uint64_t *addr,
> uint64_t expected,
> >  	while (__atomic_load_n(addr, memorder) != expected)
> >  		rte_pause();
> >  }
> > +
> > +/*
> > + * Wait until a 16-bit *addr breaks the condition, with a relaxed
> > +memory
> > + * ordering model meaning the loads around this API can be reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param mask
> > + *  A mask of value bits in interest
> > + * @param expected
> > + *  A 16-bit expected value to be in the memory location.
> > + * @param cond
> > + *  A symbol representing the condition (==, !=).
> > + * @param memorder
> > + *  Two different memory orders that can be specified:
> > + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> > + *  C++11 memory orders with the same names, see the C++11 standard
> > +or
> > + *  the GCC wiki on atomic synchronization for detailed definition.
> > + */
> 
> Hmm, so now we have 2 APIs doing similar thing:
> rte_wait_until_equal_n() and rte_wait_event_n().
> Can we probably unite them somehow?
> At least make rte_wait_until_equal_n() to use rte_wait_event_n() underneath.
> 
You are right. We plan to change the rte_wait_until_equal API after this new scheme
is in place. And then, we will merge wait_until into the wait_event definition in the next
patch series.
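
For illustration, a minimal sketch of what that unification could look like for
the 32-bit case, assuming the generic rte_wait_event_32() from this RFC (a
hypothetical example, not part of this series):

/*
 * Hypothetical: rte_wait_until_equal_32() expressed on top of the proposed
 * rte_wait_event_32(). A mask of all ones selects the whole value, and the
 * '!=' condition keeps the loop spinning while *addr differs from expected.
 * UINT32_MAX is assumed to be visible via <stdint.h>.
 */
static __rte_always_inline void
rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
		int memorder)
{
	rte_wait_event_32(addr, UINT32_MAX, expected, !=, memorder);
}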
 
> > +#define rte_wait_event_16(addr, mask, expected, cond, memorder)
> 		       \
> > +do {									       \
> > +	assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> > +__ATOMIC_RELAXED);  \
> 
> And why user is not allowed to use __ATOMIC_SEQ_CST here?
Actually this is just a load operation, and acquire here is enough to make sure the
load of *addr is ordered before the operations that follow it.
 
> BTW, if we expect memorder to always be a constant, might be better
> BUILD_BUG_ON()?
If I understand correctly, you mean we can replace 'assert' by 'build_bug_on':
RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE && memorder !=__ATOMIC_RELAXED);  
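
As a concrete (hypothetical) illustration, the generic rte_wait_event_16() from
this RFC with the runtime assert() swapped for that compile-time check would
look like the following (RTE_BUILD_BUG_ON comes from rte_common.h):

#define rte_wait_event_16(addr, mask, expected, cond, memorder)	       \
do {									       \
	RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE &&		       \
		memorder != __ATOMIC_RELAXED);				       \
									       \
	while ((__atomic_load_n(addr, memorder) & mask) cond expected)	       \
		rte_pause();						       \
} while (0)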

> 
> > +									       \
> > +	while ((__atomic_load_n(addr, memorder) & mask) cond expected)
> 	       \
> > +		rte_pause();						       \
> > +} while (0)
> 
> Two thoughts with these macros:
> 1. It is a goof practise to put () around macro parameters in the macro body.
> Will save from a lot of unexpected troubles.
> 2. I think these 3 macros can be united into one.
> Something like:
> 
> #define rte_wait_event(addr, mask, expected, cond, memorder) do {\
>         typeof (*(addr)) val = __atomic_load_n((addr), (memorder)); \
>         if ((val & (typeof(val))(mask)) cond (typeof(val))(expected)) \
>                 break; \
>         rte_pause(); \
> } while (1);
On this point, I think it is because different sizes need different assembly instructions
on the arm architecture. For example,
the 16-bit load instruction is "ldxrh %w[tmp], [%x[addr]]"
the 32-bit load instruction is "ldxr %w[tmp], [%x[addr]]"
the 64-bit load instruction is "ldxr %x[tmp], [%x[addr]]"
And for consistency, we also use 3 APIs in the generic path.
> 
> 
> > +
> > +/*
> > + * Wait until a 32-bit *addr breaks the condition, with a relaxed
> > +memory
> > + * ordering model meaning the loads around this API can be reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param mask
> > + *  A mask of value bits in interest.
> > + * @param expected
> > + *  A 32-bit expected value to be in the memory location.
> > + * @param cond
> > + *  A symbol representing the condition (==, !=).
> > + * @param memorder
> > + *  Two different memory orders that can be specified:
> > + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> > + *  C++11 memory orders with the same names, see the C++11 standard
> > +or
> > + *  the GCC wiki on atomic synchronization for detailed definition.
> > + */
> > +#define rte_wait_event_32(addr, mask, expected, cond, memorder)
> 		       \
> > +do {									       \
> > +	assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> __ATOMIC_RELAXED);  \
> > +									       \
> > +	while ((__atomic_load_n(addr, memorder) & mask) cond expected)
> 	       \
> > +		rte_pause();						       \
> > +} while (0)
> > +
> > +/*
> > + * Wait until a 64-bit *addr breaks the condition, with a relaxed
> > +memory
> > + * ordering model meaning the loads around this API can be reordered.
> > + *
> > + * @param addr
> > + *  A pointer to the memory location.
> > + * @param mask
> > + *  A mask of value bits in interest
> > + * @param expected
> > + *  A 64-bit expected value to be in the memory location.
> > + * @param cond
> > + *  A symbol representing the condition (==, !=).
> > + * @param memorder
> > + *  Two different memory orders that can be specified:
> > + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> > + *  C++11 memory orders with the same names, see the C++11 standard
> > +or
> > + *  the GCC wiki on atomic synchronization for detailed definition.
> > + */
> > +#define rte_wait_event_64(addr, mask, expected, cond, memorder)
> 		       \
> > +do {									       \
> > +	assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> __ATOMIC_RELAXED);  \
> > +									       \
> > +	while ((__atomic_load_n(addr, memorder) & mask) cond expected)
> 	       \
> > +		rte_pause();						       \
> > +} while (0)
> >  #endif
> >
> >  #endif /* _RTE_PAUSE_H_ */
> > --
> > 2.25.1
  
Ananyev, Konstantin Oct. 13, 2021, 3:03 p.m. UTC | #3
> 
> [snip]
> 
> > > diff --git a/lib/eal/include/generic/rte_pause.h
> > > b/lib/eal/include/generic/rte_pause.h
> > > index 668ee4a184..4e32107eca 100644
> > > --- a/lib/eal/include/generic/rte_pause.h
> > > +++ b/lib/eal/include/generic/rte_pause.h
> > > @@ -111,6 +111,84 @@ rte_wait_until_equal_64(volatile uint64_t *addr,
> > uint64_t expected,
> > >  	while (__atomic_load_n(addr, memorder) != expected)
> > >  		rte_pause();
> > >  }
> > > +
> > > +/*
> > > + * Wait until a 16-bit *addr breaks the condition, with a relaxed
> > > +memory
> > > + * ordering model meaning the loads around this API can be reordered.
> > > + *
> > > + * @param addr
> > > + *  A pointer to the memory location.
> > > + * @param mask
> > > + *  A mask of value bits in interest
> > > + * @param expected
> > > + *  A 16-bit expected value to be in the memory location.
> > > + * @param cond
> > > + *  A symbol representing the condition (==, !=).
> > > + * @param memorder
> > > + *  Two different memory orders that can be specified:
> > > + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> > > + *  C++11 memory orders with the same names, see the C++11 standard
> > > +or
> > > + *  the GCC wiki on atomic synchronization for detailed definition.
> > > + */
> >
> > Hmm, so now we have 2 APIs doing similar thing:
> > rte_wait_until_equal_n() and rte_wait_event_n().
> > Can we probably unite them somehow?
> > At least make rte_wait_until_equal_n() to use rte_wait_event_n() underneath.
> >
> You are right. We plan to change rte_wait_until_equal API after this new scheme
> is achieved.  And then, we will merge wait_unil into wait_event definition in the next new
> patch series.
> 
> > > +#define rte_wait_event_16(addr, mask, expected, cond, memorder)
> > 		       \
> > > +do {									       \
> > > +	assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> > > +__ATOMIC_RELAXED);  \
> >
> > And why user is not allowed to use __ATOMIC_SEQ_CST here?
> Actually this is just a load operation, and acquire here is enough to make sure 'load
> addr value' can be before other operations.
> 
> > BTW, if we expect memorder to always be a constant, might be better
> > BUILD_BUG_ON()?
> If I understand correctly, you means we can replace 'assert' by 'build_bug_on':
> RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE && memorder !=__ATOMIC_RELAXED);

Yes, that was my thought.
In that case I think we should be able to catch wrong memorder at compilation stage.

> 
> >
> > > +									       \
> > > +	while ((__atomic_load_n(addr, memorder) & mask) cond expected)
> > 	       \
> > > +		rte_pause();						       \
> > > +} while (0)
> >
> > Two thoughts with these macros:
> > 1. It is a goof practise to put () around macro parameters in the macro body.
> > Will save from a lot of unexpected troubles.
> > 2. I think these 3 macros can be united into one.
> > Something like:
> >
> > #define rte_wait_event(addr, mask, expected, cond, memorder) do {\
> >         typeof (*(addr)) val = __atomic_load_n((addr), (memorder)); \
> >         if ((val & (typeof(val))(mask)) cond (typeof(val))(expected)) \
> >                 break; \
> >         rte_pause(); \
> > } while (1);
> For this point, I think it is due to different size need to use different assembly instructions
> in arm architecture. For example,
> load 16 bits instruction is "ldxrh %w[tmp], [%x[addr]"
> load 32 bits instruction is " ldxr %w[tmp], [%x[addr]"
> load 64 bits instruction is " ldxr %x[tmp], [%x[addr] "

Ok, but it could be then something like that for arm specific code:
if (sizeof(val) == sizeof(uint16_t)) \
	__LOAD_EXC_16(...); \
else if (sizeof(val) == sizeof(uint32_t)) \	
	__LOAD_EXC_32(...); \
else if (sizeof(val) == sizeof(uint64_t)) \
	__LOAD_EXC_64(...); \
...

> And for consistency, we also use 3 APIs in generic path.
Honestly, even one multi-line macro doesn't look nice.
Having 3 identical ones looks even worse.
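
To make the suggestion concrete, one hypothetical shape a single size-generic
macro could take on the Arm path is sketched below, reusing the
__LOAD_EXC_16/32/64, __SEVL and __WFE helpers this RFC defines in
rte_pause_64.h. The __rte_load_exc name is assumed for illustration only; this
is a sketch of the idea, not the eventual implementation.

/*
 * Dispatch on sizeof(*(addr)); the untaken branches are dead code because
 * sizeof() is a compile-time constant.
 */
#define __rte_load_exc(addr, value, memorder)             \
	if (sizeof(value) == sizeof(uint16_t))            \
		__LOAD_EXC_16(addr, value, memorder)      \
	else if (sizeof(value) == sizeof(uint32_t))       \
		__LOAD_EXC_32(addr, value, memorder)      \
	else                                              \
		__LOAD_EXC_64(addr, value, memorder)

#define rte_wait_event(addr, mask, expected, cond, memorder)                  \
do {                                                                          \
	typeof(*(addr)) value;                                                \
	RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE &&                      \
		memorder != __ATOMIC_RELAXED);                                \
	__rte_load_exc((addr), value, (memorder))                             \
	if ((value & (mask)) cond (expected)) {                               \
		__SEVL()                                                      \
		do {                                                          \
			__WFE()                                               \
			__rte_load_exc((addr), value, (memorder))             \
		} while ((value & (mask)) cond (expected));                   \
	}                                                                     \
} while (0)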
  
Stephen Hemminger Oct. 13, 2021, 5 p.m. UTC | #4
On Wed, 13 Oct 2021 15:03:56 +0000
"Ananyev, Konstantin" <konstantin.ananyev@intel.com> wrote:

> > addr value' can be before other operations.
> >   
> > > BTW, if we expect memorder to always be a constant, might be better
> > > BUILD_BUG_ON()?  
> > If I understand correctly, you means we can replace 'assert' by 'build_bug_on':
> > RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE && memorder !=__ATOMIC_RELAXED);  
> 
> Yes, that was my thought.
> In that case I think we should be able to catch wrong memorder at compilation stage.

Maybe:
   RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));
   RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE && memorder !=__ATOMIC_RELAXED);
  
Feifei Wang Oct. 14, 2021, 3:08 a.m. UTC | #5
> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Wednesday, October 13, 2021 11:04 PM
> To: Feifei Wang <Feifei.Wang2@arm.com>; Ruifeng Wang
> <Ruifeng.Wang@arm.com>
> Cc: dev@dpdk.org; nd <nd@arm.com>; nd <nd@arm.com>
> Subject: RE: [dpdk-dev] [RFC PATCH v3 1/5] eal: add new definitions for wait
> scheme
> 
> >
> > [snip]
> >
> > > > diff --git a/lib/eal/include/generic/rte_pause.h
> > > > b/lib/eal/include/generic/rte_pause.h
> > > > index 668ee4a184..4e32107eca 100644
> > > > --- a/lib/eal/include/generic/rte_pause.h
> > > > +++ b/lib/eal/include/generic/rte_pause.h
> > > > @@ -111,6 +111,84 @@ rte_wait_until_equal_64(volatile uint64_t
> > > > *addr,
> > > uint64_t expected,
> > > >  	while (__atomic_load_n(addr, memorder) != expected)
> > > >  		rte_pause();
> > > >  }
> > > > +
> > > > +/*
> > > > + * Wait until a 16-bit *addr breaks the condition, with a relaxed
> > > > +memory
> > > > + * ordering model meaning the loads around this API can be reordered.
> > > > + *
> > > > + * @param addr
> > > > + *  A pointer to the memory location.
> > > > + * @param mask
> > > > + *  A mask of value bits in interest
> > > > + * @param expected
> > > > + *  A 16-bit expected value to be in the memory location.
> > > > + * @param cond
> > > > + *  A symbol representing the condition (==, !=).
> > > > + * @param memorder
> > > > + *  Two different memory orders that can be specified:
> > > > + *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
> > > > + *  C++11 memory orders with the same names, see the C++11
> > > > +standard or
> > > > + *  the GCC wiki on atomic synchronization for detailed definition.
> > > > + */
> > >
> > > Hmm, so now we have 2 APIs doing similar thing:
> > > rte_wait_until_equal_n() and rte_wait_event_n().
> > > Can we probably unite them somehow?
> > > At least make rte_wait_until_equal_n() to use rte_wait_event_n()
> underneath.
> > >
> > You are right. We plan to change rte_wait_until_equal API after this
> > new scheme is achieved.  And then, we will merge wait_unil into
> > wait_event definition in the next new patch series.
> >
> > > > +#define rte_wait_event_16(addr, mask, expected, cond, memorder)
> > > 		       \
> > > > +do {
> 	       \
> > > > +	assert(memorder == __ATOMIC_ACQUIRE || memorder ==
> > > > +__ATOMIC_RELAXED);  \
> > >
> > > And why user is not allowed to use __ATOMIC_SEQ_CST here?
> > Actually this is just a load operation, and acquire here is enough to
> > make sure 'load addr value' can be before other operations.
> >
> > > BTW, if we expect memorder to always be a constant, might be better
> > > BUILD_BUG_ON()?
> > If I understand correctly, you means we can replace 'assert' by
> 'build_bug_on':
> > RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE && memorder
> > !=__ATOMIC_RELAXED);
> 
> Yes, that was my thought.
> In that case I think we should be able to catch wrong memorder at compilation
> stage.
> 
> >
> > >
> > > > +									       \
> > > > +	while ((__atomic_load_n(addr, memorder) & mask) cond expected)
> > > 	       \
> > > > +		rte_pause();						       \
> > > > +} while (0)
> > >
> > > Two thoughts with these macros:
> > > 1. It is a goof practise to put () around macro parameters in the macro
> body.
> > > Will save from a lot of unexpected troubles.
> > > 2. I think these 3 macros can be united into one.
> > > Something like:
> > >
> > > #define rte_wait_event(addr, mask, expected, cond, memorder) do {\
> > >         typeof (*(addr)) val = __atomic_load_n((addr), (memorder)); \
> > >         if ((val & (typeof(val))(mask)) cond (typeof(val))(expected)) \
> > >                 break; \
> > >         rte_pause(); \
> > > } while (1);
> > For this point, I think it is due to different size need to use
> > different assembly instructions in arm architecture. For example, load
> > 16 bits instruction is "ldxrh %w[tmp], [%x[addr]"
> > load 32 bits instruction is " ldxr %w[tmp], [%x[addr]"
> > load 64 bits instruction is " ldxr %x[tmp], [%x[addr] "
> 
> Ok, but it could be then something like that for arm specific code:
> if (sizeof(val) == sizeof(uint16_t)) \
> 	__LOAD_EXC_16(...); \
> else if (sizeof(val) == sizeof(uint32_t)) \
> 	__LOAD_EXC_32(...); \
> else if (sizeof(val) == sizeof(uint64_t)) \
> 	__LOAD_EXC_64(...); \
> ...
> 
I think we should use "addr" to determine the size:

rte_wait_event(addr, mask, expected, cond, memorder)
if (sizeof(*addr) == sizeof(uint16_t))
	uint16_t value;                                                        \
	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
	__LOAD_EXC_16(addr, value, memorder)				       \
	if ((value & mask) cond expected) {				       \
		__SEVL()						       \
		do {							       \
			__WFE()						       \
			__LOAD_EXC_16(addr, value, memorder)		       \
		} while ((value & mask) cond expected);			       \
	}	
if (sizeof(*addr) == sizeof(uint32_t))
	..........
if (sizeof(*addr) == sizeof(uint64_t))
	...........

> > And for consistency, we also use 3 APIs in generic path.
> Honestly, even one multi-line macro doesn't look nice.
> Having 3 identical ones looks even worse.
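
For the generic (non-Arm) path, the three identical macros could likewise
collapse into a single one, along the lines already sketched in this thread.
A hypothetical combined version, with the parameters parenthesised and the
assert() replaced by a compile-time check, might be:

#define rte_wait_event(addr, mask, expected, cond, memorder)                   \
do {                                                                           \
	RTE_BUILD_BUG_ON((memorder) != __ATOMIC_ACQUIRE &&                     \
		(memorder) != __ATOMIC_RELAXED);                               \
									       \
	while ((__atomic_load_n((addr), (memorder)) & (mask)) cond (expected)) \
		rte_pause();                                                   \
} while (0)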
  
Feifei Wang Oct. 14, 2021, 3:14 a.m. UTC | #6
> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Thursday, October 14, 2021 1:00 AM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: Feifei Wang <Feifei.Wang2@arm.com>; Ruifeng Wang
> <Ruifeng.Wang@arm.com>; dev@dpdk.org; nd <nd@arm.com>
> Subject: Re: [dpdk-dev] [RFC PATCH v3 1/5] eal: add new definitions for wait
> scheme
> 
> On Wed, 13 Oct 2021 15:03:56 +0000
> "Ananyev, Konstantin" <konstantin.ananyev@intel.com> wrote:
> 
> > > addr value' can be before other operations.
> > >
> > > > BTW, if we expect memorder to always be a constant, might be
> > > > better BUILD_BUG_ON()?
> > > If I understand correctly, you means we can replace 'assert' by
> 'build_bug_on':
> > > RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE && memorder
> > > !=__ATOMIC_RELAXED);
> >
> > Yes, that was my thought.
> > In that case I think we should be able to catch wrong memorder at
> compilation stage.
> 
> Maybe:
>    RTE_BUILD_BUG_ON(!__builtin_constant_p(memorder));
>    RTE_BUILD_BUG_ON(memorder != __ATOMIC_ACQUIRE &&
> memorder !=__ATOMIC_RELAXED);
> 
Thanks for your comments. One question on this: why do we need to check that memorder is a constant?
Is it to check whether memorder has been assigned or is NULL?
  

Patch

diff --git a/lib/eal/arm/include/rte_pause_64.h b/lib/eal/arm/include/rte_pause_64.h
index e87d10b8cc..205510e044 100644
--- a/lib/eal/arm/include/rte_pause_64.h
+++ b/lib/eal/arm/include/rte_pause_64.h
@@ -31,20 +31,12 @@  static inline void rte_pause(void)
 /* Put processor into low power WFE(Wait For Event) state. */
 #define __WFE() { asm volatile("wfe" : : : "memory"); }
 
-static __rte_always_inline void
-rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
-		int memorder)
-{
-	uint16_t value;
-
-	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
-
-	/*
-	 * Atomic exclusive load from addr, it returns the 16-bit content of
-	 * *addr while making it 'monitored',when it is written by someone
-	 * else, the 'monitored' state is cleared and a event is generated
-	 * implicitly to exit WFE.
-	 */
+/*
+ * Atomic exclusive load from addr, it returns the 16-bit content of
+ * *addr while making it 'monitored', when it is written by someone
+ * else, the 'monitored' state is cleared and an event is generated
+ * implicitly to exit WFE.
+ */
 #define __LOAD_EXC_16(src, dst, memorder) {               \
 	if (memorder == __ATOMIC_RELAXED) {               \
 		asm volatile("ldxrh %w[tmp], [%x[addr]]"  \
@@ -58,6 +50,52 @@  rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
 			: "memory");                      \
 	} }
 
+/*
+ * Atomic exclusive load from addr, it returns the 32-bit content of
+ * *addr while making it 'monitored', when it is written by someone
+ * else, the 'monitored' state is cleared and an event is generated
+ * implicitly to exit WFE.
+ */
+#define __LOAD_EXC_32(src, dst, memorder) {              \
+	if (memorder == __ATOMIC_RELAXED) {              \
+		asm volatile("ldxr %w[tmp], [%x[addr]]"  \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} else {                                         \
+		asm volatile("ldaxr %w[tmp], [%x[addr]]" \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} }
+
+/*
+ * Atomic exclusive load from addr, it returns the 64-bit content of
+ * *addr while making it 'monitored', when it is written by someone
+ * else, the 'monitored' state is cleared and an event is generated
+ * implicitly to exit WFE.
+ */
+#define __LOAD_EXC_64(src, dst, memorder) {              \
+	if (memorder == __ATOMIC_RELAXED) {              \
+		asm volatile("ldxr %x[tmp], [%x[addr]]"  \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} else {                                         \
+		asm volatile("ldaxr %x[tmp], [%x[addr]]" \
+			: [tmp] "=&r" (dst)              \
+			: [addr] "r"(src)                \
+			: "memory");                     \
+	} }
+
+static __rte_always_inline void
+rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
+		int memorder)
+{
+	uint16_t value;
+
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
+
 	__LOAD_EXC_16(addr, value, memorder)
 	if (value != expected) {
 		__SEVL()
@@ -66,7 +104,6 @@  rte_wait_until_equal_16(volatile uint16_t *addr, uint16_t expected,
 			__LOAD_EXC_16(addr, value, memorder)
 		} while (value != expected);
 	}
-#undef __LOAD_EXC_16
 }
 
 static __rte_always_inline void
@@ -77,25 +114,6 @@  rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
 
 	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
 
-	/*
-	 * Atomic exclusive load from addr, it returns the 32-bit content of
-	 * *addr while making it 'monitored',when it is written by someone
-	 * else, the 'monitored' state is cleared and a event is generated
-	 * implicitly to exit WFE.
-	 */
-#define __LOAD_EXC_32(src, dst, memorder) {              \
-	if (memorder == __ATOMIC_RELAXED) {              \
-		asm volatile("ldxr %w[tmp], [%x[addr]]"  \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} else {                                         \
-		asm volatile("ldaxr %w[tmp], [%x[addr]]" \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} }
-
 	__LOAD_EXC_32(addr, value, memorder)
 	if (value != expected) {
 		__SEVL()
@@ -104,7 +122,6 @@  rte_wait_until_equal_32(volatile uint32_t *addr, uint32_t expected,
 			__LOAD_EXC_32(addr, value, memorder)
 		} while (value != expected);
 	}
-#undef __LOAD_EXC_32
 }
 
 static __rte_always_inline void
@@ -115,25 +132,6 @@  rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
 
 	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);
 
-	/*
-	 * Atomic exclusive load from addr, it returns the 64-bit content of
-	 * *addr while making it 'monitored',when it is written by someone
-	 * else, the 'monitored' state is cleared and a event is generated
-	 * implicitly to exit WFE.
-	 */
-#define __LOAD_EXC_64(src, dst, memorder) {              \
-	if (memorder == __ATOMIC_RELAXED) {              \
-		asm volatile("ldxr %x[tmp], [%x[addr]]"  \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} else {                                         \
-		asm volatile("ldaxr %x[tmp], [%x[addr]]" \
-			: [tmp] "=&r" (dst)              \
-			: [addr] "r"(src)                \
-			: "memory");                     \
-	} }
-
 	__LOAD_EXC_64(addr, value, memorder)
 	if (value != expected) {
 		__SEVL()
@@ -143,6 +141,51 @@  rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
 		} while (value != expected);
 	}
 }
+
+#define rte_wait_event_16(addr, mask, expected, cond, memorder)                \
+do {									       \
+	uint16_t value;                                                        \
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
+	__LOAD_EXC_16(addr, value, memorder)				       \
+	if ((value & mask) cond expected) {				       \
+		__SEVL()						       \
+		do {							       \
+			__WFE()						       \
+			__LOAD_EXC_16(addr, value, memorder)		       \
+		} while ((value & mask) cond expected);			       \
+	}								       \
+} while (0)
+
+#define rte_wait_event_32(addr, mask, expected, cond, memorder)                \
+do {                                                                           \
+	uint32_t value;                                                        \
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
+	__LOAD_EXC_32(addr, value, memorder)                                   \
+	if ((value & mask) cond expected) {                                    \
+		__SEVL()                                                       \
+		do {                                                           \
+			__WFE()                                                \
+			__LOAD_EXC_32(addr, value, memorder)                   \
+		} while ((value & mask) cond expected);                        \
+	}                                                                      \
+} while (0)
+
+#define rte_wait_event_64(addr, mask, expected, cond, memorder)                \
+do {                                                                           \
+	uint64_t value;                                                        \
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
+	__LOAD_EXC_64(addr, value, memorder)                                   \
+	if ((value & mask) cond expected) {                                    \
+		__SEVL()                                                       \
+		do {                                                           \
+			__WFE()                                                \
+			__LOAD_EXC_64(addr, value, memorder)                   \
+		} while ((value & mask) cond expected);                        \
+	}                                                                      \
+} while (0)
+
+#undef __LOAD_EXC_16
+#undef __LOAD_EXC_32
 #undef __LOAD_EXC_64
 
 #undef __SEVL
diff --git a/lib/eal/include/generic/rte_pause.h b/lib/eal/include/generic/rte_pause.h
index 668ee4a184..4e32107eca 100644
--- a/lib/eal/include/generic/rte_pause.h
+++ b/lib/eal/include/generic/rte_pause.h
@@ -111,6 +111,84 @@  rte_wait_until_equal_64(volatile uint64_t *addr, uint64_t expected,
 	while (__atomic_load_n(addr, memorder) != expected)
 		rte_pause();
 }
+
+/*
+ * Wait until a 16-bit *addr breaks the condition, with a relaxed memory
+ * ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param mask
+ *  A mask of value bits in interest
+ * @param expected
+ *  A 16-bit expected value to be in the memory location.
+ * @param cond
+ *  A symbol representing the condition (==, !=).
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+#define rte_wait_event_16(addr, mask, expected, cond, memorder)		       \
+do {									       \
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
+									       \
+	while ((__atomic_load_n(addr, memorder) & mask) cond expected)	       \
+		rte_pause();						       \
+} while (0)
+
+/*
+ * Wait until a 32-bit *addr breaks the condition, with a relaxed memory
+ * ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param mask
+ *  A mask of value bits in interest.
+ * @param expected
+ *  A 32-bit expected value to be in the memory location.
+ * @param cond
+ *  A symbol representing the condition (==, !=).
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+#define rte_wait_event_32(addr, mask, expected, cond, memorder)		       \
+do {									       \
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
+									       \
+	while ((__atomic_load_n(addr, memorder) & mask) cond expected)	       \
+		rte_pause();						       \
+} while (0)
+
+/*
+ * Wait until a 64-bit *addr breaks the condition, with a relaxed memory
+ * ordering model meaning the loads around this API can be reordered.
+ *
+ * @param addr
+ *  A pointer to the memory location.
+ * @param mask
+ *  A mask of value bits in interest
+ * @param expected
+ *  A 64-bit expected value to be in the memory location.
+ * @param cond
+ *  A symbol representing the condition (==, !=).
+ * @param memorder
+ *  Two different memory orders that can be specified:
+ *  __ATOMIC_ACQUIRE and __ATOMIC_RELAXED. These map to
+ *  C++11 memory orders with the same names, see the C++11 standard or
+ *  the GCC wiki on atomic synchronization for detailed definition.
+ */
+#define rte_wait_event_64(addr, mask, expected, cond, memorder)		       \
+do {									       \
+	assert(memorder == __ATOMIC_ACQUIRE || memorder == __ATOMIC_RELAXED);  \
+									       \
+	while ((__atomic_load_n(addr, memorder) & mask) cond expected)	       \
+		rte_pause();						       \
+} while (0)
 #endif
 
 #endif /* _RTE_PAUSE_H_ */