[1/1] eal: add 128-bit cmpset (x86-64 only)

Message ID 20190128172945.27251-2-gage.eads@intel.com
State New
Delegated to: Thomas Monjalon
Headers show
Series
  • Add 128-bit compare and set
Related show

Checks

Context Check Description
ci/Intel-compilation success Compilation OK
ci/intel-Performance-Testing success Performance Testing PASS
ci/mellanox-Performance-Testing success Performance Testing PASS
ci/checkpatch success coding style OK

Commit Message

Eads, Gage Jan. 28, 2019, 5:29 p.m.
This operation can be used for non-blocking algorithms, such as a
non-blocking stack or ring.

Signed-off-by: Gage Eads <gage.eads@intel.com>
---
 .../common/include/arch/x86/rte_atomic_64.h        | 31 +++++++++++
 lib/librte_eal/common/include/generic/rte_atomic.h | 65 ++++++++++++++++++++++
 2 files changed, 96 insertions(+)

Comments

Ola Liljedahl Jan. 28, 2019, 11:01 p.m. | #1
On Mon, 2019-01-28 at 11:29 -0600, Gage Eads wrote:
> This operation can be used for non-blocking algorithms, such as a
> non-blocking stack or ring.
> 
> Signed-off-by: Gage Eads <gage.eads@intel.com>
> ---
>  .../common/include/arch/x86/rte_atomic_64.h        | 31 +++++++++++
>  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> ++++++++++++++++++++++
>  2 files changed, 96 insertions(+)
> 
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> index fd2ec9c53..b7b90b83e 100644
> --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> @@ -34,6 +34,7 @@
>  /*
>   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
>   * Copyright (c) 1998 Doug Rabson
> + * Copyright (c) 2019 Intel Corporation
>   * All rights reserved.
>   */
>  
> @@ -46,6 +47,7 @@
>  
>  #include <stdint.h>
>  #include <rte_common.h>
> +#include <rte_compat.h>
>  #include <rte_atomic.h>
>  
>  /*------------------------- 64 bit atomic operations ------------------------
> -*/
> @@ -208,4 +210,33 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
>  }
>  #endif
>  
> +static inline int __rte_experimental
__rte_always_inline?

> +rte_atomic128_cmpset(volatile rte_int128_t *dst,
No need to declare the location volatile. Volatile doesn't do what you think it
does.
https://youtu.be/lkgszkPnV8g?t=1027


> +		     rte_int128_t *exp,
I would declare 'exp' const as well and document that 'exp' is not updated (with
the old value) for a failure. The reason being that ARMv8.0/AArch64 cannot
atomically read the old value without also writing the location and that is bad
for performance (unnecessary writes leads to unnecessary contention and worse
scalability). And the user must anyway read the location (in the start of the
critical section) using e.g. non-atomic 64-bit reads so there isn't actually any
requirement for an atomic 128-bit read of the location.

>  rte_int128_t *src,
const rte_int128_t *src?

But why are we not passing 'exp' and 'src' by value? That works great, even with
structs. Passing by value simplifies the compiler's life, especially if the call
is inlined. Ask a compiler developer.

> +		     unsigned int weak,
> +		     enum rte_atomic_memmodel_t success,
> +		     enum rte_atomic_memmodel_t failure)
> +{
> +	RTE_SET_USED(weak);
> +	RTE_SET_USED(success);
> +	RTE_SET_USED(failure);
> +	uint8_t res;
> +
> +	asm volatile (
> +		      MPLOCKED
> +		      "cmpxchg16b %[dst];"
> +		      " sete %[res]"
> +		      : [dst] "=m" (dst->val[0]),
> +			"=A" (exp->val[0]),
> +			[res] "=r" (res)
> +		      : "c" (src->val[1]),
> +			"b" (src->val[0]),
> +			"m" (dst->val[0]),
> +			"d" (exp->val[1]),
> +			"a" (exp->val[0])
> +		      : "memory");
> +
> +	return res;
> +}
> +
>  #endif /* _RTE_ATOMIC_X86_64_H_ */
> diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> b/lib/librte_eal/common/include/generic/rte_atomic.h
> index b99ba4688..8d612d566 100644
> --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> @@ -14,6 +14,7 @@
>  
>  #include <stdint.h>
>  #include <rte_common.h>
> +#include <rte_compat.h>
>  
>  #ifdef __DOXYGEN__
>  
> @@ -1082,4 +1083,68 @@ static inline void rte_atomic64_clear(rte_atomic64_t
> *v)
>  }
>  #endif
>  
> +/*------------------------ 128 bit atomic operations ------------------------
> -*/
> +
> +/**
> + * 128-bit integer structure.
> + */
> +typedef struct {
> +	uint64_t val[2];
> +} __rte_aligned(16) rte_int128_t;
So we can't use __int128?

> +
> +/**
> + * Memory consistency models used in atomic operations. These control the
> + * behavior of the operation with respect to memory barriers and
> + * thread synchronization.
> + *
> + * These directly match those in the C++11 standard; for details on their
> + * behavior, refer to the standard.
> + */
> +enum rte_atomic_memmodel_t {
> +	RTE_ATOMIC_RELAXED,
> +	RTE_ATOMIC_CONSUME,
> +	RTE_ATOMIC_ACQUIRE,
> +	RTE_ATOMIC_RELEASE,
> +	RTE_ATOMIC_ACQ_REL,
> +	RTE_ATOMIC_SEQ_CST,
> +};
> +
> +/* Only implemented on x86-64 currently. The ifdef prevents compilation from
> + * failing for architectures without a definition of this function.
> + */
> +#if defined(RTE_ARCH_X86_64)
> +/**
> + * An atomic compare and set function used by the mutex functions.
> + * (atomic) equivalent to:
> + *   if (*dst == exp)
> + *     *dst = src (all 128-bit words)
> + *
> + * @param dst
> + *   The destination into which the value will be written.
> + * @param exp
> + *   Pointer to the expected value. If the operation fails, this memory is
> + *   updated with the actual value.
> + * @param src
> + *   Pointer to the new value.
> + * @param weak
> + *   A value of true allows the comparison to spuriously fail.
> Implementations
> + *   may ignore this argument and only implement the strong variant.
> + * @param success
> + *   If successful, the operation's memory behavior conforms to this (or a
> + *   stronger) model.
> + * @param failure
> + *   If unsuccessful, the operation's memory behavior conforms to this (or a
> + *   stronger) model. This argument cannot be RTE_ATOMIC_RELEASE,
> + *   RTE_ATOMIC_ACQ_REL, or a stronger model than success.
> + * @return
> + *   Non-zero on success; 0 on failure.
> + */
> +static inline int __rte_experimental
> +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> +		     rte_int128_t *exp, rte_int128_t *src,
> +		     unsigned int weak,
> +		     enum rte_atomic_memmodel_t success,
> +		     enum rte_atomic_memmodel_t failure);
> +#endif
> +
>  #endif /* _RTE_ATOMIC_H_ */
Honnappa Nagarahalli Jan. 31, 2019, 5:48 a.m. | #2
> 
> This operation can be used for non-blocking algorithms, such as a non-
> blocking stack or ring.
> 
> Signed-off-by: Gage Eads <gage.eads@intel.com>
> ---
>  .../common/include/arch/x86/rte_atomic_64.h        | 31 +++++++++++
>  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> ++++++++++++++++++++++
>  2 files changed, 96 insertions(+)
> 
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> index fd2ec9c53..b7b90b83e 100644
> --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> @@ -34,6 +34,7 @@
>  /*
>   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
>   * Copyright (c) 1998 Doug Rabson
> + * Copyright (c) 2019 Intel Corporation
>   * All rights reserved.
>   */
> 
> @@ -46,6 +47,7 @@
> 
>  #include <stdint.h>
>  #include <rte_common.h>
> +#include <rte_compat.h>
>  #include <rte_atomic.h>
> 
>  /*------------------------- 64 bit atomic operations -------------------------*/ @@ -
> 208,4 +210,33 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)  }
> #endif
> 
> +static inline int __rte_experimental
> +rte_atomic128_cmpset(volatile rte_int128_t *dst,
Does it make sense to call is rte_atomic128_compare_exchange (or ..._cmp_xchg) to indicate it is a compare-exchange operation?

> +		     rte_int128_t *exp, rte_int128_t *src,
> +		     unsigned int weak,
> +		     enum rte_atomic_memmodel_t success,
> +		     enum rte_atomic_memmodel_t failure) {
> +	RTE_SET_USED(weak);
> +	RTE_SET_USED(success);
> +	RTE_SET_USED(failure);
> +	uint8_t res;
> +
> +	asm volatile (
> +		      MPLOCKED
> +		      "cmpxchg16b %[dst];"
> +		      " sete %[res]"
> +		      : [dst] "=m" (dst->val[0]),
> +			"=A" (exp->val[0]),
> +			[res] "=r" (res)
> +		      : "c" (src->val[1]),
> +			"b" (src->val[0]),
> +			"m" (dst->val[0]),
> +			"d" (exp->val[1]),
> +			"a" (exp->val[0])
> +		      : "memory");
> +
> +	return res;
> +}
> +
>  #endif /* _RTE_ATOMIC_X86_64_H_ */
> diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> b/lib/librte_eal/common/include/generic/rte_atomic.h
> index b99ba4688..8d612d566 100644
> --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> @@ -14,6 +14,7 @@
> 
>  #include <stdint.h>
>  #include <rte_common.h>
> +#include <rte_compat.h>
> 
>  #ifdef __DOXYGEN__
> 
> @@ -1082,4 +1083,68 @@ static inline void
> rte_atomic64_clear(rte_atomic64_t *v)  }  #endif
> 
> +/*------------------------ 128 bit atomic operations
> +-------------------------*/
> +
> +/**
> + * 128-bit integer structure.
> + */
> +typedef struct {
> +	uint64_t val[2];
> +} __rte_aligned(16) rte_int128_t;
It looks like '__int128' is available from gcc 4.6. I think we should use '__int128'. We can have it as an internal structure for ease of programming.

> +
> +/**
> + * Memory consistency models used in atomic operations. These control
> +the
> + * behavior of the operation with respect to memory barriers and
> + * thread synchronization.
> + *
> + * These directly match those in the C++11 standard; for details on
> +their
> + * behavior, refer to the standard.
> + */
> +enum rte_atomic_memmodel_t {
> +	RTE_ATOMIC_RELAXED,
> +	RTE_ATOMIC_CONSUME,
> +	RTE_ATOMIC_ACQUIRE,
> +	RTE_ATOMIC_RELEASE,
> +	RTE_ATOMIC_ACQ_REL,
> +	RTE_ATOMIC_SEQ_CST,
> +};
IMO, we can use the GCC provided names. I do not see any advantage to defining our own.

> +
> +/* Only implemented on x86-64 currently. The ifdef prevents compilation
> +from
> + * failing for architectures without a definition of this function.
> + */
Minor comment. We can skip the above comments, the #if below is pretty obvious.

> +#if defined(RTE_ARCH_X86_64)
> +/**
> + * An atomic compare and set function used by the mutex functions.
> + * (atomic) equivalent to:
> + *   if (*dst == exp)
> + *     *dst = src (all 128-bit words)
> + *
> + * @param dst
> + *   The destination into which the value will be written.
> + * @param exp
> + *   Pointer to the expected value. If the operation fails, this memory is
> + *   updated with the actual value.
> + * @param src
> + *   Pointer to the new value.
> + * @param weak
> + *   A value of true allows the comparison to spuriously fail.
> Implementations
> + *   may ignore this argument and only implement the strong variant.
> + * @param success
> + *   If successful, the operation's memory behavior conforms to this (or a
> + *   stronger) model.
> + * @param failure
> + *   If unsuccessful, the operation's memory behavior conforms to this (or a
> + *   stronger) model. This argument cannot be RTE_ATOMIC_RELEASE,
> + *   RTE_ATOMIC_ACQ_REL, or a stronger model than success.
> + * @return
> + *   Non-zero on success; 0 on failure.
> + */
> +static inline int __rte_experimental
> +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> +		     rte_int128_t *exp, rte_int128_t *src,
> +		     unsigned int weak,
> +		     enum rte_atomic_memmodel_t success,
> +		     enum rte_atomic_memmodel_t failure); #endif
> +
>  #endif /* _RTE_ATOMIC_H_ */
> --
> 2.13.6
Eads, Gage Feb. 1, 2019, 5:06 p.m. | #3
> -----Original Message-----
> From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> Sent: Monday, January 28, 2019 5:02 PM
> To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> Cc: arybchenko@solarflare.com; jerinj@marvell.com;
> chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> olivier.matz@6wind.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>
> Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
> 
> On Mon, 2019-01-28 at 11:29 -0600, Gage Eads wrote:
> > This operation can be used for non-blocking algorithms, such as a
> > non-blocking stack or ring.
> >
> > Signed-off-by: Gage Eads <gage.eads@intel.com>
> > ---
> >  .../common/include/arch/x86/rte_atomic_64.h        | 31 +++++++++++
> >  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> > ++++++++++++++++++++++
> >  2 files changed, 96 insertions(+)
> >
> > diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > index fd2ec9c53..b7b90b83e 100644
> > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > @@ -34,6 +34,7 @@
> >  /*
> >   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
> >   * Copyright (c) 1998 Doug Rabson
> > + * Copyright (c) 2019 Intel Corporation
> >   * All rights reserved.
> >   */
> >
> > @@ -46,6 +47,7 @@
> >
> >  #include <stdint.h>
> >  #include <rte_common.h>
> > +#include <rte_compat.h>
> >  #include <rte_atomic.h>
> >
> >  /*------------------------- 64 bit atomic operations
> > ------------------------ -*/ @@ -208,4 +210,33 @@ static inline void
> > rte_atomic64_clear(rte_atomic64_t *v)
> >  }
> >  #endif
> >
> > +static inline int __rte_experimental
> __rte_always_inline?
> 
> > +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> No need to declare the location volatile. Volatile doesn't do what you think it
> does.
> https://youtu.be/lkgszkPnV8g?t=1027
> 

I made this volatile to match the existing rte_atomicN_cmpset definitions, which presumably have a good reason for using the keyword. Maintainers, any input here?

> 
> > +		     rte_int128_t *exp,
> I would declare 'exp' const as well and document that 'exp' is not updated (with
> the old value) for a failure. The reason being that ARMv8.0/AArch64 cannot
> atomically read the old value without also writing the location and that is bad
> for performance (unnecessary writes leads to unnecessary contention and
> worse scalability). And the user must anyway read the location (in the start of
> the critical section) using e.g. non-atomic 64-bit reads so there isn't actually any
> requirement for an atomic 128-bit read of the location.
> 

Will change in v2.

> >  rte_int128_t *src,
> const rte_int128_t *src?

Sure, I don't see any harm in using const.

> 
> But why are we not passing 'exp' and 'src' by value? That works great, even with
> structs. Passing by value simplifies the compiler's life, especially if the call is
> inlined. Ask a compiler developer.

I ran objdump on the nb_stack code with both approaches, and pass-by-reference resulted in fewer overall x86_64 assembly ops.
PBV: 100 ops for push, 97 ops for pop
PBR: 92 ops for push, 84 ops for pop

(Using the in-progress v5 nb_stack code)

Another factor -- though much less compelling -- is that with pass-by-reference, the user can create a 16B structure and cast it to rte_int128_t when they call rte_atomic128_cmpset, whereas with pass-by-value they need to put that struct in a union with rte_int128_t.

> 
> > +		     unsigned int weak,
> > +		     enum rte_atomic_memmodel_t success,
> > +		     enum rte_atomic_memmodel_t failure) {
> > +	RTE_SET_USED(weak);
> > +	RTE_SET_USED(success);
> > +	RTE_SET_USED(failure);
> > +	uint8_t res;
> > +
> > +	asm volatile (
> > +		      MPLOCKED
> > +		      "cmpxchg16b %[dst];"
> > +		      " sete %[res]"
> > +		      : [dst] "=m" (dst->val[0]),
> > +			"=A" (exp->val[0]),
> > +			[res] "=r" (res)
> > +		      : "c" (src->val[1]),
> > +			"b" (src->val[0]),
> > +			"m" (dst->val[0]),
> > +			"d" (exp->val[1]),
> > +			"a" (exp->val[0])
> > +		      : "memory");
> > +
> > +	return res;
> > +}
> > +
> >  #endif /* _RTE_ATOMIC_X86_64_H_ */
> > diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> > b/lib/librte_eal/common/include/generic/rte_atomic.h
> > index b99ba4688..8d612d566 100644
> > --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> > +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> > @@ -14,6 +14,7 @@
> >
> >  #include <stdint.h>
> >  #include <rte_common.h>
> > +#include <rte_compat.h>
> >
> >  #ifdef __DOXYGEN__
> >
> > @@ -1082,4 +1083,68 @@ static inline void
> > rte_atomic64_clear(rte_atomic64_t
> > *v)
> >  }
> >  #endif
> >
> > +/*------------------------ 128 bit atomic operations
> > +------------------------
> > -*/
> > +
> > +/**
> > + * 128-bit integer structure.
> > + */
> > +typedef struct {
> > +	uint64_t val[2];
> > +} __rte_aligned(16) rte_int128_t;
> So we can't use __int128?
> 

I'll put it in a union with val[2], in case any implementations want to use it.

Thanks,
Gage

[snip]
Eads, Gage Feb. 1, 2019, 5:11 p.m. | #4
> -----Original Message-----
> From: Honnappa Nagarahalli [mailto:Honnappa.Nagarahalli@arm.com]
> Sent: Wednesday, January 30, 2019 11:48 PM
> To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> Cc: olivier.matz@6wind.com; arybchenko@solarflare.com; Richardson, Bruce
> <bruce.richardson@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>; nd <nd@arm.com>; chaozhu@linux.vnet.ibm.com;
> jerinj@marvell.com; hemant.agrawal@nxp.com; nd <nd@arm.com>
> Subject: RE: [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
> 
> >
> > This operation can be used for non-blocking algorithms, such as a non-
> > blocking stack or ring.
> >
> > Signed-off-by: Gage Eads <gage.eads@intel.com>
> > ---
> >  .../common/include/arch/x86/rte_atomic_64.h        | 31 +++++++++++
> >  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> > ++++++++++++++++++++++
> >  2 files changed, 96 insertions(+)
> >
> > diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > index fd2ec9c53..b7b90b83e 100644
> > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > @@ -34,6 +34,7 @@
> >  /*
> >   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
> >   * Copyright (c) 1998 Doug Rabson
> > + * Copyright (c) 2019 Intel Corporation
> >   * All rights reserved.
> >   */
> >
> > @@ -46,6 +47,7 @@
> >
> >  #include <stdint.h>
> >  #include <rte_common.h>
> > +#include <rte_compat.h>
> >  #include <rte_atomic.h>
> >
> >  /*------------------------- 64 bit atomic operations
> > -------------------------*/ @@ -
> > 208,4 +210,33 @@ static inline void rte_atomic64_clear(rte_atomic64_t
> > *v)  } #endif
> >
> > +static inline int __rte_experimental
> > +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> Does it make sense to call is rte_atomic128_compare_exchange (or
> ..._cmp_xchg) to indicate it is a compare-exchange operation?
> 

Good point, though for v2 I'm planning to change this to true cmpset semantics (no update to exp on failure). See Ola's reply for the justification.

> > +		     rte_int128_t *exp, rte_int128_t *src,
> > +		     unsigned int weak,
> > +		     enum rte_atomic_memmodel_t success,
> > +		     enum rte_atomic_memmodel_t failure) {
> > +	RTE_SET_USED(weak);
> > +	RTE_SET_USED(success);
> > +	RTE_SET_USED(failure);
> > +	uint8_t res;
> > +
> > +	asm volatile (
> > +		      MPLOCKED
> > +		      "cmpxchg16b %[dst];"
> > +		      " sete %[res]"
> > +		      : [dst] "=m" (dst->val[0]),
> > +			"=A" (exp->val[0]),
> > +			[res] "=r" (res)
> > +		      : "c" (src->val[1]),
> > +			"b" (src->val[0]),
> > +			"m" (dst->val[0]),
> > +			"d" (exp->val[1]),
> > +			"a" (exp->val[0])
> > +		      : "memory");
> > +
> > +	return res;
> > +}
> > +
> >  #endif /* _RTE_ATOMIC_X86_64_H_ */
> > diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> > b/lib/librte_eal/common/include/generic/rte_atomic.h
> > index b99ba4688..8d612d566 100644
> > --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> > +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> > @@ -14,6 +14,7 @@
> >
> >  #include <stdint.h>
> >  #include <rte_common.h>
> > +#include <rte_compat.h>
> >
> >  #ifdef __DOXYGEN__
> >
> > @@ -1082,4 +1083,68 @@ static inline void
> > rte_atomic64_clear(rte_atomic64_t *v)  }  #endif
> >
> > +/*------------------------ 128 bit atomic operations
> > +-------------------------*/
> > +
> > +/**
> > + * 128-bit integer structure.
> > + */
> > +typedef struct {
> > +	uint64_t val[2];
> > +} __rte_aligned(16) rte_int128_t;
> It looks like '__int128' is available from gcc 4.6. I think we should use '__int128'.
> We can have it as an internal structure for ease of programming.
> 

Will add in v2.

> > +
> > +/**
> > + * Memory consistency models used in atomic operations. These control
> > +the
> > + * behavior of the operation with respect to memory barriers and
> > + * thread synchronization.
> > + *
> > + * These directly match those in the C++11 standard; for details on
> > +their
> > + * behavior, refer to the standard.
> > + */
> > +enum rte_atomic_memmodel_t {
> > +	RTE_ATOMIC_RELAXED,
> > +	RTE_ATOMIC_CONSUME,
> > +	RTE_ATOMIC_ACQUIRE,
> > +	RTE_ATOMIC_RELEASE,
> > +	RTE_ATOMIC_ACQ_REL,
> > +	RTE_ATOMIC_SEQ_CST,
> > +};
> IMO, we can use the GCC provided names. I do not see any advantage to
> defining our own.
> 

Will change in v2. I was trying to avoid issues with GCC versions that don't have C++11 support, but DPDK's recommended minimum version (4.9) is later than the version that added __atomic builtins (4.7).

> > +
> > +/* Only implemented on x86-64 currently. The ifdef prevents
> > +compilation from
> > + * failing for architectures without a definition of this function.
> > + */
> Minor comment. We can skip the above comments, the #if below is pretty
> obvious.
> 

Sure.

Thanks,
Gage

<snip>
Ola Liljedahl Feb. 1, 2019, 7:01 p.m. | #5
On Fri, 2019-02-01 at 17:06 +0000, Eads, Gage wrote:
> 
> > 
> > -----Original Message-----
> > From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> > Sent: Monday, January 28, 2019 5:02 PM
> > To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> > Cc: arybchenko@solarflare.com; jerinj@marvell.com;
> > chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>; Ananyev, Konstantin
> > <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> > olivier.matz@6wind.com; Honnappa Nagarahalli
> > <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> > <Gavin.Hu@arm.com>
> > Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
> > 
> > On Mon, 2019-01-28 at 11:29 -0600, Gage Eads wrote:
> > > 
> > > This operation can be used for non-blocking algorithms, such as a
> > > non-blocking stack or ring.
> > > 
> > > Signed-off-by: Gage Eads <gage.eads@intel.com>
> > > ---
> > >  .../common/include/arch/x86/rte_atomic_64.h        | 31 +++++++++++
> > >  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> > > ++++++++++++++++++++++
> > >  2 files changed, 96 insertions(+)
> > > 
> > > diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > index fd2ec9c53..b7b90b83e 100644
> > > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > @@ -34,6 +34,7 @@
> > >  /*
> > >   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
> > >   * Copyright (c) 1998 Doug Rabson
> > > + * Copyright (c) 2019 Intel Corporation
> > >   * All rights reserved.
> > >   */
> > > 
> > > @@ -46,6 +47,7 @@
> > > 
> > >  #include <stdint.h>
> > >  #include <rte_common.h>
> > > +#include <rte_compat.h>
> > >  #include <rte_atomic.h>
> > > 
> > >  /*------------------------- 64 bit atomic operations
> > > ------------------------ -*/ @@ -208,4 +210,33 @@ static inline void
> > > rte_atomic64_clear(rte_atomic64_t *v)
> > >  }
> > >  #endif
> > > 
> > > +static inline int __rte_experimental
> > __rte_always_inline?
> > 
> > > 
> > > +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> > No need to declare the location volatile. Volatile doesn't do what you think
> > it
> > does.
> > https://youtu.be/lkgszkPnV8g?t=1027
> > 
> I made this volatile to match the existing rte_atomicN_cmpset definitions,
> which presumably have a good reason for using the keyword. Maintainers, any
> input here?
> 
> > 
> > 
> > > 
> > > +		     rte_int128_t *exp,
> > I would declare 'exp' const as well and document that 'exp' is not updated
> > (with
> > the old value) for a failure. The reason being that ARMv8.0/AArch64 cannot
> > atomically read the old value without also writing the location and that is
> > bad
> > for performance (unnecessary writes leads to unnecessary contention and
> > worse scalability). And the user must anyway read the location (in the start
> > of
> > the critical section) using e.g. non-atomic 64-bit reads so there isn't
> > actually any
> > requirement for an atomic 128-bit read of the location.
> > 
> Will change in v2.
> 
> > 
> > > 
> > >  rte_int128_t *src,
> > const rte_int128_t *src?
> Sure, I don't see any harm in using const.
> 
> > 
> > 
> > But why are we not passing 'exp' and 'src' by value? That works great, even
> > with
> > structs. Passing by value simplifies the compiler's life, especially if the
> > call is
> > inlined. Ask a compiler developer.
> I ran objdump on the nb_stack code with both approaches, and pass-by-reference 
> resulted in fewer overall x86_64 assembly ops.
> PBV: 100 ops for push, 97 ops for pop
> PBR: 92 ops for push, 84 ops for pop
OK I have never checked x86_64 code generation... I have good experiences with
ARM/AArch64, everything seems to be done using registers. I am surprised there
is a difference.

Did a quick check with lfring, passing 'src' (third param) by reference and by
value. No difference in code generation on x86_64.

But if you insist let's go with PBR.

> 
> (Using the in-progress v5 nb_stack code)
> 
> Another factor -- though much less compelling -- is that with pass-by-
> reference, the user can create a 16B structure and cast it to rte_int128_t
> when they call rte_atomic128_cmpset, whereas with pass-by-value they need to
> put that struct in a union with rte_int128_t.
Which is what I always do nowadays... Trying to use as few casts as possible and
lie to the compiler as seldom as possible. But I can see the freedom provided by
taking a pointer to something and cast it it rte_int128_t ptr in the call
to rte_atomic128_cmpset().

Would prefer a name that is more similar to __atomic_compare_exchange(). E.g.
rte_atomic128_compare_exchange() (or perhaps just rte_atomic128_cmpxchg)? All
the rte_atomicXX_cmpset() functions do not take any memory order parameters.
From an Arm perspective, we are not happy with that.

> 
> > 
> > 
> > > 
> > > +		     unsigned int weak,
> > > +		     enum rte_atomic_memmodel_t success,
> > > +		     enum rte_atomic_memmodel_t failure) {
> > > +	RTE_SET_USED(weak);
> > > +	RTE_SET_USED(success);
> > > +	RTE_SET_USED(failure);
> > > +	uint8_t res;
> > > +
> > > +	asm volatile (
> > > +		      MPLOCKED
> > > +		      "cmpxchg16b %[dst];"
> > > +		      " sete %[res]"
> > > +		      : [dst] "=m" (dst->val[0]),
> > > +			"=A" (exp->val[0]),
> > > +			[res] "=r" (res)
> > > +		      : "c" (src->val[1]),
> > > +			"b" (src->val[0]),
> > > +			"m" (dst->val[0]),
> > > +			"d" (exp->val[1]),
> > > +			"a" (exp->val[0])
> > > +		      : "memory");
> > > +
> > > +	return res;
> > > +}
> > > +
> > >  #endif /* _RTE_ATOMIC_X86_64_H_ */
> > > diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> > > b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > index b99ba4688..8d612d566 100644
> > > --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> > > +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > @@ -14,6 +14,7 @@
> > > 
> > >  #include <stdint.h>
> > >  #include <rte_common.h>
> > > +#include <rte_compat.h>
> > > 
> > >  #ifdef __DOXYGEN__
> > > 
> > > @@ -1082,4 +1083,68 @@ static inline void
> > > rte_atomic64_clear(rte_atomic64_t
> > > *v)
> > >  }
> > >  #endif
> > > 
> > > +/*------------------------ 128 bit atomic operations
> > > +------------------------
> > > -*/
> > > +
> > > +/**
> > > + * 128-bit integer structure.
> > > + */
> > > +typedef struct {
> > > +	uint64_t val[2];
> > > +} __rte_aligned(16) rte_int128_t;
> > So we can't use __int128?
> > 
> I'll put it in a union with val[2], in case any implementations want to use
> it.
Thinking on this one more time, since the inline asm functions (e.g. for x86_64
cmpxchg16b and for AArch64 LDXP/STXP) anyway will use 64-bit registers, it makes
most sense to make rte_int128_t a struct of 2x64b. The question is whether to
use an array like above or a struct with two elements (which I normally do
internally). Can you compare code generation with the following definition?
typedef struct {
        uint64_t lo, hi;
} __rte_aligned(16) rte_int128_t;

> 
> Thanks,
> Gage
> 
> [snip]
Eads, Gage Feb. 1, 2019, 7:28 p.m. | #6
> -----Original Message-----
> From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> Sent: Friday, February 1, 2019 1:02 PM
> To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> Cc: jerinj@marvell.com; chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>;
> Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> olivier.matz@6wind.com; arybchenko@solarflare.com; Gavin Hu (Arm
> Technology China) <Gavin.Hu@arm.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>
> Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
> 
> On Fri, 2019-02-01 at 17:06 +0000, Eads, Gage wrote:
> >
> > >
> > > -----Original Message-----
> > > From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> > > Sent: Monday, January 28, 2019 5:02 PM
> > > To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> > > Cc: arybchenko@solarflare.com; jerinj@marvell.com;
> > > chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>; Richardson, Bruce
> > > <bruce.richardson@intel.com>; Ananyev, Konstantin
> > > <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> > > olivier.matz@6wind.com; Honnappa Nagarahalli
> > > <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> > > <Gavin.Hu@arm.com>
> > > Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64
> > > only)
> > >
> > > On Mon, 2019-01-28 at 11:29 -0600, Gage Eads wrote:
> > > >
> > > > This operation can be used for non-blocking algorithms, such as a
> > > > non-blocking stack or ring.
> > > >
> > > > Signed-off-by: Gage Eads <gage.eads@intel.com>
> > > > ---
> > > >  .../common/include/arch/x86/rte_atomic_64.h        | 31
> > > > +++++++++++
> > > >  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> > > > ++++++++++++++++++++++
> > > >  2 files changed, 96 insertions(+)
> > > >
> > > > diff --git
> > > > a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > index fd2ec9c53..b7b90b83e 100644
> > > > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > @@ -34,6 +34,7 @@
> > > >  /*
> > > >   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
> > > >   * Copyright (c) 1998 Doug Rabson
> > > > + * Copyright (c) 2019 Intel Corporation
> > > >   * All rights reserved.
> > > >   */
> > > >
> > > > @@ -46,6 +47,7 @@
> > > >
> > > >  #include <stdint.h>
> > > >  #include <rte_common.h>
> > > > +#include <rte_compat.h>
> > > >  #include <rte_atomic.h>
> > > >
> > > >  /*------------------------- 64 bit atomic operations
> > > > ------------------------ -*/ @@ -208,4 +210,33 @@ static inline
> > > > void rte_atomic64_clear(rte_atomic64_t *v)
> > > >  }
> > > >  #endif
> > > >
> > > > +static inline int __rte_experimental
> > > __rte_always_inline?
> > >
> > > >
> > > > +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> > > No need to declare the location volatile. Volatile doesn't do what
> > > you think it does.
> > > https://youtu.be/lkgszkPnV8g?t=1027
> > >
> > I made this volatile to match the existing rte_atomicN_cmpset
> > definitions, which presumably have a good reason for using the
> > keyword. Maintainers, any input here?
> >
> > >
> > >
> > > >
> > > > +		     rte_int128_t *exp,
> > > I would declare 'exp' const as well and document that 'exp' is not
> > > updated (with the old value) for a failure. The reason being that
> > > ARMv8.0/AArch64 cannot atomically read the old value without also
> > > writing the location and that is bad for performance (unnecessary
> > > writes leads to unnecessary contention and worse scalability). And
> > > the user must anyway read the location (in the start of the critical
> > > section) using e.g. non-atomic 64-bit reads so there isn't actually
> > > any requirement for an atomic 128-bit read of the location.
> > >
> > Will change in v2.
> >
> > >
> > > >
> > > >  rte_int128_t *src,
> > > const rte_int128_t *src?
> > Sure, I don't see any harm in using const.
> >
> > >
> > >
> > > But why are we not passing 'exp' and 'src' by value? That works
> > > great, even with structs. Passing by value simplifies the compiler's
> > > life, especially if the call is inlined. Ask a compiler developer.
> > I ran objdump on the nb_stack code with both approaches, and
> > pass-by-reference resulted in fewer overall x86_64 assembly ops.
> > PBV: 100 ops for push, 97 ops for pop
> > PBR: 92 ops for push, 84 ops for pop
> OK I have never checked x86_64 code generation... I have good experiences
> with ARM/AArch64, everything seems to be done using registers. I am surprised
> there is a difference.
> 
> Did a quick check with lfring, passing 'src' (third param) by reference and by
> value. No difference in code generation on x86_64.
> 
> But if you insist let's go with PBR.
> 
> >
> > (Using the in-progress v5 nb_stack code)
> >
> > Another factor -- though much less compelling -- is that with pass-by-
> > reference, the user can create a 16B structure and cast it to
> > rte_int128_t when they call rte_atomic128_cmpset, whereas with
> > pass-by-value they need to put that struct in a union with rte_int128_t.
> Which is what I always do nowadays... Trying to use as few casts as possible and
> lie to the compiler as seldom as possible. But I can see the freedom provided by
> taking a pointer to something and cast it it rte_int128_t ptr in the call
> to rte_atomic128_cmpset().
> 
> Would prefer a name that is more similar to __atomic_compare_exchange().
> E.g.
> rte_atomic128_compare_exchange() (or perhaps just rte_atomic128_cmpxchg)?
> All the rte_atomicXX_cmpset() functions do not take any memory order
> parameters.
> From an Arm perspective, we are not happy with that.

Since the function returns a boolean success value, isn't compare-and-set the appropriate term?

> 
> >
> > >
> > >
> > > >
> > > > +		     unsigned int weak,
> > > > +		     enum rte_atomic_memmodel_t success,
> > > > +		     enum rte_atomic_memmodel_t failure) {
> > > > +	RTE_SET_USED(weak);
> > > > +	RTE_SET_USED(success);
> > > > +	RTE_SET_USED(failure);
> > > > +	uint8_t res;
> > > > +
> > > > +	asm volatile (
> > > > +		      MPLOCKED
> > > > +		      "cmpxchg16b %[dst];"
> > > > +		      " sete %[res]"
> > > > +		      : [dst] "=m" (dst->val[0]),
> > > > +			"=A" (exp->val[0]),
> > > > +			[res] "=r" (res)
> > > > +		      : "c" (src->val[1]),
> > > > +			"b" (src->val[0]),
> > > > +			"m" (dst->val[0]),
> > > > +			"d" (exp->val[1]),
> > > > +			"a" (exp->val[0])
> > > > +		      : "memory");
> > > > +
> > > > +	return res;
> > > > +}
> > > > +
> > > >  #endif /* _RTE_ATOMIC_X86_64_H_ */ diff --git
> > > > a/lib/librte_eal/common/include/generic/rte_atomic.h
> > > > b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > > index b99ba4688..8d612d566 100644
> > > > --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> > > > +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > > @@ -14,6 +14,7 @@
> > > >
> > > >  #include <stdint.h>
> > > >  #include <rte_common.h>
> > > > +#include <rte_compat.h>
> > > >
> > > >  #ifdef __DOXYGEN__
> > > >
> > > > @@ -1082,4 +1083,68 @@ static inline void
> > > > rte_atomic64_clear(rte_atomic64_t
> > > > *v)
> > > >  }
> > > >  #endif
> > > >
> > > > +/*------------------------ 128 bit atomic operations
> > > > +------------------------
> > > > -*/
> > > > +
> > > > +/**
> > > > + * 128-bit integer structure.
> > > > + */
> > > > +typedef struct {
> > > > +	uint64_t val[2];
> > > > +} __rte_aligned(16) rte_int128_t;
> > > So we can't use __int128?
> > >
> > I'll put it in a union with val[2], in case any implementations want
> > to use it.
> Thinking on this one more time, since the inline asm functions (e.g. for x86_64
> cmpxchg16b and for AArch64 LDXP/STXP) anyway will use 64-bit registers, it
> makes most sense to make rte_int128_t a struct of 2x64b. The question is
> whether to use an array like above or a struct with two elements (which I
> normally do internally). Can you compare code generation with the following
> definition?
> typedef struct {
>         uint64_t lo, hi;
> } __rte_aligned(16) rte_int128_t;
> 

Interestingly, that made no difference in the PBV code but added more instructions overall to PBR:
PBV: 100 insts for push, 97 insts for pop
PBR: 100 insts for push, 83 insts for pop

> >
> > Thanks,
> > Gage
> >
> > [snip]
> --
> Ola Liljedahl, Networking System Architect, Arm Phone +46706866373, Skype
> ola.liljedahl
Ola Liljedahl Feb. 1, 2019, 7:43 p.m. | #7
On Fri, 2019-02-01 at 19:28 +0000, Eads, Gage wrote:
> 
> > 
> > -----Original Message-----
> > From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> > Sent: Friday, February 1, 2019 1:02 PM
> > To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> > Cc: jerinj@marvell.com; chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>;
> > Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> > <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> > olivier.matz@6wind.com; arybchenko@solarflare.com; Gavin Hu (Arm
> > Technology China) <Gavin.Hu@arm.com>; Honnappa Nagarahalli
> > <Honnappa.Nagarahalli@arm.com>
> > Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
> > 
> > On Fri, 2019-02-01 at 17:06 +0000, Eads, Gage wrote:
> > > 
> > > 
> > > > 
> > > > 
> > > > -----Original Message-----
> > > > From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> > > > Sent: Monday, January 28, 2019 5:02 PM
> > > > To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> > > > Cc: arybchenko@solarflare.com; jerinj@marvell.com;
> > > > chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>; Richardson, Bruce
> > > > <bruce.richardson@intel.com>; Ananyev, Konstantin
> > > > <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> > > > olivier.matz@6wind.com; Honnappa Nagarahalli
> > > > <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> > > > <Gavin.Hu@arm.com>
> > > > Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64
> > > > only)
> > > > 
> > > > On Mon, 2019-01-28 at 11:29 -0600, Gage Eads wrote:
> > > > > 
> > > > > 
> > > > > This operation can be used for non-blocking algorithms, such as a
> > > > > non-blocking stack or ring.
> > > > > 
> > > > > Signed-off-by: Gage Eads <gage.eads@intel.com>
> > > > > ---
> > > > >  .../common/include/arch/x86/rte_atomic_64.h        | 31
> > > > > +++++++++++
> > > > >  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> > > > > ++++++++++++++++++++++
> > > > >  2 files changed, 96 insertions(+)
> > > > > 
> > > > > diff --git
> > > > > a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > index fd2ec9c53..b7b90b83e 100644
> > > > > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > @@ -34,6 +34,7 @@
> > > > >  /*
> > > > >   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
> > > > >   * Copyright (c) 1998 Doug Rabson
> > > > > + * Copyright (c) 2019 Intel Corporation
> > > > >   * All rights reserved.
> > > > >   */
> > > > > 
> > > > > @@ -46,6 +47,7 @@
> > > > > 
> > > > >  #include <stdint.h>
> > > > >  #include <rte_common.h>
> > > > > +#include <rte_compat.h>
> > > > >  #include <rte_atomic.h>
> > > > > 
> > > > >  /*------------------------- 64 bit atomic operations
> > > > > ------------------------ -*/ @@ -208,4 +210,33 @@ static inline
> > > > > void rte_atomic64_clear(rte_atomic64_t *v)
> > > > >  }
> > > > >  #endif
> > > > > 
> > > > > +static inline int __rte_experimental
> > > > __rte_always_inline?
> > > > 
> > > > > 
> > > > > 
> > > > > +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> > > > No need to declare the location volatile. Volatile doesn't do what
> > > > you think it does.
> > > > https://youtu.be/lkgszkPnV8g?t=1027
> > > > 
> > > I made this volatile to match the existing rte_atomicN_cmpset
> > > definitions, which presumably have a good reason for using the
> > > keyword. Maintainers, any input here?
> > > 
> > > > 
> > > > 
> > > > 
> > > > > 
> > > > > 
> > > > > +		     rte_int128_t *exp,
> > > > I would declare 'exp' const as well and document that 'exp' is not
> > > > updated (with the old value) for a failure. The reason being that
> > > > ARMv8.0/AArch64 cannot atomically read the old value without also
> > > > writing the location and that is bad for performance (unnecessary
> > > > writes leads to unnecessary contention and worse scalability). And
> > > > the user must anyway read the location (in the start of the critical
> > > > section) using e.g. non-atomic 64-bit reads so there isn't actually
> > > > any requirement for an atomic 128-bit read of the location.
> > > > 
> > > Will change in v2.
> > > 
> > > > 
> > > > 
> > > > > 
> > > > > 
> > > > >  rte_int128_t *src,
> > > > const rte_int128_t *src?
> > > Sure, I don't see any harm in using const.
> > > 
> > > > 
> > > > 
> > > > 
> > > > But why are we not passing 'exp' and 'src' by value? That works
> > > > great, even with structs. Passing by value simplifies the compiler's
> > > > life, especially if the call is inlined. Ask a compiler developer.
> > > I ran objdump on the nb_stack code with both approaches, and
> > > pass-by-reference resulted in fewer overall x86_64 assembly ops.
> > > PBV: 100 ops for push, 97 ops for pop
> > > PBR: 92 ops for push, 84 ops for pop
> > OK I have never checked x86_64 code generation... I have good experiences
> > with ARM/AArch64, everything seems to be done using registers. I am
> > surprised
> > there is a difference.
> > 
> > Did a quick check with lfring, passing 'src' (third param) by reference and
> > by
> > value. No difference in code generation on x86_64.
> > 
> > But if you insist let's go with PBR.
> > 
> > > 
> > > 
> > > (Using the in-progress v5 nb_stack code)
> > > 
> > > Another factor -- though much less compelling -- is that with pass-by-
> > > reference, the user can create a 16B structure and cast it to
> > > rte_int128_t when they call rte_atomic128_cmpset, whereas with
> > > pass-by-value they need to put that struct in a union with rte_int128_t.
> > Which is what I always do nowadays... Trying to use as few casts as possible
> > and
> > lie to the compiler as seldom as possible. But I can see the freedom
> > provided by
> > taking a pointer to something and cast it it rte_int128_t ptr in the call
> > to rte_atomic128_cmpset().
> > 
> > Would prefer a name that is more similar to __atomic_compare_exchange().
> > E.g.
> > rte_atomic128_compare_exchange() (or perhaps just rte_atomic128_cmpxchg)?
> > All the rte_atomicXX_cmpset() functions do not take any memory order
> > parameters.
> > From an Arm perspective, we are not happy with that.
> Since the function returns a boolean success value, isn't compare-and-set the
> appropriate term?
I was thinking of the memory ordering parameters that __atomic_xxx builtins have
and we want (from an Arm perspective).

GCC __atomic_compare_exchange also returns a boolean.
https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
bool __atomic_compare_exchange_n (type *ptr, type *expected, type desired, bool
weak, int success_memorder, int failure_memorder)
bool __atomic_compare_exchange (type *ptr, type *expected, type *desired, bool
weak, int success_memorder, int failure_memorder)

rte_atomic128_compare_exchange(rte_int128_t *dst, const rte_int128_t *exp, const
rte_int182_t *src, bool weak, int mo_success, int mo_failure);

> 
> > 
> > 
> > > 
> > > 
> > > > 
> > > > 
> > > > 
> > > > > 
> > > > > 
> > > > > +		     unsigned int weak,
> > > > > +		     enum rte_atomic_memmodel_t success,
> > > > > +		     enum rte_atomic_memmodel_t failure) {
> > > > > +	RTE_SET_USED(weak);
> > > > > +	RTE_SET_USED(success);
> > > > > +	RTE_SET_USED(failure);
> > > > > +	uint8_t res;
> > > > > +
> > > > > +	asm volatile (
> > > > > +		      MPLOCKED
> > > > > +		      "cmpxchg16b %[dst];"
> > > > > +		      " sete %[res]"
> > > > > +		      : [dst] "=m" (dst->val[0]),
> > > > > +			"=A" (exp->val[0]),
> > > > > +			[res] "=r" (res)
> > > > > +		      : "c" (src->val[1]),
> > > > > +			"b" (src->val[0]),
> > > > > +			"m" (dst->val[0]),
> > > > > +			"d" (exp->val[1]),
> > > > > +			"a" (exp->val[0])
> > > > > +		      : "memory");
> > > > > +
> > > > > +	return res;
> > > > > +}
> > > > > +
> > > > >  #endif /* _RTE_ATOMIC_X86_64_H_ */ diff --git
> > > > > a/lib/librte_eal/common/include/generic/rte_atomic.h
> > > > > b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > > > index b99ba4688..8d612d566 100644
> > > > > --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> > > > > +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > > > @@ -14,6 +14,7 @@
> > > > > 
> > > > >  #include <stdint.h>
> > > > >  #include <rte_common.h>
> > > > > +#include <rte_compat.h>
> > > > > 
> > > > >  #ifdef __DOXYGEN__
> > > > > 
> > > > > @@ -1082,4 +1083,68 @@ static inline void
> > > > > rte_atomic64_clear(rte_atomic64_t
> > > > > *v)
> > > > >  }
> > > > >  #endif
> > > > > 
> > > > > +/*------------------------ 128 bit atomic operations
> > > > > +------------------------
> > > > > -*/
> > > > > +
> > > > > +/**
> > > > > + * 128-bit integer structure.
> > > > > + */
> > > > > +typedef struct {
> > > > > +	uint64_t val[2];
> > > > > +} __rte_aligned(16) rte_int128_t;
> > > > So we can't use __int128?
> > > > 
> > > I'll put it in a union with val[2], in case any implementations want
> > > to use it.
> > Thinking on this one more time, since the inline asm functions (e.g. for
> > x86_64
> > cmpxchg16b and for AArch64 LDXP/STXP) anyway will use 64-bit registers, it
> > makes most sense to make rte_int128_t a struct of 2x64b. The question is
> > whether to use an array like above or a struct with two elements (which I
> > normally do internally). Can you compare code generation with the following
> > definition?
> > typedef struct {
> >         uint64_t lo, hi;
> > } __rte_aligned(16) rte_int128_t;
> > 
> Interestingly, that made no difference in the PBV code but added more
> instructions overall to PBR:
> PBV: 100 insts for push, 97 insts for pop
> PBR: 100 insts for push, 83 insts for pop
I think we learned something here... Trying to understand exactly what. But I
think this result settles it.

I should test the different alternatives on Arm, does code generation behave the
same as for x86_64.

> 
> > 
> > > 
> > > 
> > > Thanks,
> > > Gage
> > > 
> > > [snip]
> > --
> > Ola Liljedahl, Networking System Architect, Arm Phone +46706866373, Skype
> > ola.liljedahl
Eads, Gage Feb. 1, 2019, 9:05 p.m. | #8
> -----Original Message-----
> From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> Sent: Friday, February 1, 2019 1:44 PM
> To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> Cc: jerinj@marvell.com; chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>;
> Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> olivier.matz@6wind.com; arybchenko@solarflare.com; Gavin Hu (Arm
> Technology China) <Gavin.Hu@arm.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>
> Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
> 
> On Fri, 2019-02-01 at 19:28 +0000, Eads, Gage wrote:
> >
> > >
> > > -----Original Message-----
> > > From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> > > Sent: Friday, February 1, 2019 1:02 PM
> > > To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> > > Cc: jerinj@marvell.com; chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>;
> > > Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> > > <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> > > olivier.matz@6wind.com; arybchenko@solarflare.com; Gavin Hu (Arm
> > > Technology China) <Gavin.Hu@arm.com>; Honnappa Nagarahalli
> > > <Honnappa.Nagarahalli@arm.com>
> > > Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64
> > > only)
> > >
> > > On Fri, 2019-02-01 at 17:06 +0000, Eads, Gage wrote:
> > > >
> > > >
> > > > >
> > > > >
> > > > > -----Original Message-----
> > > > > From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> > > > > Sent: Monday, January 28, 2019 5:02 PM
> > > > > To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> > > > > Cc: arybchenko@solarflare.com; jerinj@marvell.com;
> > > > > chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>; Richardson, Bruce
> > > > > <bruce.richardson@intel.com>; Ananyev, Konstantin
> > > > > <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> > > > > olivier.matz@6wind.com; Honnappa Nagarahalli
> > > > > <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> > > > > <Gavin.Hu@arm.com>
> > > > > Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset
> > > > > (x86-64
> > > > > only)
> > > > >
> > > > > On Mon, 2019-01-28 at 11:29 -0600, Gage Eads wrote:
> > > > > >
> > > > > >
> > > > > > This operation can be used for non-blocking algorithms, such
> > > > > > as a non-blocking stack or ring.
> > > > > >
> > > > > > Signed-off-by: Gage Eads <gage.eads@intel.com>
> > > > > > ---
> > > > > >  .../common/include/arch/x86/rte_atomic_64.h        | 31
> > > > > > +++++++++++
> > > > > >  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> > > > > > ++++++++++++++++++++++
> > > > > >  2 files changed, 96 insertions(+)
> > > > > >
> > > > > > diff --git
> > > > > > a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > > b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > > index fd2ec9c53..b7b90b83e 100644
> > > > > > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > > @@ -34,6 +34,7 @@
> > > > > >  /*
> > > > > >   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
> > > > > >   * Copyright (c) 1998 Doug Rabson
> > > > > > + * Copyright (c) 2019 Intel Corporation
> > > > > >   * All rights reserved.
> > > > > >   */
> > > > > >
> > > > > > @@ -46,6 +47,7 @@
> > > > > >
> > > > > >  #include <stdint.h>
> > > > > >  #include <rte_common.h>
> > > > > > +#include <rte_compat.h>
> > > > > >  #include <rte_atomic.h>
> > > > > >
> > > > > >  /*------------------------- 64 bit atomic operations
> > > > > > ------------------------ -*/ @@ -208,4 +210,33 @@ static
> > > > > > inline void rte_atomic64_clear(rte_atomic64_t *v)
> > > > > >  }
> > > > > >  #endif
> > > > > >
> > > > > > +static inline int __rte_experimental
> > > > > __rte_always_inline?
> > > > >
> > > > > >
> > > > > >
> > > > > > +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> > > > > No need to declare the location volatile. Volatile doesn't do
> > > > > what you think it does.
> > > > > https://youtu.be/lkgszkPnV8g?t=1027
> > > > >
> > > > I made this volatile to match the existing rte_atomicN_cmpset
> > > > definitions, which presumably have a good reason for using the
> > > > keyword. Maintainers, any input here?
> > > >
> > > > >
> > > > >
> > > > >
> > > > > >
> > > > > >
> > > > > > +		     rte_int128_t *exp,
> > > > > I would declare 'exp' const as well and document that 'exp' is
> > > > > not updated (with the old value) for a failure. The reason being
> > > > > that
> > > > > ARMv8.0/AArch64 cannot atomically read the old value without
> > > > > also writing the location and that is bad for performance
> > > > > (unnecessary writes leads to unnecessary contention and worse
> > > > > scalability). And the user must anyway read the location (in the
> > > > > start of the critical
> > > > > section) using e.g. non-atomic 64-bit reads so there isn't
> > > > > actually any requirement for an atomic 128-bit read of the location.
> > > > >
> > > > Will change in v2.
> > > >
> > > > >
> > > > >
> > > > > >
> > > > > >
> > > > > >  rte_int128_t *src,
> > > > > const rte_int128_t *src?
> > > > Sure, I don't see any harm in using const.
> > > >
> > > > >
> > > > >
> > > > >
> > > > > But why are we not passing 'exp' and 'src' by value? That works
> > > > > great, even with structs. Passing by value simplifies the
> > > > > compiler's life, especially if the call is inlined. Ask a compiler developer.
> > > > I ran objdump on the nb_stack code with both approaches, and
> > > > pass-by-reference resulted in fewer overall x86_64 assembly ops.
> > > > PBV: 100 ops for push, 97 ops for pop
> > > > PBR: 92 ops for push, 84 ops for pop
> > > OK I have never checked x86_64 code generation... I have good
> > > experiences with ARM/AArch64, everything seems to be done using
> > > registers. I am surprised there is a difference.
> > >
> > > Did a quick check with lfring, passing 'src' (third param) by
> > > reference and by value. No difference in code generation on x86_64.
> > >
> > > But if you insist let's go with PBR.
> > >
> > > >
> > > >
> > > > (Using the in-progress v5 nb_stack code)
> > > >
> > > > Another factor -- though much less compelling -- is that with
> > > > pass-by- reference, the user can create a 16B structure and cast
> > > > it to rte_int128_t when they call rte_atomic128_cmpset, whereas
> > > > with pass-by-value they need to put that struct in a union with
> rte_int128_t.
> > > Which is what I always do nowadays... Trying to use as few casts as
> > > possible and lie to the compiler as seldom as possible. But I can
> > > see the freedom provided by taking a pointer to something and cast
> > > it it rte_int128_t ptr in the call to rte_atomic128_cmpset().
> > >
> > > Would prefer a name that is more similar to __atomic_compare_exchange().
> > > E.g.
> > > rte_atomic128_compare_exchange() (or perhaps just
> rte_atomic128_cmpxchg)?
> > > All the rte_atomicXX_cmpset() functions do not take any memory order
> > > parameters.
> > > From an Arm perspective, we are not happy with that.
> > Since the function returns a boolean success value, isn't
> > compare-and-set the appropriate term?
> I was thinking of the memory ordering parameters that __atomic_xxx builtins
> have and we want (from an Arm perspective).
> 
> GCC __atomic_compare_exchange also returns a boolean.
> https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
> bool __atomic_compare_exchange_n (type *ptr, type *expected, type desired,
> bool weak, int success_memorder, int failure_memorder) bool
> __atomic_compare_exchange (type *ptr, type *expected, type *desired, bool
> weak, int success_memorder, int failure_memorder)
> 
> rte_atomic128_compare_exchange(rte_int128_t *dst, const rte_int128_t *exp,
> const rte_int182_t *src, bool weak, int mo_success, int mo_failure);
> 

I assumed GCC chose the name "exchange" because the builtin modifies 'exp' on failure, but I could be wrong. I'm fine with either name. Anyone else have a preference?

<snip>
Ola Liljedahl Feb. 1, 2019, 11:11 p.m. | #9
On Fri, 2019-02-01 at 21:05 +0000, Eads, Gage wrote:
> 
> > 
> > -----Original Message-----
> > From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> > Sent: Friday, February 1, 2019 1:44 PM
> > To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> > Cc: jerinj@marvell.com; chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>;
> > Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> > <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> > olivier.matz@6wind.com; arybchenko@solarflare.com; Gavin Hu (Arm
> > Technology China) <Gavin.Hu@arm.com>; Honnappa Nagarahalli
> > <Honnappa.Nagarahalli@arm.com>
> > Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64 only)
> > 
> > On Fri, 2019-02-01 at 19:28 +0000, Eads, Gage wrote:
> > > 
> > > 
> > > > 
> > > > 
> > > > -----Original Message-----
> > > > From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> > > > Sent: Friday, February 1, 2019 1:02 PM
> > > > To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> > > > Cc: jerinj@marvell.com; chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>;
> > > > Richardson, Bruce <bruce.richardson@intel.com>; Ananyev, Konstantin
> > > > <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> > > > olivier.matz@6wind.com; arybchenko@solarflare.com; Gavin Hu (Arm
> > > > Technology China) <Gavin.Hu@arm.com>; Honnappa Nagarahalli
> > > > <Honnappa.Nagarahalli@arm.com>
> > > > Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset (x86-64
> > > > only)
> > > > 
> > > > On Fri, 2019-02-01 at 17:06 +0000, Eads, Gage wrote:
> > > > > 
> > > > > 
> > > > > 
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > -----Original Message-----
> > > > > > From: Ola Liljedahl [mailto:Ola.Liljedahl@arm.com]
> > > > > > Sent: Monday, January 28, 2019 5:02 PM
> > > > > > To: Eads, Gage <gage.eads@intel.com>; dev@dpdk.org
> > > > > > Cc: arybchenko@solarflare.com; jerinj@marvell.com;
> > > > > > chaozhu@linux.vnet.ibm.com; nd <nd@arm.com>; Richardson, Bruce
> > > > > > <bruce.richardson@intel.com>; Ananyev, Konstantin
> > > > > > <konstantin.ananyev@intel.com>; hemant.agrawal@nxp.com;
> > > > > > olivier.matz@6wind.com; Honnappa Nagarahalli
> > > > > > <Honnappa.Nagarahalli@arm.com>; Gavin Hu (Arm Technology China)
> > > > > > <Gavin.Hu@arm.com>
> > > > > > Subject: Re: [dpdk-dev] [PATCH 1/1] eal: add 128-bit cmpset
> > > > > > (x86-64
> > > > > > only)
> > > > > > 
> > > > > > On Mon, 2019-01-28 at 11:29 -0600, Gage Eads wrote:
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > This operation can be used for non-blocking algorithms, such
> > > > > > > as a non-blocking stack or ring.
> > > > > > > 
> > > > > > > Signed-off-by: Gage Eads <gage.eads@intel.com>
> > > > > > > ---
> > > > > > >  .../common/include/arch/x86/rte_atomic_64.h        | 31
> > > > > > > +++++++++++
> > > > > > >  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> > > > > > > ++++++++++++++++++++++
> > > > > > >  2 files changed, 96 insertions(+)
> > > > > > > 
> > > > > > > diff --git
> > > > > > > a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > > > b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > > > index fd2ec9c53..b7b90b83e 100644
> > > > > > > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > > > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > > > > > @@ -34,6 +34,7 @@
> > > > > > >  /*
> > > > > > >   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
> > > > > > >   * Copyright (c) 1998 Doug Rabson
> > > > > > > + * Copyright (c) 2019 Intel Corporation
> > > > > > >   * All rights reserved.
> > > > > > >   */
> > > > > > > 
> > > > > > > @@ -46,6 +47,7 @@
> > > > > > > 
> > > > > > >  #include <stdint.h>
> > > > > > >  #include <rte_common.h>
> > > > > > > +#include <rte_compat.h>
> > > > > > >  #include <rte_atomic.h>
> > > > > > > 
> > > > > > >  /*------------------------- 64 bit atomic operations
> > > > > > > ------------------------ -*/ @@ -208,4 +210,33 @@ static
> > > > > > > inline void rte_atomic64_clear(rte_atomic64_t *v)
> > > > > > >  }
> > > > > > >  #endif
> > > > > > > 
> > > > > > > +static inline int __rte_experimental
> > > > > > __rte_always_inline?
> > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> > > > > > No need to declare the location volatile. Volatile doesn't do
> > > > > > what you think it does.
> > > > > > https://youtu.be/lkgszkPnV8g?t=1027
> > > > > > 
> > > > > I made this volatile to match the existing rte_atomicN_cmpset
> > > > > definitions, which presumably have a good reason for using the
> > > > > keyword. Maintainers, any input here?
> > > > > 
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > +		     rte_int128_t *exp,
> > > > > > I would declare 'exp' const as well and document that 'exp' is
> > > > > > not updated (with the old value) for a failure. The reason being
> > > > > > that
> > > > > > ARMv8.0/AArch64 cannot atomically read the old value without
> > > > > > also writing the location and that is bad for performance
> > > > > > (unnecessary writes leads to unnecessary contention and worse
> > > > > > scalability). And the user must anyway read the location (in the
> > > > > > start of the critical
> > > > > > section) using e.g. non-atomic 64-bit reads so there isn't
> > > > > > actually any requirement for an atomic 128-bit read of the location.
> > > > > > 
> > > > > Will change in v2.
> > > > > 
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > > 
> > > > > > >  rte_int128_t *src,
> > > > > > const rte_int128_t *src?
> > > > > Sure, I don't see any harm in using const.
> > > > > 
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > 
> > > > > > But why are we not passing 'exp' and 'src' by value? That works
> > > > > > great, even with structs. Passing by value simplifies the
> > > > > > compiler's life, especially if the call is inlined. Ask a compiler
> > > > > > developer.
> > > > > I ran objdump on the nb_stack code with both approaches, and
> > > > > pass-by-reference resulted in fewer overall x86_64 assembly ops.
> > > > > PBV: 100 ops for push, 97 ops for pop
> > > > > PBR: 92 ops for push, 84 ops for pop
> > > > OK I have never checked x86_64 code generation... I have good
> > > > experiences with ARM/AArch64, everything seems to be done using
> > > > registers. I am surprised there is a difference.
> > > > 
> > > > Did a quick check with lfring, passing 'src' (third param) by
> > > > reference and by value. No difference in code generation on x86_64.
> > > > 
> > > > But if you insist let's go with PBR.
> > > > 
> > > > > 
> > > > > 
> > > > > 
> > > > > (Using the in-progress v5 nb_stack code)
> > > > > 
> > > > > Another factor -- though much less compelling -- is that with
> > > > > pass-by- reference, the user can create a 16B structure and cast
> > > > > it to rte_int128_t when they call rte_atomic128_cmpset, whereas
> > > > > with pass-by-value they need to put that struct in a union with
> > rte_int128_t.
> > > 
> > > > 
> > > > Which is what I always do nowadays... Trying to use as few casts as
> > > > possible and lie to the compiler as seldom as possible. But I can
> > > > see the freedom provided by taking a pointer to something and cast
> > > > it it rte_int128_t ptr in the call to rte_atomic128_cmpset().
> > > > 
> > > > Would prefer a name that is more similar to __atomic_compare_exchange().
> > > > E.g.
> > > > rte_atomic128_compare_exchange() (or perhaps just
> > rte_atomic128_cmpxchg)?
> > > 
> > > > 
> > > > All the rte_atomicXX_cmpset() functions do not take any memory order
> > > > parameters.
> > > > From an Arm perspective, we are not happy with that.
> > > Since the function returns a boolean success value, isn't
> > > compare-and-set the appropriate term?
> > I was thinking of the memory ordering parameters that __atomic_xxx builtins
> > have and we want (from an Arm perspective).
> > 
> > GCC __atomic_compare_exchange also returns a boolean.
> > https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
> > bool __atomic_compare_exchange_n (type *ptr, type *expected, type desired,
> > bool weak, int success_memorder, int failure_memorder) bool
> > __atomic_compare_exchange (type *ptr, type *expected, type *desired, bool
> > weak, int success_memorder, int failure_memorder)
> > 
> > rte_atomic128_compare_exchange(rte_int128_t *dst, const rte_int128_t *exp,
> > const rte_int182_t *src, bool weak, int mo_success, int mo_failure);
> > 
> I assumed GCC chose the name "exchange" because the builtin modifies 'exp' on
> failure, but I could be wrong.
You are probably right. But exchange doesn't necessary mean that the old value
is returned, just that it is replaced with a different value?
However __atomic_exchange() (unconditionally) returns the old value so there is
some precedent for retunrning the old value. And it can be done, just not
guaranteed to be atomic. Perhaps there is a way to both eat and have the cake?

Re-use the prototype of __atomic_compare_exchange_16() so must return a value if
the comparison fails. But depending on the 'weak' flag, we for weak=false
require an atomic read which on ARMv8.0 must be accomplished by writing back the
old value (if the comparison fails). Or for weak=true we allow both spurious
LL/SC failures (this is what the weak parameter is for) and non-atomic read of
the old value (this is my 'frail' atomic compare and exchange in lfring).

The GCC documentation says this:
"If they are not equal, the operation is a read and the current contents of ptr
are written into expected."
We follow this but just don't guarantee an atomic read (for weak=true).

bool
rte_atomic128_compare_exchange(rte_int128_t *dst,
                               rte_int128_t *exp, //Note no const now
                               const rte_int128_t *src,
                               bool weak,
                               int mo_success,
                               int mo_failure);


>  I'm fine with either name. Anyone else have a preference?

> 
> <snip>
Honnappa Nagarahalli Feb. 4, 2019, 6:33 p.m. | #10
> >
> > On Mon, 2019-01-28 at 11:29 -0600, Gage Eads wrote:
> > > This operation can be used for non-blocking algorithms, such as a
> > > non-blocking stack or ring.
> > >
> > > Signed-off-by: Gage Eads <gage.eads@intel.com>
> > > ---
> > >  .../common/include/arch/x86/rte_atomic_64.h        | 31 +++++++++++
> > >  lib/librte_eal/common/include/generic/rte_atomic.h | 65
> > > ++++++++++++++++++++++
> > >  2 files changed, 96 insertions(+)
> > >
> > > diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > index fd2ec9c53..b7b90b83e 100644
> > > --- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > +++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
> > > @@ -34,6 +34,7 @@
> > >  /*
> > >   * Inspired from FreeBSD src/sys/amd64/include/atomic.h
> > >   * Copyright (c) 1998 Doug Rabson
> > > + * Copyright (c) 2019 Intel Corporation
> > >   * All rights reserved.
> > >   */
> > >
> > > @@ -46,6 +47,7 @@
> > >
> > >  #include <stdint.h>
> > >  #include <rte_common.h>
> > > +#include <rte_compat.h>
> > >  #include <rte_atomic.h>
> > >
> > >  /*------------------------- 64 bit atomic operations
> > > ------------------------ -*/ @@ -208,4 +210,33 @@ static inline void
> > > rte_atomic64_clear(rte_atomic64_t *v)
> > >  }
> > >  #endif
> > >
> > > +static inline int __rte_experimental
> > __rte_always_inline?
> >
> > > +rte_atomic128_cmpset(volatile rte_int128_t *dst,
> > No need to declare the location volatile. Volatile doesn't do what you
> > think it does.
> > https://youtu.be/lkgszkPnV8g?t=1027
> >
> 
> I made this volatile to match the existing rte_atomicN_cmpset definitions,
> which presumably have a good reason for using the keyword. Maintainers, any
> input here?
> 
> >
> > > +		     rte_int128_t *exp,
> > I would declare 'exp' const as well and document that 'exp' is not
> > updated (with the old value) for a failure. The reason being that
> > ARMv8.0/AArch64 cannot atomically read the old value without also
> > writing the location and that is bad for performance (unnecessary
> > writes leads to unnecessary contention and worse scalability). And the
> > user must anyway read the location (in the start of the critical
> > section) using e.g. non-atomic 64-bit reads so there isn't actually any
> requirement for an atomic 128-bit read of the location.
> >
> 
> Will change in v2.
> 
IMO, we should not change the definition of this API, because
1) This API will differ from __atomic_compare_exchange_n API. It will be a new API to learn for the users.
2) The definition in this patch will make it easy to replace this API call with __atomic_xxx API (whenever it supports 128b natively on all the platforms)
3) I do not see any documentation in [1] indicating that the 'read on failure' will be an atomic read.

[1] https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html

> > >  rte_int128_t *src,
> > const rte_int128_t *src?
> 
> Sure, I don't see any harm in using const.
> 
> >
> > But why are we not passing 'exp' and 'src' by value? That works great,
> > even with structs. Passing by value simplifies the compiler's life,
> > especially if the call is inlined. Ask a compiler developer.
> 
> I ran objdump on the nb_stack code with both approaches, and pass-by-
> reference resulted in fewer overall x86_64 assembly ops.
> PBV: 100 ops for push, 97 ops for pop
> PBR: 92 ops for push, 84 ops for pop
> 
> (Using the in-progress v5 nb_stack code)
> 
> Another factor -- though much less compelling -- is that with pass-by-reference,
> the user can create a 16B structure and cast it to rte_int128_t when they call
> rte_atomic128_cmpset, whereas with pass-by-value they need to put that
> struct in a union with rte_int128_t.
> 
> >
> > > +		     unsigned int weak,
> > > +		     enum rte_atomic_memmodel_t success,
> > > +		     enum rte_atomic_memmodel_t failure) {
> > > +	RTE_SET_USED(weak);
> > > +	RTE_SET_USED(success);
> > > +	RTE_SET_USED(failure);
> > > +	uint8_t res;
> > > +
> > > +	asm volatile (
> > > +		      MPLOCKED
> > > +		      "cmpxchg16b %[dst];"
> > > +		      " sete %[res]"
> > > +		      : [dst] "=m" (dst->val[0]),
> > > +			"=A" (exp->val[0]),
> > > +			[res] "=r" (res)
> > > +		      : "c" (src->val[1]),
> > > +			"b" (src->val[0]),
> > > +			"m" (dst->val[0]),
> > > +			"d" (exp->val[1]),
> > > +			"a" (exp->val[0])
> > > +		      : "memory");
> > > +
> > > +	return res;
> > > +}
> > > +
> > >  #endif /* _RTE_ATOMIC_X86_64_H_ */
> > > diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h
> > > b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > index b99ba4688..8d612d566 100644
> > > --- a/lib/librte_eal/common/include/generic/rte_atomic.h
> > > +++ b/lib/librte_eal/common/include/generic/rte_atomic.h
> > > @@ -14,6 +14,7 @@
> > >
> > >  #include <stdint.h>
> > >  #include <rte_common.h>
> > > +#include <rte_compat.h>
> > >
> > >  #ifdef __DOXYGEN__
> > >
> > > @@ -1082,4 +1083,68 @@ static inline void
> > > rte_atomic64_clear(rte_atomic64_t
> > > *v)
> > >  }
> > >  #endif
> > >
> > > +/*------------------------ 128 bit atomic operations
> > > +------------------------
> > > -*/
> > > +
> > > +/**
> > > + * 128-bit integer structure.
> > > + */
> > > +typedef struct {
> > > +	uint64_t val[2];
> > > +} __rte_aligned(16) rte_int128_t;
> > So we can't use __int128?
> >
> 
> I'll put it in a union with val[2], in case any implementations want to use it.
> 
> Thanks,
> Gage
> 
> [snip]

Patch

diff --git a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
index fd2ec9c53..b7b90b83e 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_atomic_64.h
@@ -34,6 +34,7 @@ 
 /*
  * Inspired from FreeBSD src/sys/amd64/include/atomic.h
  * Copyright (c) 1998 Doug Rabson
+ * Copyright (c) 2019 Intel Corporation
  * All rights reserved.
  */
 
@@ -46,6 +47,7 @@ 
 
 #include <stdint.h>
 #include <rte_common.h>
+#include <rte_compat.h>
 #include <rte_atomic.h>
 
 /*------------------------- 64 bit atomic operations -------------------------*/
@@ -208,4 +210,33 @@  static inline void rte_atomic64_clear(rte_atomic64_t *v)
 }
 #endif
 
+static inline int __rte_experimental
+rte_atomic128_cmpset(volatile rte_int128_t *dst,
+		     rte_int128_t *exp, rte_int128_t *src,
+		     unsigned int weak,
+		     enum rte_atomic_memmodel_t success,
+		     enum rte_atomic_memmodel_t failure)
+{
+	RTE_SET_USED(weak);
+	RTE_SET_USED(success);
+	RTE_SET_USED(failure);
+	uint8_t res;
+
+	asm volatile (
+		      MPLOCKED
+		      "cmpxchg16b %[dst];"
+		      " sete %[res]"
+		      : [dst] "=m" (dst->val[0]),
+			"=A" (exp->val[0]),
+			[res] "=r" (res)
+		      : "c" (src->val[1]),
+			"b" (src->val[0]),
+			"m" (dst->val[0]),
+			"d" (exp->val[1]),
+			"a" (exp->val[0])
+		      : "memory");
+
+	return res;
+}
+
 #endif /* _RTE_ATOMIC_X86_64_H_ */
diff --git a/lib/librte_eal/common/include/generic/rte_atomic.h b/lib/librte_eal/common/include/generic/rte_atomic.h
index b99ba4688..8d612d566 100644
--- a/lib/librte_eal/common/include/generic/rte_atomic.h
+++ b/lib/librte_eal/common/include/generic/rte_atomic.h
@@ -14,6 +14,7 @@ 
 
 #include <stdint.h>
 #include <rte_common.h>
+#include <rte_compat.h>
 
 #ifdef __DOXYGEN__
 
@@ -1082,4 +1083,68 @@  static inline void rte_atomic64_clear(rte_atomic64_t *v)
 }
 #endif
 
+/*------------------------ 128 bit atomic operations -------------------------*/
+
+/**
+ * 128-bit integer structure.
+ */
+typedef struct {
+	uint64_t val[2];
+} __rte_aligned(16) rte_int128_t;
+
+/**
+ * Memory consistency models used in atomic operations. These control the
+ * behavior of the operation with respect to memory barriers and
+ * thread synchronization.
+ *
+ * These directly match those in the C++11 standard; for details on their
+ * behavior, refer to the standard.
+ */
+enum rte_atomic_memmodel_t {
+	RTE_ATOMIC_RELAXED,
+	RTE_ATOMIC_CONSUME,
+	RTE_ATOMIC_ACQUIRE,
+	RTE_ATOMIC_RELEASE,
+	RTE_ATOMIC_ACQ_REL,
+	RTE_ATOMIC_SEQ_CST,
+};
+
+/* Only implemented on x86-64 currently. The ifdef prevents compilation from
+ * failing for architectures without a definition of this function.
+ */
+#if defined(RTE_ARCH_X86_64)
+/**
+ * An atomic compare and set function used by the mutex functions.
+ * (atomic) equivalent to:
+ *   if (*dst == exp)
+ *     *dst = src (all 128-bit words)
+ *
+ * @param dst
+ *   The destination into which the value will be written.
+ * @param exp
+ *   Pointer to the expected value. If the operation fails, this memory is
+ *   updated with the actual value.
+ * @param src
+ *   Pointer to the new value.
+ * @param weak
+ *   A value of true allows the comparison to spuriously fail. Implementations
+ *   may ignore this argument and only implement the strong variant.
+ * @param success
+ *   If successful, the operation's memory behavior conforms to this (or a
+ *   stronger) model.
+ * @param failure
+ *   If unsuccessful, the operation's memory behavior conforms to this (or a
+ *   stronger) model. This argument cannot be RTE_ATOMIC_RELEASE,
+ *   RTE_ATOMIC_ACQ_REL, or a stronger model than success.
+ * @return
+ *   Non-zero on success; 0 on failure.
+ */
+static inline int __rte_experimental
+rte_atomic128_cmpset(volatile rte_int128_t *dst,
+		     rte_int128_t *exp, rte_int128_t *src,
+		     unsigned int weak,
+		     enum rte_atomic_memmodel_t success,
+		     enum rte_atomic_memmodel_t failure);
+#endif
+
 #endif /* _RTE_ATOMIC_H_ */