[v5,4/4] eal/atomic: add wrapper for c11 atomic thread fence

Message ID: 1590483667-10318-5-git-send-email-phil.yang@arm.com (mailing list archive)
State: Superseded, archived
Delegated to: David Marchand
Series: generic rte atomic APIs deprecate proposal

Checks

Context               Check     Description
ci/checkpatch         success   coding style OK
ci/travis-robot       success   Travis build: passed
ci/Intel-compilation  success   Compilation OK

Commit Message

Phil Yang May 26, 2020, 9:01 a.m. UTC
  Provide a wrapper for the __atomic_thread_fence built-in to support an
optimized code path for the __ATOMIC_SEQ_CST memory order on x86 platforms.

Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
---
 lib/librte_eal/arm/include/rte_atomic_32.h  |  6 ++++++
 lib/librte_eal/arm/include/rte_atomic_64.h  |  6 ++++++
 lib/librte_eal/include/generic/rte_atomic.h |  6 ++++++
 lib/librte_eal/ppc/include/rte_atomic.h     |  6 ++++++
 lib/librte_eal/x86/include/rte_atomic.h     | 17 +++++++++++++++++
 5 files changed, 41 insertions(+)
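
As a usage illustration (not part of the patch itself): call sites that today
issue a full barrier directly would express the memory order through the new
wrapper, letting each architecture pick the cheapest implementation. A minimal
sketch of the classic store->load pattern; the names flag_self, flag_peer and
try_enter are illustrative, not from this series:

#include <rte_atomic.h>

static int flag_self, flag_peer;

static inline int
try_enter(void)
{
	/* Publish our flag, then check the peer's flag. The store->load
	 * ordering here requires a sequentially consistent fence. */
	__atomic_store_n(&flag_self, 1, __ATOMIC_RELAXED);
	/* On x86 this becomes rte_smp_mb(); elsewhere a SEQ_CST fence. */
	rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
	return __atomic_load_n(&flag_peer, __ATOMIC_RELAXED) == 0;
}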
  

Comments

Honnappa Nagarahalli June 29, 2020, 4:40 a.m. UTC | #1
<snip>

> diff --git a/lib/librte_eal/x86/include/rte_atomic.h
> b/lib/librte_eal/x86/include/rte_atomic.h
> index b9dcd30..bd256e7 100644
> --- a/lib/librte_eal/x86/include/rte_atomic.h
> +++ b/lib/librte_eal/x86/include/rte_atomic.h
> @@ -83,6 +83,23 @@ rte_smp_mb(void)
> 
>  #define rte_cio_rmb() rte_compiler_barrier()
> 
> +/**
> + * Synchronization fence between threads based on the specified
> + * memory order.
> + *
> + * On x86 the __atomic_thread_fence(__ATOMIC_SEQ_CST) generates
> + * full 'mfence' which is quite expensive. The optimized
> + * implementation of rte_smp_mb is used instead.
> + */
> +static __rte_always_inline void
> +rte_atomic_thread_fence(int mo)
> +{
> +	if (mo == __ATOMIC_SEQ_CST)
> +		rte_smp_mb();
> +	else
> +		__atomic_thread_fence(mo);
> +}
I think __ATOMIC_SEQ_CST needs to be used rarely. IMO, rte_atomic_thread_fence should be called only for the __ATOMIC_SEQ_CST memory order; for all other memory orders, __atomic_thread_fence can be used directly. This will help us stick to using the atomic built-ins in most of the cases.

Konstantin, is this ok for you?

> +
>  /*------------------------- 16 bit atomic operations -------------------------*/
> 
>  #ifndef RTE_FORCE_INTRINSICS
> --
> 2.7.4
  
Ananyev, Konstantin June 29, 2020, 10:09 a.m. UTC | #2
> <snip>
> 
> > +static __rte_always_inline void
> > +rte_atomic_thread_fence(int mo)
> > +{
> > +	if (mo == __ATOMIC_SEQ_CST)
> > +		rte_smp_mb();
> > +	else
> > +		__atomic_thread_fence(mo);
> > +}
> I think __ATOMIC_SEQ_CST needs to be used rarely. IMO, rte_atomic_thread_fence
> should be called only for the __ATOMIC_SEQ_CST memory order; for all other
> memory orders, __atomic_thread_fence can be used directly. This will help us
> stick to using the atomic built-ins in most of the cases.
> 
> Konstantin, is this ok for you?

My preference is to have one generic rte_atomic_thread_fence() for all cases
(I.E - current Phil implementation looks good to me).
I think it is more consistent approach and would help to avoid confusion.
Konstantin
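
To make the two conventions under discussion concrete (hypothetical call
sites, not from the series):

	/* Convention suggested above: wrapper only for SEQ_CST,
	 * built-in used directly for the weaker orders. */
	rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
	__atomic_thread_fence(__ATOMIC_RELEASE);

	/* Convention kept in the patch: one wrapper for every order. */
	rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
	rte_atomic_thread_fence(__ATOMIC_RELEASE);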
  
Ananyev, Konstantin June 29, 2020, 10:13 a.m. UTC | #3
> <snip>

Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
  
Honnappa Nagarahalli June 29, 2020, 2:37 p.m. UTC | #4
<snip>

> > > diff --git a/lib/librte_eal/x86/include/rte_atomic.h
> > > b/lib/librte_eal/x86/include/rte_atomic.h
> > > index b9dcd30..bd256e7 100644
> > > --- a/lib/librte_eal/x86/include/rte_atomic.h
> > > +++ b/lib/librte_eal/x86/include/rte_atomic.h
> > > @@ -83,6 +83,23 @@ rte_smp_mb(void)
> > >
> > >  #define rte_cio_rmb() rte_compiler_barrier()
> > >
> > > +/**
> > > + * Synchronization fence between threads based on the specified
> > > + * memory order.
> > > + *
> > > + * On x86 the __atomic_thread_fence(__ATOMIC_SEQ_CST) generates
> > > + * full 'mfence' which is quite expensive. The optimized
> > > + * implementation of rte_smp_mb is used instead.
> > > + */
> > > +static __rte_always_inline void
> > > +rte_atomic_thread_fence(int mo)
> > > +{
> > > +	if (mo == __ATOMIC_SEQ_CST)
> > > +		rte_smp_mb();
> > > +	else
> > > +		__atomic_thread_fence(mo);
> > > +}
> > I think __ATOMIC_SEQ_CST needs to be used rarely. IMO,
> > rte_atomic_thread_fence should be called only for __ATOMIC_SEQ_CST
> > memory order. For all others the __atomic_thread_fence can be used
> directly. This will help us to stick to using the atomic built-ins in most of the
> cases.
> >
> > Konstantin, is this ok for you?
> 
> My preference is to have one generic rte_atomic_thread_fence() for all cases
> (I.E - current Phil implementation looks good to me).
> I think it is more consistent approach and would help to avoid confusion.
Ok, I am fine. The script currently checks for __atomic_thread_fence(__ATOMIC_SEQ_CST); we will have to change it to check for __atomic_thread_fence.

> Konstantin
  

Patch

diff --git a/lib/librte_eal/arm/include/rte_atomic_32.h b/lib/librte_eal/arm/include/rte_atomic_32.h
index 7dc0d06..dbe7cc6 100644
--- a/lib/librte_eal/arm/include/rte_atomic_32.h
+++ b/lib/librte_eal/arm/include/rte_atomic_32.h
@@ -37,6 +37,12 @@  extern "C" {
 
 #define rte_cio_rmb() rte_rmb()
 
+static __rte_always_inline void
+rte_atomic_thread_fence(int mo)
+{
+	__atomic_thread_fence(mo);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/arm/include/rte_atomic_64.h b/lib/librte_eal/arm/include/rte_atomic_64.h
index 7b7099c..22ff8ec 100644
--- a/lib/librte_eal/arm/include/rte_atomic_64.h
+++ b/lib/librte_eal/arm/include/rte_atomic_64.h
@@ -41,6 +41,12 @@  extern "C" {
 
 #define rte_cio_rmb() asm volatile("dmb oshld" : : : "memory")
 
+static __rte_always_inline void
+rte_atomic_thread_fence(int mo)
+{
+	__atomic_thread_fence(mo);
+}
+
 /*------------------------ 128 bit atomic operations -------------------------*/
 
 #if defined(__ARM_FEATURE_ATOMICS) || defined(RTE_ARM_FEATURE_ATOMICS)
diff --git a/lib/librte_eal/include/generic/rte_atomic.h b/lib/librte_eal/include/generic/rte_atomic.h
index e6ab15a..5b941db 100644
--- a/lib/librte_eal/include/generic/rte_atomic.h
+++ b/lib/librte_eal/include/generic/rte_atomic.h
@@ -158,6 +158,12 @@  static inline void rte_cio_rmb(void);
 	asm volatile ("" : : : "memory");	\
 } while(0)
 
+/**
+ * Synchronization fence between threads based on the specified
+ * memory order.
+ */
+static inline void rte_atomic_thread_fence(int mo);
+
 /*------------------------- 16 bit atomic operations -------------------------*/
 
 /**
diff --git a/lib/librte_eal/ppc/include/rte_atomic.h b/lib/librte_eal/ppc/include/rte_atomic.h
index 7e3e131..91c5f30 100644
--- a/lib/librte_eal/ppc/include/rte_atomic.h
+++ b/lib/librte_eal/ppc/include/rte_atomic.h
@@ -40,6 +40,12 @@  extern "C" {
 
 #define rte_cio_rmb() rte_rmb()
 
+static __rte_always_inline void
+rte_atomic_thread_fence(int mo)
+{
+	__atomic_thread_fence(mo);
+}
+
 /*------------------------- 16 bit atomic operations -------------------------*/
 /* To be compatible with Power7, use GCC built-in functions for 16 bit
  * operations */
diff --git a/lib/librte_eal/x86/include/rte_atomic.h b/lib/librte_eal/x86/include/rte_atomic.h
index b9dcd30..bd256e7 100644
--- a/lib/librte_eal/x86/include/rte_atomic.h
+++ b/lib/librte_eal/x86/include/rte_atomic.h
@@ -83,6 +83,23 @@  rte_smp_mb(void)
 
 #define rte_cio_rmb() rte_compiler_barrier()
 
+/**
+ * Synchronization fence between threads based on the specified
+ * memory order.
+ *
+ * On x86 the __atomic_thread_fence(__ATOMIC_SEQ_CST) generates
+ * full 'mfence' which is quite expensive. The optimized
+ * implementation of rte_smp_mb is used instead.
+ */
+static __rte_always_inline void
+rte_atomic_thread_fence(int mo)
+{
+	if (mo == __ATOMIC_SEQ_CST)
+		rte_smp_mb();
+	else
+		__atomic_thread_fence(mo);
+}
+
 /*------------------------- 16 bit atomic operations -------------------------*/
 
 #ifndef RTE_FORCE_INTRINSICS
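
For reference, the x86 optimization in the last hunk relies on rte_smp_mb()
being implemented as a locked read-modify-write on a dummy stack slot, which
is a full barrier for normal (write-back) memory but typically cheaper than
'mfence'. A simplified stand-alone sketch of the idea (the real code lives in
lib/librte_eal/x86/include/rte_atomic.h and differs in details):

/* Simplified sketch, x86-64 only; illustrative, not the DPDK source.
 * A locked RMW to a dummy location orders all earlier loads/stores
 * against all later ones, like 'mfence' but usually faster. The -128
 * offset stays within the System V red zone below %rsp, so no live
 * data is touched. */
static inline void
demo_smp_mb(void)
{
	asm volatile("lock addl $0, -128(%%rsp)" ::: "memory");
}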