diff mbox series

[RFC,v2,1/2] rcu: add RCU library supporting QSBR mechanism

Message ID	20181222021420.5114-2-honnappa.nagarahalli@arm.com (mailing list archive)
State	Superseded, archived
Delegated to:	Thomas Monjalon
Headers	From: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com> To: dev@dpdk.org, konstantin.ananyev@intel.com, stephen@networkplumber.org, paulmck@linux.ibm.com, honnappa.nagarahalli@arm.com Cc: gavin.hu@arm.com, dharmik.thakkar@arm.com, nd@arm.com Date: Fri, 21 Dec 2018 20:14:19 -0600 Message-Id: <20181222021420.5114-2-honnappa.nagarahalli@arm.com> In-Reply-To: <20181222021420.5114-1-honnappa.nagarahalli@arm.com> References: <20181122033055.3431-1-honnappa.nagarahalli@arm.com> <20181222021420.5114-1-honnappa.nagarahalli@arm.com> Subject: [dpdk-dev] [RFC v2 1/2] rcu: add RCU library supporting QSBR mechanism Precedence: list Errors-To: dev-bounces@dpdk.org Sender: "dev" <dev-bounces@dpdk.org>
Series	rcu: add RCU library supporting QSBR mechanism \| [RFC,v2,0/2] rcu: add RCU library supporting QSBR mechanism [RFC,v2,1/2] rcu: add RCU library supporting QSBR mechanism [RFC,v2,2/2] test/rcu_qsbr: add API and functional tests

Checks

Context	Check	Description
ci/checkpatch	success	coding style OK
ci/Intel-compilation	success	Compilation OK

Commit Message

Honnappa Nagarahalli Dec. 22, 2018, 2:14 a.m. UTC

  Add RCU library supporting quiescent state based memory reclamation method.
This library helps identify the quiescent state of the reader threads so
that the writers can free the memory associated with the lock less data
structures.

Signed-off-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Reviewed-by: Steve Capper <steve.capper@arm.com>
Reviewed-by: Gavin Hu <gavin.hu@arm.com>
---
 config/common_base                 |   6 +
 lib/Makefile                       |   2 +
 lib/librte_rcu/Makefile            |  23 +++
 lib/librte_rcu/meson.build         |   5 +
 lib/librte_rcu/rte_rcu_qsbr.c      |  63 ++++++
 lib/librte_rcu/rte_rcu_qsbr.h      | 321 +++++++++++++++++++++++++++++
 lib/librte_rcu/rte_rcu_version.map |   8 +
 lib/meson.build                    |   2 +-
 mk/rte.app.mk                      |   1 +
 9 files changed, 430 insertions(+), 1 deletion(-)
 create mode 100644 lib/librte_rcu/Makefile
 create mode 100644 lib/librte_rcu/meson.build
 create mode 100644 lib/librte_rcu/rte_rcu_qsbr.c
 create mode 100644 lib/librte_rcu/rte_rcu_qsbr.h
 create mode 100644 lib/librte_rcu/rte_rcu_version.map

Comments

Ananyev, Konstantin Jan. 15, 2019, 11:39 a.m. UTC | #1

Hi Honnappa,

 
> Add RCU library supporting quiescent state based memory reclamation method.
> This library helps identify the quiescent state of the reader threads so
> that the writers can free the memory associated with the lock less data
> structures.
> 
> Signed-off-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Reviewed-by: Steve Capper <steve.capper@arm.com>
> Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> ---
...

> diff --git a/lib/librte_rcu/rte_rcu_qsbr.h b/lib/librte_rcu/rte_rcu_qsbr.h
> new file mode 100644
> index 000000000..c818e77fd
> --- /dev/null
> +++ b/lib/librte_rcu/rte_rcu_qsbr.h
> @@ -0,0 +1,321 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright (c) 2018 Arm Limited
> + */
> +
> +#ifndef _RTE_RCU_QSBR_H_
> +#define _RTE_RCU_QSBR_H_
> +
> +/**
> + * @file
> + * RTE Quiescent State Based Reclamation (QSBR)
> + *
> + * Quiescent State (QS) is any point in the thread execution
> + * where the thread does not hold a reference to shared memory.
> + * A critical section for a data structure can be a quiescent state for
> + * another data structure.
> + *
> + * This library provides the ability to identify quiescent state.
> + */
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <stdio.h>
> +#include <stdint.h>
> +#include <errno.h>
> +#include <rte_common.h>
> +#include <rte_memory.h>
> +#include <rte_lcore.h>
> +#include <rte_debug.h>
> +
> +/**< Maximum number of reader threads supported. */
> +#define RTE_RCU_MAX_THREADS 128
> +
> +#if !RTE_IS_POWER_OF_2(RTE_RCU_MAX_THREADS)
> +#error RTE_RCU_MAX_THREADS must be a power of 2
> +#endif
> +
> +/**< Number of array elements required for the bit-map */
> +#define RTE_QSBR_BIT_MAP_ELEMS (RTE_RCU_MAX_THREADS/(sizeof(uint64_t) * 8))
> +
> +/* Thread IDs are stored as a bitmap of 64b element array. Given thread id
> + * needs to be converted to index into the array and the id within
> + * the array element.
> + */
> +#define RTE_QSBR_THR_INDEX_SHIFT 6
> +#define RTE_QSBR_THR_ID_MASK 0x3f
> +
> +/* Worker thread counter */
> +struct rte_rcu_qsbr_cnt {
> +	uint64_t cnt; /**< Quiescent state counter. */
> +} __rte_cache_aligned;
> +
> +/**
> + * RTE thread Quiescent State structure.
> + */
> +struct rte_rcu_qsbr {
> +	uint64_t reg_thread_id[RTE_QSBR_BIT_MAP_ELEMS] __rte_cache_aligned;
> +	/**< Registered reader thread IDs - reader threads reporting
> +	 * on this QS variable represented in a bit map.
> +	 */
> +
> +	uint64_t token __rte_cache_aligned;
> +	/**< Counter to allow for multiple simultaneous QS queries */
> +
> +	struct rte_rcu_qsbr_cnt w[RTE_RCU_MAX_THREADS] __rte_cache_aligned;
> +	/**< QS counter for each reader thread, counts upto
> +	 * current value of token.

As I understand you decided to stick with neutral thread_id and let user define
what exactly thread_id is (lcore, syste, thread id, something else)?
If so, can you probably get rid of RTE_RCU_MAX_THREADS limitation?
I.E. struct rte_rcu_qsbr_cnt w[] and allow user at init time to define
max number of threads allowed.
Or something like:
#define RTE_RCU_QSBR_DEF(name, max_thread) struct name { \
  uint64_t reg_thread_id[ALIGN_CEIL(max_thread, 64)  >> 6]; \
   ...
   struct rte_rcu_qsbr_cnt w[max_thread]; \
}


> +	 */
> +} __rte_cache_aligned;
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Initialize a Quiescent State (QS) variable.
> + *
> + * @param v
> + *   QS variable
> + *
> + */
> +void __rte_experimental
> +rte_rcu_qsbr_init(struct rte_rcu_qsbr *v);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Add a reader thread, to the list of threads reporting their quiescent
> + * state on a QS variable.
> + *
> + * This is implemented as a lock-free function. It is multi-thread
> + * safe.
> + * Any reader thread that wants to report its quiescent state must
> + * call this API before calling rte_rcu_qsbr_update. This can be called
> + * during initialization or as part of the packet processing loop.
> + * Any ongoing QS queries may wait for the status from this registered
> + * thread.
> + *
> + * @param v
> + *   QS variable
> + * @param thread_id
> + *   Reader thread with this thread ID will report its quiescent state on
> + *   the QS variable.
> + */
> +static __rte_always_inline void __rte_experimental
> +rte_rcu_qsbr_register_thread(struct rte_rcu_qsbr *v, unsigned int thread_id)
> +{
> +	unsigned int i, id;
> +
> +	RTE_ASSERT(v == NULL || thread_id >= RTE_RCU_MAX_THREADS);
> +
> +	id = thread_id & RTE_QSBR_THR_ID_MASK;
> +	i = thread_id >> RTE_QSBR_THR_INDEX_SHIFT;
> +
> +	/* Worker thread has to count the quiescent states
> +	 * only from the current value of token.
> +	 * __atomic_store_n(cnt, __ATOMIC_RELAXED) is used to ensure
> +	 * 'cnt' (64b) is accessed atomically.
> +	 */
> +	__atomic_store_n(&v->w[thread_id].cnt,
> +		__atomic_load_n(&v->token, __ATOMIC_ACQUIRE),
> +		__ATOMIC_RELAXED);
> +
> +	/* Release the store to initial TQS count so that readers
> +	 * can use it immediately after this function returns.
> +	 */
> +	__atomic_fetch_or(&v->reg_thread_id[i], 1UL << id, __ATOMIC_RELEASE);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Remove a reader thread, from the list of threads reporting their
> + * quiescent state on a QS variable.
> + *
> + * This is implemented as a lock-free function. It is multi-thread safe.
> + * This API can be called from the reader threads during shutdown.
> + * Ongoing QS queries will stop waiting for the status from this
> + * unregistered reader thread.
> + *
> + * @param v
> + *   QS variable
> + * @param thread_id
> + *   Reader thread with this thread ID will stop reporting its quiescent
> + *   state on the QS variable.
> + */
> +static __rte_always_inline void __rte_experimental
> +rte_rcu_qsbr_unregister_thread(struct rte_rcu_qsbr *v, unsigned int thread_id)
> +{
> +	unsigned int i, id;
> +
> +	RTE_ASSERT(v == NULL || thread_id >= RTE_RCU_MAX_THREADS);
> +
> +	id = thread_id & RTE_QSBR_THR_ID_MASK;
> +	i = thread_id >> RTE_QSBR_THR_INDEX_SHIFT;
> +
> +	/* Make sure the removal of the thread from the list of
> +	 * reporting threads is visible before the thread
> +	 * does anything else.
> +	 */
> +	__atomic_fetch_and(&v->reg_thread_id[i],
> +				~(1UL << id), __ATOMIC_RELEASE);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Trigger the reader threads to report the quiescent state
> + * status.
> + *
> + * This is implemented as a lock-free function. It is multi-thread
> + * safe and can be called from worker threads.
> + *
> + * @param v
> + *   TQS variable
> + * @param n
> + *   Expected number of times the quiescent state is entered
> + * @param t
> + *   - If successful, this is the token for this call of the API.
> + *     This should be passed to rte_rcu_qsbr_check API.
> + */
> +static __rte_always_inline void __rte_experimental
> +rte_rcu_qsbr_start(struct rte_rcu_qsbr *v, unsigned int n, uint64_t *t)
> +{
> +	RTE_ASSERT(v == NULL || t == NULL);
> +
> +	/* This store release will ensure that changes to any data
> +	 * structure are visible to the workers before the token
> +	 * update is visible.
> +	 */
> +	*t = __atomic_add_fetch(&v->token, n, __ATOMIC_RELEASE);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Update quiescent state for a reader thread.
> + *
> + * This is implemented as a lock-free function. It is multi-thread safe.
> + * All the reader threads registered to report their quiescent state
> + * on the QS variable must call this API.
> + *
> + * @param v
> + *   QS variable
> + */
> +static __rte_always_inline void __rte_experimental
> +rte_rcu_qsbr_update(struct rte_rcu_qsbr *v, unsigned int thread_id)
> +{
> +	uint64_t t;
> +
> +	RTE_ASSERT(v == NULL || thread_id >= RTE_RCU_MAX_THREADS);
> +
> +	/* Load the token before the reader thread loads any other
> +	 * (lock-free) data structure. This ensures that updates
> +	 * to the data structures are visible if the update
> +	 * to token is visible.
> +	 */
> +	t = __atomic_load_n(&v->token, __ATOMIC_ACQUIRE);
> +
> +	/* Relaxed load/store on the counter is enough as we are
> +	 * reporting an already completed quiescent state.
> +	 * __atomic_load_n(cnt, __ATOMIC_RELAXED) is used as 'cnt' (64b)
> +	 * is accessed atomically.
> +	 * Copy the current token value. This will end grace period
> +	 * of multiple concurrent writers.
> +	 */
> +	if (__atomic_load_n(&v->w[thread_id].cnt, __ATOMIC_RELAXED) != t)
> +		__atomic_store_n(&v->w[thread_id].cnt, t, __ATOMIC_RELAXED);
> +}
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Checks if all the reader threads have entered the quiescent state
> + * 'n' number of times. 'n' is provided in rte_rcu_qsbr_start API.
> + *
> + * This is implemented as a lock-free function. It is multi-thread
> + * safe and can be called from the worker threads as well.
> + *
> + * @param v
> + *   QS variable
> + * @param t
> + *   Token returned by rte_rcu_qsbr_start API
> + * @param wait
> + *   If true, block till all the reader threads have completed entering
> + *   the quiescent state 'n' number of times
> + * @return
> + *   - 0 if all reader threads have NOT passed through specified number
> + *     of quiescent states.
> + *   - 1 if all reader threads have passed through specified number
> + *     of quiescent states.
> + */
> +static __rte_always_inline int __rte_experimental
> +rte_rcu_qsbr_check(struct rte_rcu_qsbr *v, uint64_t t, bool wait)
> +{
> +	uint32_t i, j, id;
> +	uint64_t bmap;
> +
> +	RTE_ASSERT(v == NULL);
> +
> +	i = 0;
> +	do {
> +		/* Load the current registered thread bit map before
> +		 * loading the reader thread quiescent state counters.
> +		 */
> +		bmap = __atomic_load_n(&v->reg_thread_id[i], __ATOMIC_ACQUIRE);
> +		id = i << RTE_QSBR_THR_INDEX_SHIFT;
> +
> +		while (bmap) {
> +			j = __builtin_ctzl(bmap);
> +
> +			/* __atomic_store_n(cnt, __ATOMIC_RELAXED)
> +			 * is used to ensure 'cnt' (64b) is accessed
> +			 * atomically.
> +			 */
> +			if (unlikely(__atomic_load_n(&v->w[id + j].cnt,
> +					__ATOMIC_RELAXED) < t)) {
> +				/* This thread is not in QS */
> +				if (!wait)
> +					return 0;
> +
> +				/* Loop till this thread enters QS */
> +				rte_pause();
> +				continue;
Shouldn't you re-read reg_thread_id[i] here?
Konstantin
> +			}
> +
> +			bmap &= ~(1UL << j);
> +		}
> +
> +		i++;
> +	} while (i < RTE_QSBR_BIT_MAP_ELEMS);
> +
> +	return 1;
> +}
> +

Honnappa Nagarahalli Jan. 15, 2019, 8:43 p.m. UTC | #2

> Hi Honnappa,
> 
> 
> > Add RCU library supporting quiescent state based memory reclamation
> method.
> > This library helps identify the quiescent state of the reader threads
> > so that the writers can free the memory associated with the lock less
> > data structures.
> >
> > Signed-off-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > Reviewed-by: Steve Capper <steve.capper@arm.com>
> > Reviewed-by: Gavin Hu <gavin.hu@arm.com>
> > ---
> ...
> 
> > diff --git a/lib/librte_rcu/rte_rcu_qsbr.h
> > b/lib/librte_rcu/rte_rcu_qsbr.h new file mode 100644 index
> > 000000000..c818e77fd
> > --- /dev/null
> > +++ b/lib/librte_rcu/rte_rcu_qsbr.h
> > @@ -0,0 +1,321 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright (c) 2018 Arm Limited
> > + */
> > +
> > +#ifndef _RTE_RCU_QSBR_H_
> > +#define _RTE_RCU_QSBR_H_
> > +
> > +/**
> > + * @file
> > + * RTE Quiescent State Based Reclamation (QSBR)
> > + *
> > + * Quiescent State (QS) is any point in the thread execution
> > + * where the thread does not hold a reference to shared memory.
> > + * A critical section for a data structure can be a quiescent state
> > +for
> > + * another data structure.
> > + *
> > + * This library provides the ability to identify quiescent state.
> > + */
> > +
> > +#ifdef __cplusplus
> > +extern "C" {
> > +#endif
> > +
> > +#include <stdio.h>
> > +#include <stdint.h>
> > +#include <errno.h>
> > +#include <rte_common.h>
> > +#include <rte_memory.h>
> > +#include <rte_lcore.h>
> > +#include <rte_debug.h>
> > +
> > +/**< Maximum number of reader threads supported. */ #define
> > +RTE_RCU_MAX_THREADS 128
> > +
> > +#if !RTE_IS_POWER_OF_2(RTE_RCU_MAX_THREADS)
> > +#error RTE_RCU_MAX_THREADS must be a power of 2 #endif
> > +
> > +/**< Number of array elements required for the bit-map */ #define
> > +RTE_QSBR_BIT_MAP_ELEMS (RTE_RCU_MAX_THREADS/(sizeof(uint64_t)
> * 8))
> > +
> > +/* Thread IDs are stored as a bitmap of 64b element array. Given
> > +thread id
> > + * needs to be converted to index into the array and the id within
> > + * the array element.
> > + */
> > +#define RTE_QSBR_THR_INDEX_SHIFT 6
> > +#define RTE_QSBR_THR_ID_MASK 0x3f
> > +
> > +/* Worker thread counter */
> > +struct rte_rcu_qsbr_cnt {
> > +	uint64_t cnt; /**< Quiescent state counter. */ }
> > +__rte_cache_aligned;
> > +
> > +/**
> > + * RTE thread Quiescent State structure.
> > + */
> > +struct rte_rcu_qsbr {
> > +	uint64_t reg_thread_id[RTE_QSBR_BIT_MAP_ELEMS]
> __rte_cache_aligned;
> > +	/**< Registered reader thread IDs - reader threads reporting
> > +	 * on this QS variable represented in a bit map.
> > +	 */
> > +
> > +	uint64_t token __rte_cache_aligned;
> > +	/**< Counter to allow for multiple simultaneous QS queries */
> > +
> > +	struct rte_rcu_qsbr_cnt w[RTE_RCU_MAX_THREADS]
> __rte_cache_aligned;
> > +	/**< QS counter for each reader thread, counts upto
> > +	 * current value of token.
> 
> As I understand you decided to stick with neutral thread_id and let user
> define what exactly thread_id is (lcore, syste, thread id, something else)?
Yes, that is correct. I will reply to the other thread to continue the discussion.

> If so, can you probably get rid of RTE_RCU_MAX_THREADS limitation?
I am not seeing this as a limitation. The user can change this if required. May be I should change it as follows:
#ifndef RTE_RCU_MAX_THREADS
#define RTE_RCU_MAX_THREADS 128
#endif

> I.E. struct rte_rcu_qsbr_cnt w[] and allow user at init time to define max
> number of threads allowed.
> Or something like:
> #define RTE_RCU_QSBR_DEF(name, max_thread) struct name { \
>   uint64_t reg_thread_id[ALIGN_CEIL(max_thread, 64)  >> 6]; \
>    ...
>    struct rte_rcu_qsbr_cnt w[max_thread]; \ }
I am trying to understand this. I am not following why 'name' is required? Would the user call 'RTE_RCU_QSBR_DEF' in the application header file?

> 
> 
> > +	 */
> > +} __rte_cache_aligned;
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * Initialize a Quiescent State (QS) variable.
> > + *
> > + * @param v
> > + *   QS variable
> > + *
> > + */
> > +void __rte_experimental
> > +rte_rcu_qsbr_init(struct rte_rcu_qsbr *v);
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * Add a reader thread, to the list of threads reporting their
> > +quiescent
> > + * state on a QS variable.
> > + *
> > + * This is implemented as a lock-free function. It is multi-thread
> > + * safe.
> > + * Any reader thread that wants to report its quiescent state must
> > + * call this API before calling rte_rcu_qsbr_update. This can be
> > +called
> > + * during initialization or as part of the packet processing loop.
> > + * Any ongoing QS queries may wait for the status from this
> > +registered
> > + * thread.
> > + *
> > + * @param v
> > + *   QS variable
> > + * @param thread_id
> > + *   Reader thread with this thread ID will report its quiescent state on
> > + *   the QS variable.
> > + */
> > +static __rte_always_inline void __rte_experimental
> > +rte_rcu_qsbr_register_thread(struct rte_rcu_qsbr *v, unsigned int
> > +thread_id) {
> > +	unsigned int i, id;
> > +
> > +	RTE_ASSERT(v == NULL || thread_id >= RTE_RCU_MAX_THREADS);
> > +
> > +	id = thread_id & RTE_QSBR_THR_ID_MASK;
> > +	i = thread_id >> RTE_QSBR_THR_INDEX_SHIFT;
> > +
> > +	/* Worker thread has to count the quiescent states
> > +	 * only from the current value of token.
> > +	 * __atomic_store_n(cnt, __ATOMIC_RELAXED) is used to ensure
> > +	 * 'cnt' (64b) is accessed atomically.
> > +	 */
> > +	__atomic_store_n(&v->w[thread_id].cnt,
> > +		__atomic_load_n(&v->token, __ATOMIC_ACQUIRE),
> > +		__ATOMIC_RELAXED);
> > +
> > +	/* Release the store to initial TQS count so that readers
> > +	 * can use it immediately after this function returns.
> > +	 */
> > +	__atomic_fetch_or(&v->reg_thread_id[i], 1UL << id,
> > +__ATOMIC_RELEASE); }
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * Remove a reader thread, from the list of threads reporting their
> > + * quiescent state on a QS variable.
> > + *
> > + * This is implemented as a lock-free function. It is multi-thread safe.
> > + * This API can be called from the reader threads during shutdown.
> > + * Ongoing QS queries will stop waiting for the status from this
> > + * unregistered reader thread.
> > + *
> > + * @param v
> > + *   QS variable
> > + * @param thread_id
> > + *   Reader thread with this thread ID will stop reporting its quiescent
> > + *   state on the QS variable.
> > + */
> > +static __rte_always_inline void __rte_experimental
> > +rte_rcu_qsbr_unregister_thread(struct rte_rcu_qsbr *v, unsigned int
> > +thread_id) {
> > +	unsigned int i, id;
> > +
> > +	RTE_ASSERT(v == NULL || thread_id >= RTE_RCU_MAX_THREADS);
> > +
> > +	id = thread_id & RTE_QSBR_THR_ID_MASK;
> > +	i = thread_id >> RTE_QSBR_THR_INDEX_SHIFT;
> > +
> > +	/* Make sure the removal of the thread from the list of
> > +	 * reporting threads is visible before the thread
> > +	 * does anything else.
> > +	 */
> > +	__atomic_fetch_and(&v->reg_thread_id[i],
> > +				~(1UL << id), __ATOMIC_RELEASE);
> > +}
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * Trigger the reader threads to report the quiescent state
> > + * status.
> > + *
> > + * This is implemented as a lock-free function. It is multi-thread
> > + * safe and can be called from worker threads.
> > + *
> > + * @param v
> > + *   TQS variable
> > + * @param n
> > + *   Expected number of times the quiescent state is entered
> > + * @param t
> > + *   - If successful, this is the token for this call of the API.
> > + *     This should be passed to rte_rcu_qsbr_check API.
> > + */
> > +static __rte_always_inline void __rte_experimental
> > +rte_rcu_qsbr_start(struct rte_rcu_qsbr *v, unsigned int n, uint64_t
> > +*t) {
> > +	RTE_ASSERT(v == NULL || t == NULL);
> > +
> > +	/* This store release will ensure that changes to any data
> > +	 * structure are visible to the workers before the token
> > +	 * update is visible.
> > +	 */
> > +	*t = __atomic_add_fetch(&v->token, n, __ATOMIC_RELEASE); }
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * Update quiescent state for a reader thread.
> > + *
> > + * This is implemented as a lock-free function. It is multi-thread safe.
> > + * All the reader threads registered to report their quiescent state
> > + * on the QS variable must call this API.
> > + *
> > + * @param v
> > + *   QS variable
> > + */
> > +static __rte_always_inline void __rte_experimental
> > +rte_rcu_qsbr_update(struct rte_rcu_qsbr *v, unsigned int thread_id) {
> > +	uint64_t t;
> > +
> > +	RTE_ASSERT(v == NULL || thread_id >= RTE_RCU_MAX_THREADS);
> > +
> > +	/* Load the token before the reader thread loads any other
> > +	 * (lock-free) data structure. This ensures that updates
> > +	 * to the data structures are visible if the update
> > +	 * to token is visible.
> > +	 */
> > +	t = __atomic_load_n(&v->token, __ATOMIC_ACQUIRE);
> > +
> > +	/* Relaxed load/store on the counter is enough as we are
> > +	 * reporting an already completed quiescent state.
> > +	 * __atomic_load_n(cnt, __ATOMIC_RELAXED) is used as 'cnt' (64b)
> > +	 * is accessed atomically.
> > +	 * Copy the current token value. This will end grace period
> > +	 * of multiple concurrent writers.
> > +	 */
> > +	if (__atomic_load_n(&v->w[thread_id].cnt, __ATOMIC_RELAXED) != t)
> > +		__atomic_store_n(&v->w[thread_id].cnt, t,
> __ATOMIC_RELAXED); }
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * Checks if all the reader threads have entered the quiescent state
> > + * 'n' number of times. 'n' is provided in rte_rcu_qsbr_start API.
> > + *
> > + * This is implemented as a lock-free function. It is multi-thread
> > + * safe and can be called from the worker threads as well.
> > + *
> > + * @param v
> > + *   QS variable
> > + * @param t
> > + *   Token returned by rte_rcu_qsbr_start API
> > + * @param wait
> > + *   If true, block till all the reader threads have completed entering
> > + *   the quiescent state 'n' number of times
> > + * @return
> > + *   - 0 if all reader threads have NOT passed through specified number
> > + *     of quiescent states.
> > + *   - 1 if all reader threads have passed through specified number
> > + *     of quiescent states.
> > + */
> > +static __rte_always_inline int __rte_experimental
> > +rte_rcu_qsbr_check(struct rte_rcu_qsbr *v, uint64_t t, bool wait) {
> > +	uint32_t i, j, id;
> > +	uint64_t bmap;
> > +
> > +	RTE_ASSERT(v == NULL);
> > +
> > +	i = 0;
> > +	do {
> > +		/* Load the current registered thread bit map before
> > +		 * loading the reader thread quiescent state counters.
> > +		 */
> > +		bmap = __atomic_load_n(&v->reg_thread_id[i],
> __ATOMIC_ACQUIRE);
> > +		id = i << RTE_QSBR_THR_INDEX_SHIFT;
> > +
> > +		while (bmap) {
> > +			j = __builtin_ctzl(bmap);
> > +
> > +			/* __atomic_store_n(cnt, __ATOMIC_RELAXED)
> > +			 * is used to ensure 'cnt' (64b) is accessed
> > +			 * atomically.
> > +			 */
> > +			if (unlikely(__atomic_load_n(&v->w[id + j].cnt,
> > +					__ATOMIC_RELAXED) < t)) {
> > +				/* This thread is not in QS */
> > +				if (!wait)
> > +					return 0;
> > +
> > +				/* Loop till this thread enters QS */
> > +				rte_pause();
> > +				continue;
> Shouldn't you re-read reg_thread_id[i] here?
> Konstantin
Yes, you are right. I will try to add a test case as well to address this.

> > +			}
> > +
> > +			bmap &= ~(1UL << j);
> > +		}
> > +
> > +		i++;
> > +	} while (i < RTE_QSBR_BIT_MAP_ELEMS);
> > +
> > +	return 1;
> > +}
> > +

Ananyev, Konstantin Jan. 16, 2019, 3:56 p.m. UTC | #3

> > ...
> >
> > > diff --git a/lib/librte_rcu/rte_rcu_qsbr.h
> > > b/lib/librte_rcu/rte_rcu_qsbr.h new file mode 100644 index
> > > 000000000..c818e77fd
> > > --- /dev/null
> > > +++ b/lib/librte_rcu/rte_rcu_qsbr.h
> > > @@ -0,0 +1,321 @@
> > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > + * Copyright (c) 2018 Arm Limited
> > > + */
> > > +
> > > +#ifndef _RTE_RCU_QSBR_H_
> > > +#define _RTE_RCU_QSBR_H_
> > > +
> > > +/**
> > > + * @file
> > > + * RTE Quiescent State Based Reclamation (QSBR)
> > > + *
> > > + * Quiescent State (QS) is any point in the thread execution
> > > + * where the thread does not hold a reference to shared memory.
> > > + * A critical section for a data structure can be a quiescent state
> > > +for
> > > + * another data structure.
> > > + *
> > > + * This library provides the ability to identify quiescent state.
> > > + */
> > > +
> > > +#ifdef __cplusplus
> > > +extern "C" {
> > > +#endif
> > > +
> > > +#include <stdio.h>
> > > +#include <stdint.h>
> > > +#include <errno.h>
> > > +#include <rte_common.h>
> > > +#include <rte_memory.h>
> > > +#include <rte_lcore.h>
> > > +#include <rte_debug.h>
> > > +
> > > +/**< Maximum number of reader threads supported. */ #define
> > > +RTE_RCU_MAX_THREADS 128
> > > +
> > > +#if !RTE_IS_POWER_OF_2(RTE_RCU_MAX_THREADS)
> > > +#error RTE_RCU_MAX_THREADS must be a power of 2 #endif
> > > +
> > > +/**< Number of array elements required for the bit-map */ #define
> > > +RTE_QSBR_BIT_MAP_ELEMS (RTE_RCU_MAX_THREADS/(sizeof(uint64_t)
> > * 8))
> > > +
> > > +/* Thread IDs are stored as a bitmap of 64b element array. Given
> > > +thread id
> > > + * needs to be converted to index into the array and the id within
> > > + * the array element.
> > > + */
> > > +#define RTE_QSBR_THR_INDEX_SHIFT 6
> > > +#define RTE_QSBR_THR_ID_MASK 0x3f
> > > +
> > > +/* Worker thread counter */
> > > +struct rte_rcu_qsbr_cnt {
> > > +	uint64_t cnt; /**< Quiescent state counter. */ }
> > > +__rte_cache_aligned;
> > > +
> > > +/**
> > > + * RTE thread Quiescent State structure.
> > > + */
> > > +struct rte_rcu_qsbr {
> > > +	uint64_t reg_thread_id[RTE_QSBR_BIT_MAP_ELEMS]
> > __rte_cache_aligned;
> > > +	/**< Registered reader thread IDs - reader threads reporting
> > > +	 * on this QS variable represented in a bit map.
> > > +	 */
> > > +
> > > +	uint64_t token __rte_cache_aligned;
> > > +	/**< Counter to allow for multiple simultaneous QS queries */
> > > +
> > > +	struct rte_rcu_qsbr_cnt w[RTE_RCU_MAX_THREADS]
> > __rte_cache_aligned;
> > > +	/**< QS counter for each reader thread, counts upto
> > > +	 * current value of token.
> >
> > As I understand you decided to stick with neutral thread_id and let user
> > define what exactly thread_id is (lcore, syste, thread id, something else)?
> Yes, that is correct. I will reply to the other thread to continue the discussion.
> 
> > If so, can you probably get rid of RTE_RCU_MAX_THREADS limitation?
> I am not seeing this as a limitation. The user can change this if required. May be I should change it as follows:
> #ifndef RTE_RCU_MAX_THREADS
> #define RTE_RCU_MAX_THREADS 128
> #endif

Yep, that's better, though it would still require user to rebuild the code
if he would like to increase total number of threads supported.
Though it seems relatively simply to extend current code to support
dynamic max thread num here (2 variable arrays plus shift value plus mask). 

> 
> > I.E. struct rte_rcu_qsbr_cnt w[] and allow user at init time to define max
> > number of threads allowed.
> > Or something like:
> > #define RTE_RCU_QSBR_DEF(name, max_thread) struct name { \
> >   uint64_t reg_thread_id[ALIGN_CEIL(max_thread, 64)  >> 6]; \
> >    ...
> >    struct rte_rcu_qsbr_cnt w[max_thread]; \ }
> I am trying to understand this. I am not following why 'name' is required? Would the user call 'RTE_RCU_QSBR_DEF' in the application
> header file?

My thought here was to allow user to define his own structures,
depending on the number of max threads he needs/wants:
RTE_RCU_QSBR_DEF(rte_rcu_qsbr_128, 128);
RTE_RCU_QSBR_DEF(rte_rcu_qsbr_64, 64);
...
Konstantin

Honnappa Nagarahalli Jan. 18, 2019, 6:48 a.m. UTC | #4

> 
> > > ...
> > >
> > > > diff --git a/lib/librte_rcu/rte_rcu_qsbr.h
> > > > b/lib/librte_rcu/rte_rcu_qsbr.h new file mode 100644 index
> > > > 000000000..c818e77fd
> > > > --- /dev/null
> > > > +++ b/lib/librte_rcu/rte_rcu_qsbr.h
> > > > @@ -0,0 +1,321 @@
> > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > + * Copyright (c) 2018 Arm Limited  */
> > > > +
> > > > +#ifndef _RTE_RCU_QSBR_H_
> > > > +#define _RTE_RCU_QSBR_H_
> > > > +
> > > > +/**
> > > > + * @file
> > > > + * RTE Quiescent State Based Reclamation (QSBR)
> > > > + *
> > > > + * Quiescent State (QS) is any point in the thread execution
> > > > + * where the thread does not hold a reference to shared memory.
> > > > + * A critical section for a data structure can be a quiescent
> > > > +state for
> > > > + * another data structure.
> > > > + *
> > > > + * This library provides the ability to identify quiescent state.
> > > > + */
> > > > +
> > > > +#ifdef __cplusplus
> > > > +extern "C" {
> > > > +#endif
> > > > +
> > > > +#include <stdio.h>
> > > > +#include <stdint.h>
> > > > +#include <errno.h>
> > > > +#include <rte_common.h>
> > > > +#include <rte_memory.h>
> > > > +#include <rte_lcore.h>
> > > > +#include <rte_debug.h>
> > > > +
> > > > +/**< Maximum number of reader threads supported. */ #define
> > > > +RTE_RCU_MAX_THREADS 128
> > > > +
> > > > +#if !RTE_IS_POWER_OF_2(RTE_RCU_MAX_THREADS)
> > > > +#error RTE_RCU_MAX_THREADS must be a power of 2 #endif
> > > > +
> > > > +/**< Number of array elements required for the bit-map */ #define
> > > > +RTE_QSBR_BIT_MAP_ELEMS
> (RTE_RCU_MAX_THREADS/(sizeof(uint64_t)
> > > * 8))
> > > > +
> > > > +/* Thread IDs are stored as a bitmap of 64b element array. Given
> > > > +thread id
> > > > + * needs to be converted to index into the array and the id
> > > > +within
> > > > + * the array element.
> > > > + */
> > > > +#define RTE_QSBR_THR_INDEX_SHIFT 6 #define
> RTE_QSBR_THR_ID_MASK
> > > > +0x3f
> > > > +
> > > > +/* Worker thread counter */
> > > > +struct rte_rcu_qsbr_cnt {
> > > > +	uint64_t cnt; /**< Quiescent state counter. */ }
> > > > +__rte_cache_aligned;
> > > > +
> > > > +/**
> > > > + * RTE thread Quiescent State structure.
> > > > + */
> > > > +struct rte_rcu_qsbr {
> > > > +	uint64_t reg_thread_id[RTE_QSBR_BIT_MAP_ELEMS]
> > > __rte_cache_aligned;
> > > > +	/**< Registered reader thread IDs - reader threads reporting
> > > > +	 * on this QS variable represented in a bit map.
> > > > +	 */
> > > > +
> > > > +	uint64_t token __rte_cache_aligned;
> > > > +	/**< Counter to allow for multiple simultaneous QS queries */
> > > > +
> > > > +	struct rte_rcu_qsbr_cnt w[RTE_RCU_MAX_THREADS]
> > > __rte_cache_aligned;
> > > > +	/**< QS counter for each reader thread, counts upto
> > > > +	 * current value of token.
> > >
> > > As I understand you decided to stick with neutral thread_id and let
> > > user define what exactly thread_id is (lcore, syste, thread id, something
> else)?
> > Yes, that is correct. I will reply to the other thread to continue the discussion.
> >
> > > If so, can you probably get rid of RTE_RCU_MAX_THREADS limitation?
> > I am not seeing this as a limitation. The user can change this if required. May
> be I should change it as follows:
> > #ifndef RTE_RCU_MAX_THREADS
> > #define RTE_RCU_MAX_THREADS 128
> > #endif
> 
> Yep, that's better, though it would still require user to rebuild the code if he
> would like to increase total number of threads supported.
Agree

> Though it seems relatively simply to extend current code to support dynamic
> max thread num here (2 variable arrays plus shift value plus mask).
Agree, supporting dynamic 'max thread num' is simple. But this means memory needs to be allocated to the arrays. The API 'rte_rcu_qsbr_init' has to take max thread num as the parameter. We also have to introduce another API to free this memory. This will become very similar to alloc/free APIs I had in the v1.
I hope I am following you well, please correct me if not.

> 
> >
> > > I.E. struct rte_rcu_qsbr_cnt w[] and allow user at init time to
> > > define max number of threads allowed.
> > > Or something like:
> > > #define RTE_RCU_QSBR_DEF(name, max_thread) struct name { \
> > >   uint64_t reg_thread_id[ALIGN_CEIL(max_thread, 64)  >> 6]; \
> > >    ...
> > >    struct rte_rcu_qsbr_cnt w[max_thread]; \ }
> > I am trying to understand this. I am not following why 'name' is
> > required? Would the user call 'RTE_RCU_QSBR_DEF' in the application
> header file?
> 
> My thought here was to allow user to define his own structures, depending on
> the number of max threads he needs/wants:
> RTE_RCU_QSBR_DEF(rte_rcu_qsbr_128, 128);
> RTE_RCU_QSBR_DEF(rte_rcu_qsbr_64, 64); ...
Thank you for the clarification, I follow you now. However, it will not solve the problem of dynamic max thread num. Changes to the max number of threads will require recompilation.

> Konstantin

Ananyev, Konstantin Jan. 18, 2019, 12:14 p.m. UTC | #5

> > > > > diff --git a/lib/librte_rcu/rte_rcu_qsbr.h
> > > > > b/lib/librte_rcu/rte_rcu_qsbr.h new file mode 100644 index
> > > > > 000000000..c818e77fd
> > > > > --- /dev/null
> > > > > +++ b/lib/librte_rcu/rte_rcu_qsbr.h
> > > > > @@ -0,0 +1,321 @@
> > > > > +/* SPDX-License-Identifier: BSD-3-Clause
> > > > > + * Copyright (c) 2018 Arm Limited  */
> > > > > +
> > > > > +#ifndef _RTE_RCU_QSBR_H_
> > > > > +#define _RTE_RCU_QSBR_H_
> > > > > +
> > > > > +/**
> > > > > + * @file
> > > > > + * RTE Quiescent State Based Reclamation (QSBR)
> > > > > + *
> > > > > + * Quiescent State (QS) is any point in the thread execution
> > > > > + * where the thread does not hold a reference to shared memory.
> > > > > + * A critical section for a data structure can be a quiescent
> > > > > +state for
> > > > > + * another data structure.
> > > > > + *
> > > > > + * This library provides the ability to identify quiescent state.
> > > > > + */
> > > > > +
> > > > > +#ifdef __cplusplus
> > > > > +extern "C" {
> > > > > +#endif
> > > > > +
> > > > > +#include <stdio.h>
> > > > > +#include <stdint.h>
> > > > > +#include <errno.h>
> > > > > +#include <rte_common.h>
> > > > > +#include <rte_memory.h>
> > > > > +#include <rte_lcore.h>
> > > > > +#include <rte_debug.h>
> > > > > +
> > > > > +/**< Maximum number of reader threads supported. */ #define
> > > > > +RTE_RCU_MAX_THREADS 128
> > > > > +
> > > > > +#if !RTE_IS_POWER_OF_2(RTE_RCU_MAX_THREADS)
> > > > > +#error RTE_RCU_MAX_THREADS must be a power of 2 #endif
> > > > > +
> > > > > +/**< Number of array elements required for the bit-map */ #define
> > > > > +RTE_QSBR_BIT_MAP_ELEMS
> > (RTE_RCU_MAX_THREADS/(sizeof(uint64_t)
> > > > * 8))
> > > > > +
> > > > > +/* Thread IDs are stored as a bitmap of 64b element array. Given
> > > > > +thread id
> > > > > + * needs to be converted to index into the array and the id
> > > > > +within
> > > > > + * the array element.
> > > > > + */
> > > > > +#define RTE_QSBR_THR_INDEX_SHIFT 6 #define
> > RTE_QSBR_THR_ID_MASK
> > > > > +0x3f
> > > > > +
> > > > > +/* Worker thread counter */
> > > > > +struct rte_rcu_qsbr_cnt {
> > > > > +	uint64_t cnt; /**< Quiescent state counter. */ }
> > > > > +__rte_cache_aligned;
> > > > > +
> > > > > +/**
> > > > > + * RTE thread Quiescent State structure.
> > > > > + */
> > > > > +struct rte_rcu_qsbr {
> > > > > +	uint64_t reg_thread_id[RTE_QSBR_BIT_MAP_ELEMS]
> > > > __rte_cache_aligned;
> > > > > +	/**< Registered reader thread IDs - reader threads reporting
> > > > > +	 * on this QS variable represented in a bit map.
> > > > > +	 */
> > > > > +
> > > > > +	uint64_t token __rte_cache_aligned;
> > > > > +	/**< Counter to allow for multiple simultaneous QS queries */
> > > > > +
> > > > > +	struct rte_rcu_qsbr_cnt w[RTE_RCU_MAX_THREADS]
> > > > __rte_cache_aligned;
> > > > > +	/**< QS counter for each reader thread, counts upto
> > > > > +	 * current value of token.
> > > >
> > > > As I understand you decided to stick with neutral thread_id and let
> > > > user define what exactly thread_id is (lcore, syste, thread id, something
> > else)?
> > > Yes, that is correct. I will reply to the other thread to continue the discussion.
> > >
> > > > If so, can you probably get rid of RTE_RCU_MAX_THREADS limitation?
> > > I am not seeing this as a limitation. The user can change this if required. May
> > be I should change it as follows:
> > > #ifndef RTE_RCU_MAX_THREADS
> > > #define RTE_RCU_MAX_THREADS 128
> > > #endif
> >
> > Yep, that's better, though it would still require user to rebuild the code if he
> > would like to increase total number of threads supported.
> Agree
> 
> > Though it seems relatively simply to extend current code to support dynamic
> > max thread num here (2 variable arrays plus shift value plus mask).
> Agree, supporting dynamic 'max thread num' is simple. But this means memory needs to be allocated to the arrays. The API
> 'rte_rcu_qsbr_init' has to take max thread num as the parameter. We also have to introduce another API to free this memory. This will
> become very similar to alloc/free APIs I had in the v1.
> I hope I am following you well, please correct me if not.

I think we can still leave alloc/free tasks to the user.
We probabply just need extra function rte_rcu_qsbr_size(uint32_ max_threads)
to help user calculate required size.
rte_rcu_qsbr_init() might take as an additional parameter 'size' to make checks.
Thought about something like that:

size_t sz = rte_rcu_qsbr_size(max_threads);
struct rte_rcu_qsbr *qsbr = alloc_aligned(CACHE_LINE, sz);
rte_rcu_qsbr_init(qsbr, max_threads, sz);
...

Konstantin

> 
> >
> > >
> > > > I.E. struct rte_rcu_qsbr_cnt w[] and allow user at init time to
> > > > define max number of threads allowed.
> > > > Or something like:
> > > > #define RTE_RCU_QSBR_DEF(name, max_thread) struct name { \
> > > >   uint64_t reg_thread_id[ALIGN_CEIL(max_thread, 64)  >> 6]; \
> > > >    ...
> > > >    struct rte_rcu_qsbr_cnt w[max_thread]; \ }
> > > I am trying to understand this. I am not following why 'name' is
> > > required? Would the user call 'RTE_RCU_QSBR_DEF' in the application
> > header file?
> >
> > My thought here was to allow user to define his own structures, depending on
> > the number of max threads he needs/wants:
> > RTE_RCU_QSBR_DEF(rte_rcu_qsbr_128, 128);
> > RTE_RCU_QSBR_DEF(rte_rcu_qsbr_64, 64); ...
> Thank you for the clarification, I follow you now. However, it will not solve the problem of dynamic max thread num. Changes to the max
> number of threads will require recompilation.
> 
> > Konstantin

Honnappa Nagarahalli Jan. 24, 2019, 5:15 p.m. UTC | #6

<snip>

> > > > > > +/**
> > > > > > + * RTE thread Quiescent State structure.
> > > > > > + */
> > > > > > +struct rte_rcu_qsbr {
> > > > > > +	uint64_t reg_thread_id[RTE_QSBR_BIT_MAP_ELEMS]
> > > > > __rte_cache_aligned;
> > > > > > +	/**< Registered reader thread IDs - reader threads reporting
> > > > > > +	 * on this QS variable represented in a bit map.
> > > > > > +	 */
> > > > > > +
> > > > > > +	uint64_t token __rte_cache_aligned;
> > > > > > +	/**< Counter to allow for multiple simultaneous QS queries
> > > > > > +*/
> > > > > > +
> > > > > > +	struct rte_rcu_qsbr_cnt w[RTE_RCU_MAX_THREADS]
> > > > > __rte_cache_aligned;
> > > > > > +	/**< QS counter for each reader thread, counts upto
> > > > > > +	 * current value of token.
> > > > >
> > > > > As I understand you decided to stick with neutral thread_id and
> > > > > let user define what exactly thread_id is (lcore, syste, thread
> > > > > id, something
> > > else)?
> > > > Yes, that is correct. I will reply to the other thread to continue the
> discussion.
> > > >
> > > > > If so, can you probably get rid of RTE_RCU_MAX_THREADS limitation?
> > > > I am not seeing this as a limitation. The user can change this if
> > > > required. May
> > > be I should change it as follows:
> > > > #ifndef RTE_RCU_MAX_THREADS
> > > > #define RTE_RCU_MAX_THREADS 128
> > > > #endif
> > >
> > > Yep, that's better, though it would still require user to rebuild
> > > the code if he would like to increase total number of threads supported.
> > Agree
> >
> > > Though it seems relatively simply to extend current code to support
> > > dynamic max thread num here (2 variable arrays plus shift value plus
> mask).
> > Agree, supporting dynamic 'max thread num' is simple. But this means
> > memory needs to be allocated to the arrays. The API
> > 'rte_rcu_qsbr_init' has to take max thread num as the parameter. We also
> have to introduce another API to free this memory. This will become very
> similar to alloc/free APIs I had in the v1.
> > I hope I am following you well, please correct me if not.
> 
> I think we can still leave alloc/free tasks to the user.
> We probabply just need extra function rte_rcu_qsbr_size(uint32_
> max_threads) to help user calculate required size.
> rte_rcu_qsbr_init() might take as an additional parameter 'size' to make
> checks.
The size is returned by an API provided by the library. Why does it need to be validated again? If 'size' is required for rte_rcu_qsbr_init, it could calculate it again.

> Thought about something like that:
> 
> size_t sz = rte_rcu_qsbr_size(max_threads); struct rte_rcu_qsbr *qsbr =
> alloc_aligned(CACHE_LINE, sz); rte_rcu_qsbr_init(qsbr, max_threads, sz); ...
> 
Do you see any advantage for allowing the user to allocate the memory?
This approach requires the user to call 3 APIs (including memory allocation). These 3 can be abstracted in a rte_rcu_qsbr_alloc API, user has to call just 1 API.

> Konstantin
> 
> >
> > >
> > > >
> > > > > I.E. struct rte_rcu_qsbr_cnt w[] and allow user at init time to
> > > > > define max number of threads allowed.
> > > > > Or something like:
> > > > > #define RTE_RCU_QSBR_DEF(name, max_thread) struct name { \
> > > > >   uint64_t reg_thread_id[ALIGN_CEIL(max_thread, 64)  >> 6]; \
> > > > >    ...
> > > > >    struct rte_rcu_qsbr_cnt w[max_thread]; \ }
> > > > I am trying to understand this. I am not following why 'name' is
> > > > required? Would the user call 'RTE_RCU_QSBR_DEF' in the
> > > > application
> > > header file?
> > >
> > > My thought here was to allow user to define his own structures,
> > > depending on the number of max threads he needs/wants:
> > > RTE_RCU_QSBR_DEF(rte_rcu_qsbr_128, 128);
> > > RTE_RCU_QSBR_DEF(rte_rcu_qsbr_64, 64); ...
> > Thank you for the clarification, I follow you now. However, it will
> > not solve the problem of dynamic max thread num. Changes to the max
> number of threads will require recompilation.
> >
> > > Konstantin

Ananyev, Konstantin Jan. 24, 2019, 6:05 p.m. UTC | #7

> <snip>
> 
> > > > > > > +/**
> > > > > > > + * RTE thread Quiescent State structure.
> > > > > > > + */
> > > > > > > +struct rte_rcu_qsbr {
> > > > > > > +	uint64_t reg_thread_id[RTE_QSBR_BIT_MAP_ELEMS]
> > > > > > __rte_cache_aligned;
> > > > > > > +	/**< Registered reader thread IDs - reader threads reporting
> > > > > > > +	 * on this QS variable represented in a bit map.
> > > > > > > +	 */
> > > > > > > +
> > > > > > > +	uint64_t token __rte_cache_aligned;
> > > > > > > +	/**< Counter to allow for multiple simultaneous QS queries
> > > > > > > +*/
> > > > > > > +
> > > > > > > +	struct rte_rcu_qsbr_cnt w[RTE_RCU_MAX_THREADS]
> > > > > > __rte_cache_aligned;
> > > > > > > +	/**< QS counter for each reader thread, counts upto
> > > > > > > +	 * current value of token.
> > > > > >
> > > > > > As I understand you decided to stick with neutral thread_id and
> > > > > > let user define what exactly thread_id is (lcore, syste, thread
> > > > > > id, something
> > > > else)?
> > > > > Yes, that is correct. I will reply to the other thread to continue the
> > discussion.
> > > > >
> > > > > > If so, can you probably get rid of RTE_RCU_MAX_THREADS limitation?
> > > > > I am not seeing this as a limitation. The user can change this if
> > > > > required. May
> > > > be I should change it as follows:
> > > > > #ifndef RTE_RCU_MAX_THREADS
> > > > > #define RTE_RCU_MAX_THREADS 128
> > > > > #endif
> > > >
> > > > Yep, that's better, though it would still require user to rebuild
> > > > the code if he would like to increase total number of threads supported.
> > > Agree
> > >
> > > > Though it seems relatively simply to extend current code to support
> > > > dynamic max thread num here (2 variable arrays plus shift value plus
> > mask).
> > > Agree, supporting dynamic 'max thread num' is simple. But this means
> > > memory needs to be allocated to the arrays. The API
> > > 'rte_rcu_qsbr_init' has to take max thread num as the parameter. We also
> > have to introduce another API to free this memory. This will become very
> > similar to alloc/free APIs I had in the v1.
> > > I hope I am following you well, please correct me if not.
> >
> > I think we can still leave alloc/free tasks to the user.
> > We probabply just need extra function rte_rcu_qsbr_size(uint32_
> > max_threads) to help user calculate required size.
> > rte_rcu_qsbr_init() might take as an additional parameter 'size' to make
> > checks.
> The size is returned by an API provided by the library. Why does it need to be validated again? If 'size' is required for rte_rcu_qsbr_init, it
> could calculate it again.

Just as extra-safety check.
I don't have strong opinion here - if you think it is overkill, let's drop it.


> 
> > Thought about something like that:
> >
> > size_t sz = rte_rcu_qsbr_size(max_threads); struct rte_rcu_qsbr *qsbr =
> > alloc_aligned(CACHE_LINE, sz); rte_rcu_qsbr_init(qsbr, max_threads, sz); ...
> >
> Do you see any advantage for allowing the user to allocate the memory?
So user can choose where to allocate the memory (eal malloc, normal malloc, stack, something else).
Again user might decide to make rcu part of some complex data structure - 
in that case he probably would like to allocate one big chunk of memory at once and then
provide part of it for rcu.
Or some other usage scenario that I can't predict.

> This approach requires the user to call 3 APIs (including memory allocation). These 3 can be abstracted in a rte_rcu_qsbr_alloc API, user has
> to call just 1 API.
> 
> > Konstantin
> >
> > >
> > > >
> > > > >
> > > > > > I.E. struct rte_rcu_qsbr_cnt w[] and allow user at init time to
> > > > > > define max number of threads allowed.
> > > > > > Or something like:
> > > > > > #define RTE_RCU_QSBR_DEF(name, max_thread) struct name { \
> > > > > >   uint64_t reg_thread_id[ALIGN_CEIL(max_thread, 64)  >> 6]; \
> > > > > >    ...
> > > > > >    struct rte_rcu_qsbr_cnt w[max_thread]; \ }
> > > > > I am trying to understand this. I am not following why 'name' is
> > > > > required? Would the user call 'RTE_RCU_QSBR_DEF' in the
> > > > > application
> > > > header file?
> > > >
> > > > My thought here was to allow user to define his own structures,
> > > > depending on the number of max threads he needs/wants:
> > > > RTE_RCU_QSBR_DEF(rte_rcu_qsbr_128, 128);
> > > > RTE_RCU_QSBR_DEF(rte_rcu_qsbr_64, 64); ...
> > > Thank you for the clarification, I follow you now. However, it will
> > > not solve the problem of dynamic max thread num. Changes to the max
> > number of threads will require recompilation.
> > >
> > > > Konstantin

Honnappa Nagarahalli Feb. 22, 2019, 7:07 a.m. UTC | #8

> > <snip>
> >
> > > > > > > > +/**
> > > > > > > > + * RTE thread Quiescent State structure.
> > > > > > > > + */
> > > > > > > > +struct rte_rcu_qsbr {
> > > > > > > > +	uint64_t reg_thread_id[RTE_QSBR_BIT_MAP_ELEMS]
> > > > > > > __rte_cache_aligned;
> > > > > > > > +	/**< Registered reader thread IDs - reader threads reporting
> > > > > > > > +	 * on this QS variable represented in a bit map.
> > > > > > > > +	 */
> > > > > > > > +
> > > > > > > > +	uint64_t token __rte_cache_aligned;
> > > > > > > > +	/**< Counter to allow for multiple simultaneous QS
> > > > > > > > +queries */
> > > > > > > > +
> > > > > > > > +	struct rte_rcu_qsbr_cnt w[RTE_RCU_MAX_THREADS]
> > > > > > > __rte_cache_aligned;
> > > > > > > > +	/**< QS counter for each reader thread, counts upto
> > > > > > > > +	 * current value of token.
> > > > > > >
> > > > > > > As I understand you decided to stick with neutral thread_id
> > > > > > > and let user define what exactly thread_id is (lcore, syste,
> > > > > > > thread id, something
> > > > > else)?
> > > > > > Yes, that is correct. I will reply to the other thread to
> > > > > > continue the
> > > discussion.
> > > > > >
> > > > > > > If so, can you probably get rid of RTE_RCU_MAX_THREADS
> limitation?
> > > > > > I am not seeing this as a limitation. The user can change this
> > > > > > if required. May
> > > > > be I should change it as follows:
> > > > > > #ifndef RTE_RCU_MAX_THREADS
> > > > > > #define RTE_RCU_MAX_THREADS 128 #endif
> > > > >
> > > > > Yep, that's better, though it would still require user to
> > > > > rebuild the code if he would like to increase total number of threads
> supported.
> > > > Agree
> > > >
> > > > > Though it seems relatively simply to extend current code to
> > > > > support dynamic max thread num here (2 variable arrays plus
> > > > > shift value plus
> > > mask).
> > > > Agree, supporting dynamic 'max thread num' is simple. But this
> > > > means memory needs to be allocated to the arrays. The API
> > > > 'rte_rcu_qsbr_init' has to take max thread num as the parameter.
> > > > We also
> > > have to introduce another API to free this memory. This will become
> > > very similar to alloc/free APIs I had in the v1.
> > > > I hope I am following you well, please correct me if not.
> > >
> > > I think we can still leave alloc/free tasks to the user.
> > > We probabply just need extra function rte_rcu_qsbr_size(uint32_
> > > max_threads) to help user calculate required size.
> > > rte_rcu_qsbr_init() might take as an additional parameter 'size' to
> > > make checks.
> > The size is returned by an API provided by the library. Why does it
> > need to be validated again? If 'size' is required for rte_rcu_qsbr_init, it
> could calculate it again.
> 
> Just as extra-safety check.
> I don't have strong opinion here - if you think it is overkill, let's drop it.
> 
> 
> >
> > > Thought about something like that:
> > >
> > > size_t sz = rte_rcu_qsbr_size(max_threads); struct rte_rcu_qsbr
> > > *qsbr = alloc_aligned(CACHE_LINE, sz); rte_rcu_qsbr_init(qsbr,
> max_threads, sz); ...
> > >
> > Do you see any advantage for allowing the user to allocate the memory?
> So user can choose where to allocate the memory (eal malloc, normal malloc,
> stack, something else).
> Again user might decide to make rcu part of some complex data structure - in
> that case he probably would like to allocate one big chunk of memory at once
> and then provide part of it for rcu.
> Or some other usage scenario that I can't predict.
> 

I made this change and added performance tests similar to liburcu. With the dynamic memory allocation change the performance of rte_rcu_qsbr_update comes down by 42% - 45% and that of rte_rcu_qsbr_check also comes down by 133% on Arm platform. On x86 (E5-2660 v4 @ 2.00GHz), the results are mixed. rte_rcu_qsbr_update comes down by 15%, but that of rte_rcu_qsbr_check improves.

On the Arm platform, the issue seems to be due to address calculation that needs to happen at run time. If I fix the reg_thread_id array size, I am getting back/improving the performance both for Arm and x86. What this means is, we will still have a max thread limitation, but it will be high - 512 (1 cache line). We could make this 1024 (2 cache lines). However, per thread counter data size will depend on the 'max thread' provided by the user. I think this solution serves your requirement (though with an acceptable constraint not affecting the near future), please let me know what you think.

These changes and the 3 variants of the implementation are present in RFC v3 [1], in case you want to run these tests.
1/5, 2/5 - same as RFC v2 + 1 bug fixed
3/5 - Addition of rte_rcu_qsbr_get_memsize. Memory size for register thread bitmap array as well as per thread counter data is calculated based on max_threads parameter
4/5 - Test cases are modified to use the new API
5/5 - Size of register thread bitmap array is fixed to hold 512 thread IDs. However, the per thread counter data is calculated based on max_threads parameter.

If you do not want to run the tests, you can just look at 3/5 and 5/5.

[1] http://patchwork.dpdk.org/cover/50431/

> > This approach requires the user to call 3 APIs (including memory
> > allocation). These 3 can be abstracted in a rte_rcu_qsbr_alloc API, user has
> to call just 1 API.
> >
> > > Konstantin
> > >

<snip>

diff mbox series

Patch

diff --git a/config/common_base b/config/common_base
index d12ae98bc..e148549d8 100644
--- a/config/common_base
+++ b/config/common_base
@@ -792,6 +792,12 @@  CONFIG_RTE_LIBRTE_LATENCY_STATS=y
 #
 CONFIG_RTE_LIBRTE_TELEMETRY=n
 
+#
+# Compile librte_rcu
+#
+CONFIG_RTE_LIBRTE_RCU=y
+CONFIG_RTE_LIBRTE_RCU_DEBUG=n
+
 #
 # Compile librte_lpm
 #
diff --git a/lib/Makefile b/lib/Makefile
index b7370ef97..b674662c8 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -108,6 +108,8 @@  DIRS-$(CONFIG_RTE_LIBRTE_BPF) += librte_bpf
 DEPDIRS-librte_bpf := librte_eal librte_mempool librte_mbuf librte_ethdev
 DIRS-$(CONFIG_RTE_LIBRTE_TELEMETRY) += librte_telemetry
 DEPDIRS-librte_telemetry := librte_eal librte_metrics librte_ethdev
+DIRS-$(CONFIG_RTE_LIBRTE_RCU) += librte_rcu
+DEPDIRS-librte_rcu := librte_eal
 
 ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y)
 DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
diff --git a/lib/librte_rcu/Makefile b/lib/librte_rcu/Makefile
new file mode 100644
index 000000000..6aa677bd1
--- /dev/null
+++ b/lib/librte_rcu/Makefile
@@ -0,0 +1,23 @@ 
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Arm Limited
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_rcu.a
+
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3
+LDLIBS += -lrte_eal
+
+EXPORT_MAP := rte_rcu_version.map
+
+LIBABIVER := 1
+
+# all source are stored in SRCS-y
+SRCS-$(CONFIG_RTE_LIBRTE_RCU) := rte_rcu_qsbr.c
+
+# install includes
+SYMLINK-$(CONFIG_RTE_LIBRTE_RCU)-include := rte_rcu_qsbr.h
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_rcu/meson.build b/lib/librte_rcu/meson.build
new file mode 100644
index 000000000..c009ae4b7
--- /dev/null
+++ b/lib/librte_rcu/meson.build
@@ -0,0 +1,5 @@ 
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Arm Limited
+
+sources = files('rte_rcu_qsbr.c')
+headers = files('rte_rcu_qsbr.h')
diff --git a/lib/librte_rcu/rte_rcu_qsbr.c b/lib/librte_rcu/rte_rcu_qsbr.c
new file mode 100644
index 000000000..3c2577ee2
--- /dev/null
+++ b/lib/librte_rcu/rte_rcu_qsbr.c
@@ -0,0 +1,63 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2018 Arm Limited
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <errno.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+#include <rte_malloc.h>
+#include <rte_eal.h>
+#include <rte_eal_memconfig.h>
+#include <rte_atomic.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_errno.h>
+
+#include "rte_rcu_qsbr.h"
+
+/* Initialize a quiescent state variable */
+void __rte_experimental
+rte_rcu_qsbr_init(struct rte_rcu_qsbr *v)
+{
+	memset(v, 0, sizeof(struct rte_rcu_qsbr));
+}
+
+/* Dump the details of a single quiescent state variable to a file. */
+void __rte_experimental
+rte_rcu_qsbr_dump(FILE *f, struct rte_rcu_qsbr *v)
+{
+	uint64_t bmap;
+	uint32_t i, t;
+
+	RTE_ASSERT(v == NULL || f == NULL);
+
+	fprintf(f, "\nQuiescent State Variable @%p\n", v);
+
+	fprintf(f, "  Registered thread ID mask = 0x");
+	for (i = 0; i < RTE_QSBR_BIT_MAP_ELEMS; i++)
+		fprintf(f, "%lx", __atomic_load_n(&v->reg_thread_id[i],
+					__ATOMIC_ACQUIRE));
+	fprintf(f, "\n");
+
+	fprintf(f, "  Token = %lu\n",
+			__atomic_load_n(&v->token, __ATOMIC_ACQUIRE));
+
+	fprintf(f, "Quiescent State Counts for readers:\n");
+	for (i = 0; i < RTE_QSBR_BIT_MAP_ELEMS; i++) {
+		bmap = __atomic_load_n(&v->reg_thread_id[i],
+					__ATOMIC_ACQUIRE);
+		while (bmap) {
+			t = __builtin_ctzl(bmap);
+			fprintf(f, "thread ID = %d, count = %lu\n", t,
+				__atomic_load_n(&v->w[i].cnt,
+						__ATOMIC_RELAXED));
+			bmap &= ~(1UL << t);
+		}
+	}
+}
diff --git a/lib/librte_rcu/rte_rcu_qsbr.h b/lib/librte_rcu/rte_rcu_qsbr.h
new file mode 100644
index 000000000..c818e77fd
--- /dev/null
+++ b/lib/librte_rcu/rte_rcu_qsbr.h
@@ -0,0 +1,321 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2018 Arm Limited
+ */
+
+#ifndef _RTE_RCU_QSBR_H_
+#define _RTE_RCU_QSBR_H_
+
+/**
+ * @file
+ * RTE Quiescent State Based Reclamation (QSBR)
+ *
+ * Quiescent State (QS) is any point in the thread execution
+ * where the thread does not hold a reference to shared memory.
+ * A critical section for a data structure can be a quiescent state for
+ * another data structure.
+ *
+ * This library provides the ability to identify quiescent state.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdio.h>
+#include <stdint.h>
+#include <errno.h>
+#include <rte_common.h>
+#include <rte_memory.h>
+#include <rte_lcore.h>
+#include <rte_debug.h>
+
+/**< Maximum number of reader threads supported. */
+#define RTE_RCU_MAX_THREADS 128
+
+#if !RTE_IS_POWER_OF_2(RTE_RCU_MAX_THREADS)
+#error RTE_RCU_MAX_THREADS must be a power of 2
+#endif
+
+/**< Number of array elements required for the bit-map */
+#define RTE_QSBR_BIT_MAP_ELEMS (RTE_RCU_MAX_THREADS/(sizeof(uint64_t) * 8))
+
+/* Thread IDs are stored as a bitmap of 64b element array. Given thread id
+ * needs to be converted to index into the array and the id within
+ * the array element.
+ */
+#define RTE_QSBR_THR_INDEX_SHIFT 6
+#define RTE_QSBR_THR_ID_MASK 0x3f
+
+/* Worker thread counter */
+struct rte_rcu_qsbr_cnt {
+	uint64_t cnt; /**< Quiescent state counter. */
+} __rte_cache_aligned;
+
+/**
+ * RTE thread Quiescent State structure.
+ */
+struct rte_rcu_qsbr {
+	uint64_t reg_thread_id[RTE_QSBR_BIT_MAP_ELEMS] __rte_cache_aligned;
+	/**< Registered reader thread IDs - reader threads reporting
+	 * on this QS variable represented in a bit map.
+	 */
+
+	uint64_t token __rte_cache_aligned;
+	/**< Counter to allow for multiple simultaneous QS queries */
+
+	struct rte_rcu_qsbr_cnt w[RTE_RCU_MAX_THREADS] __rte_cache_aligned;
+	/**< QS counter for each reader thread, counts upto
+	 * current value of token.
+	 */
+} __rte_cache_aligned;
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Initialize a Quiescent State (QS) variable.
+ *
+ * @param v
+ *   QS variable
+ *
+ */
+void __rte_experimental
+rte_rcu_qsbr_init(struct rte_rcu_qsbr *v);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Add a reader thread, to the list of threads reporting their quiescent
+ * state on a QS variable.
+ *
+ * This is implemented as a lock-free function. It is multi-thread
+ * safe.
+ * Any reader thread that wants to report its quiescent state must
+ * call this API before calling rte_rcu_qsbr_update. This can be called
+ * during initialization or as part of the packet processing loop.
+ * Any ongoing QS queries may wait for the status from this registered
+ * thread.
+ *
+ * @param v
+ *   QS variable
+ * @param thread_id
+ *   Reader thread with this thread ID will report its quiescent state on
+ *   the QS variable.
+ */
+static __rte_always_inline void __rte_experimental
+rte_rcu_qsbr_register_thread(struct rte_rcu_qsbr *v, unsigned int thread_id)
+{
+	unsigned int i, id;
+
+	RTE_ASSERT(v == NULL || thread_id >= RTE_RCU_MAX_THREADS);
+
+	id = thread_id & RTE_QSBR_THR_ID_MASK;
+	i = thread_id >> RTE_QSBR_THR_INDEX_SHIFT;
+
+	/* Worker thread has to count the quiescent states
+	 * only from the current value of token.
+	 * __atomic_store_n(cnt, __ATOMIC_RELAXED) is used to ensure
+	 * 'cnt' (64b) is accessed atomically.
+	 */
+	__atomic_store_n(&v->w[thread_id].cnt,
+		__atomic_load_n(&v->token, __ATOMIC_ACQUIRE),
+		__ATOMIC_RELAXED);
+
+	/* Release the store to initial TQS count so that readers
+	 * can use it immediately after this function returns.
+	 */
+	__atomic_fetch_or(&v->reg_thread_id[i], 1UL << id, __ATOMIC_RELEASE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Remove a reader thread, from the list of threads reporting their
+ * quiescent state on a QS variable.
+ *
+ * This is implemented as a lock-free function. It is multi-thread safe.
+ * This API can be called from the reader threads during shutdown.
+ * Ongoing QS queries will stop waiting for the status from this
+ * unregistered reader thread.
+ *
+ * @param v
+ *   QS variable
+ * @param thread_id
+ *   Reader thread with this thread ID will stop reporting its quiescent
+ *   state on the QS variable.
+ */
+static __rte_always_inline void __rte_experimental
+rte_rcu_qsbr_unregister_thread(struct rte_rcu_qsbr *v, unsigned int thread_id)
+{
+	unsigned int i, id;
+
+	RTE_ASSERT(v == NULL || thread_id >= RTE_RCU_MAX_THREADS);
+
+	id = thread_id & RTE_QSBR_THR_ID_MASK;
+	i = thread_id >> RTE_QSBR_THR_INDEX_SHIFT;
+
+	/* Make sure the removal of the thread from the list of
+	 * reporting threads is visible before the thread
+	 * does anything else.
+	 */
+	__atomic_fetch_and(&v->reg_thread_id[i],
+				~(1UL << id), __ATOMIC_RELEASE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Trigger the reader threads to report the quiescent state
+ * status.
+ *
+ * This is implemented as a lock-free function. It is multi-thread
+ * safe and can be called from worker threads.
+ *
+ * @param v
+ *   TQS variable
+ * @param n
+ *   Expected number of times the quiescent state is entered
+ * @param t
+ *   - If successful, this is the token for this call of the API.
+ *     This should be passed to rte_rcu_qsbr_check API.
+ */
+static __rte_always_inline void __rte_experimental
+rte_rcu_qsbr_start(struct rte_rcu_qsbr *v, unsigned int n, uint64_t *t)
+{
+	RTE_ASSERT(v == NULL || t == NULL);
+
+	/* This store release will ensure that changes to any data
+	 * structure are visible to the workers before the token
+	 * update is visible.
+	 */
+	*t = __atomic_add_fetch(&v->token, n, __ATOMIC_RELEASE);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Update quiescent state for a reader thread.
+ *
+ * This is implemented as a lock-free function. It is multi-thread safe.
+ * All the reader threads registered to report their quiescent state
+ * on the QS variable must call this API.
+ *
+ * @param v
+ *   QS variable
+ */
+static __rte_always_inline void __rte_experimental
+rte_rcu_qsbr_update(struct rte_rcu_qsbr *v, unsigned int thread_id)
+{
+	uint64_t t;
+
+	RTE_ASSERT(v == NULL || thread_id >= RTE_RCU_MAX_THREADS);
+
+	/* Load the token before the reader thread loads any other
+	 * (lock-free) data structure. This ensures that updates
+	 * to the data structures are visible if the update
+	 * to token is visible.
+	 */
+	t = __atomic_load_n(&v->token, __ATOMIC_ACQUIRE);
+
+	/* Relaxed load/store on the counter is enough as we are
+	 * reporting an already completed quiescent state.
+	 * __atomic_load_n(cnt, __ATOMIC_RELAXED) is used as 'cnt' (64b)
+	 * is accessed atomically.
+	 * Copy the current token value. This will end grace period
+	 * of multiple concurrent writers.
+	 */
+	if (__atomic_load_n(&v->w[thread_id].cnt, __ATOMIC_RELAXED) != t)
+		__atomic_store_n(&v->w[thread_id].cnt, t, __ATOMIC_RELAXED);
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Checks if all the reader threads have entered the quiescent state
+ * 'n' number of times. 'n' is provided in rte_rcu_qsbr_start API.
+ *
+ * This is implemented as a lock-free function. It is multi-thread
+ * safe and can be called from the worker threads as well.
+ *
+ * @param v
+ *   QS variable
+ * @param t
+ *   Token returned by rte_rcu_qsbr_start API
+ * @param wait
+ *   If true, block till all the reader threads have completed entering
+ *   the quiescent state 'n' number of times
+ * @return
+ *   - 0 if all reader threads have NOT passed through specified number
+ *     of quiescent states.
+ *   - 1 if all reader threads have passed through specified number
+ *     of quiescent states.
+ */
+static __rte_always_inline int __rte_experimental
+rte_rcu_qsbr_check(struct rte_rcu_qsbr *v, uint64_t t, bool wait)
+{
+	uint32_t i, j, id;
+	uint64_t bmap;
+
+	RTE_ASSERT(v == NULL);
+
+	i = 0;
+	do {
+		/* Load the current registered thread bit map before
+		 * loading the reader thread quiescent state counters.
+		 */
+		bmap = __atomic_load_n(&v->reg_thread_id[i], __ATOMIC_ACQUIRE);
+		id = i << RTE_QSBR_THR_INDEX_SHIFT;
+
+		while (bmap) {
+			j = __builtin_ctzl(bmap);
+
+			/* __atomic_store_n(cnt, __ATOMIC_RELAXED)
+			 * is used to ensure 'cnt' (64b) is accessed
+			 * atomically.
+			 */
+			if (unlikely(__atomic_load_n(&v->w[id + j].cnt,
+					__ATOMIC_RELAXED) < t)) {
+				/* This thread is not in QS */
+				if (!wait)
+					return 0;
+
+				/* Loop till this thread enters QS */
+				rte_pause();
+				continue;
+			}
+
+			bmap &= ~(1UL << j);
+		}
+
+		i++;
+	} while (i < RTE_QSBR_BIT_MAP_ELEMS);
+
+	return 1;
+}
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Dump the details of a single QS variables to a file.
+ *
+ * It is NOT multi-thread safe.
+ *
+ * @param f
+ *   A pointer to a file for output
+ * @param v
+ *   QS variable
+ */
+void __rte_experimental
+rte_rcu_qsbr_dump(FILE *f, struct rte_rcu_qsbr *v);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_RCU_QSBR_H_ */
diff --git a/lib/librte_rcu/rte_rcu_version.map b/lib/librte_rcu/rte_rcu_version.map
new file mode 100644
index 000000000..0df2071be
--- /dev/null
+++ b/lib/librte_rcu/rte_rcu_version.map
@@ -0,0 +1,8 @@ 
+EXPERIMENTAL {
+	global:
+
+	rte_rcu_qsbr_init;
+	rte_rcu_qsbr_dump;
+
+	local: *;
+};
diff --git a/lib/meson.build b/lib/meson.build
index bb7f443f9..d0e49d4e1 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -21,7 +21,7 @@  libraries = [ 'compat', # just a header, used for versioning
 	'gro', 'gso', 'ip_frag', 'jobstats',
 	'kni', 'latencystats', 'lpm', 'member',
 	'meter', 'power', 'pdump', 'rawdev',
-	'reorder', 'sched', 'security', 'vhost',
+	'reorder', 'sched', 'security', 'rcu', 'vhost',
 	# add pkt framework libs which use other libs from above
 	'port', 'table', 'pipeline',
 	# flow_classify lib depends on pkt framework table lib
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 3ebc4e64c..d4a1d436a 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -92,6 +92,7 @@  _LDLIBS-$(CONFIG_RTE_LIBRTE_EAL)            += -lrte_eal
 _LDLIBS-$(CONFIG_RTE_LIBRTE_CMDLINE)        += -lrte_cmdline
 _LDLIBS-$(CONFIG_RTE_LIBRTE_REORDER)        += -lrte_reorder
 _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED)          += -lrte_sched
+_LDLIBS-$(CONFIG_RTE_LIBRTE_RCU)            += -lrte_rcu
 
 ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP),y)
 _LDLIBS-$(CONFIG_RTE_LIBRTE_KNI)            += -lrte_kni