[v2,1/2] cryptodev: support enqueue callback functions
diff mbox series

Message ID 1599549024-195051-1-git-send-email-abhinandan.gujjar@intel.com
State Superseded
Delegated to: akhil goyal
Headers show
Series
  • [v2,1/2] cryptodev: support enqueue callback functions
Related show

Checks

Context Check Description
ci/Performance-Testing fail build patch failure
ci/checkpatch warning coding style issues

Commit Message

Abhinandan Gujjar Sept. 8, 2020, 7:10 a.m. UTC
In an eventdev world, multiple workers (with ordered queue) will be
working on IPsec ESP processing. The ESP header's sequence number is
unique and has to be sequentially incremented in an orderly manner.
This rises a need for incrementing sequence number in crypto stage
especially in event crypto adapter. By adding a user callback to
cryptodev at enqueue burst, the user callback will get executed
in the context of event crypto adapter. This helps the application
to increment the ESP sequence number atomically and orderly manner.

This patch adds APIs to add/remove callback functions. The callback
function will be called for each burst of crypto ops received on a
given crypto device queue pair.

v1->v2:
Moved callback related members to the end of cryptodev struct
Added support for RCU

Signed-off-by: Abhinandan Gujjar <abhinandan.gujjar@intel.com>
---
 config/common_base                             |   1 +
 lib/librte_cryptodev/Makefile                  |   2 +-
 lib/librte_cryptodev/rte_cryptodev.c           | 157 +++++++++++++++++++++++++
 lib/librte_cryptodev/rte_cryptodev.h           | 154 +++++++++++++++++++++++-
 lib/librte_cryptodev/rte_cryptodev_version.map |   6 +
 5 files changed, 318 insertions(+), 2 deletions(-)

Comments

Abhinandan Gujjar Sept. 9, 2020, 4:21 p.m. UTC | #1
+Konstantin & Honnappa

> -----Original Message-----
> From: Gujjar, Abhinandan S <abhinandan.gujjar@intel.com>
> Sent: Tuesday, September 8, 2020 12:40 PM
> To: dev@dpdk.org
> Cc: Doherty, Declan <declan.doherty@intel.com>; jerinj@marvell.com;
> akhil.goyal@nxp.com; Vangati, Narender <narender.vangati@intel.com>;
> Gujjar, Abhinandan S <abhinandan.gujjar@intel.com>
> Subject: [v2 1/2] cryptodev: support enqueue callback functions
> 
> In an eventdev world, multiple workers (with ordered queue) will be working
> on IPsec ESP processing. The ESP header's sequence number is unique and
> has to be sequentially incremented in an orderly manner.
> This rises a need for incrementing sequence number in crypto stage
> especially in event crypto adapter. By adding a user callback to cryptodev at
> enqueue burst, the user callback will get executed in the context of event
> crypto adapter. This helps the application to increment the ESP sequence
> number atomically and orderly manner.
> 
> This patch adds APIs to add/remove callback functions. The callback function
> will be called for each burst of crypto ops received on a given crypto device
> queue pair.
> 
> v1->v2:
> Moved callback related members to the end of cryptodev struct Added
> support for RCU
> 
> Signed-off-by: Abhinandan Gujjar <abhinandan.gujjar@intel.com>
> ---
>  config/common_base                             |   1 +
>  lib/librte_cryptodev/Makefile                  |   2 +-
>  lib/librte_cryptodev/rte_cryptodev.c           | 157
> +++++++++++++++++++++++++
>  lib/librte_cryptodev/rte_cryptodev.h           | 154
> +++++++++++++++++++++++-
>  lib/librte_cryptodev/rte_cryptodev_version.map |   6 +
>  5 files changed, 318 insertions(+), 2 deletions(-)
> 
> diff --git a/config/common_base b/config/common_base index
> fbf0ee7..f5ebde4 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -599,6 +599,7 @@
> CONFIG_RTE_LIBRTE_PMD_BBDEV_FPGA_5GNR_FEC=y
>  #
>  CONFIG_RTE_LIBRTE_CRYPTODEV=y
>  CONFIG_RTE_CRYPTO_MAX_DEVS=64
> +CONFIG_RTE_CRYPTODEV_CALLBACKS=y
> 
>  #
>  # Compile PMD for ARMv8 Crypto device
> diff --git a/lib/librte_cryptodev/Makefile b/lib/librte_cryptodev/Makefile
> index 73e77a2..514d552 100644
> --- a/lib/librte_cryptodev/Makefile
> +++ b/lib/librte_cryptodev/Makefile
> @@ -10,7 +10,7 @@ LIB = librte_cryptodev.a  CFLAGS += -O3  CFLAGS +=
> $(WERROR_FLAGS)  LDLIBS += -lrte_eal -lrte_mempool -lrte_ring -lrte_mbuf
> -LDLIBS += -lrte_kvargs
> +LDLIBS += -lrte_kvargs -lrte_rcu
> 
>  # library source files
>  SRCS-y += rte_cryptodev.c rte_cryptodev_pmd.c cryptodev_trace_points.c
> diff --git a/lib/librte_cryptodev/rte_cryptodev.c
> b/lib/librte_cryptodev/rte_cryptodev.c
> index 1dd795b..2fb3e35 100644
> --- a/lib/librte_cryptodev/rte_cryptodev.c
> +++ b/lib/librte_cryptodev/rte_cryptodev.c
> @@ -38,6 +38,7 @@
>  #include <rte_string_fns.h>
>  #include <rte_compat.h>
>  #include <rte_function_versioning.h>
> +#include <rte_rcu_qsbr.h>
> 
>  #include "rte_crypto.h"
>  #include "rte_cryptodev.h"
> @@ -499,6 +500,10 @@ struct
> rte_cryptodev_sym_session_pool_private_data {
>  	return 0;
>  }
> 
> +#ifdef RTE_CRYPTODEV_CALLBACKS
> +/* spinlock for crypto device enq callbacks */ static rte_spinlock_t
> +rte_cryptodev_enq_cb_lock = RTE_SPINLOCK_INITIALIZER; #endif
> 
>  const char *
>  rte_cryptodev_get_feature_name(uint64_t flag) @@ -1449,6 +1454,158
> @@ struct rte_cryptodev *
>  	rte_spinlock_unlock(&rte_cryptodev_cb_lock);
>  }
> 
> +#ifdef RTE_CRYPTODEV_CALLBACKS
> +int
> +rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct rte_rcu_qsbr *qsbr) {
> +
> +	struct rte_cryptodev *dev;
> +
> +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> +		return -EINVAL;
> +	}
> +
> +	dev = &rte_crypto_devices[dev_id];
> +	dev->qsbr = qsbr;
> +	return 0;
> +}
> +
> +struct rte_cryptodev_enq_callback *
> +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> +			       uint16_t qp_id,
> +			       rte_cryptodev_enq_cb_fn cb_fn,
> +			       void *cb_arg)
> +{
> +	struct rte_cryptodev *dev;
> +	struct rte_cryptodev_enq_callback *cb, *tail;
> +
> +	if (!cb_fn)
> +		return NULL;
> +
> +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> +		return NULL;
> +	}
> +
> +	dev = &rte_crypto_devices[dev_id];
> +	if (qp_id >= dev->data->nb_queue_pairs) {
> +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> +		return NULL;
> +	}
> +
> +	cb = rte_zmalloc(NULL, sizeof(*cb), 0);
> +	if (cb == NULL) {
> +		CDEV_LOG_ERR("Failed to allocate memory for callback on "
> +			     "dev=%d, queue_pair_id=%d", dev_id, qp_id);
> +		rte_errno = ENOMEM;
> +		return NULL;
> +	}
> +
> +	cb->fn = cb_fn;
> +	cb->arg = cb_arg;
> +
> +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> +	if (dev->enq_cbs == NULL) {
> +		dev->enq_cbs = rte_zmalloc(NULL, sizeof(cb) *
> +					   dev->data->nb_queue_pairs, 0);
> +		if (dev->enq_cbs == NULL) {
> +			CDEV_LOG_ERR("Failed to allocate memory for
> callbacks");
> +			rte_errno = ENOMEM;
> +			rte_free(cb);
> +			return NULL;
> +		}
> +	}
> +
> +	/* Add the callbacks in fifo order. */
> +	tail = dev->enq_cbs[qp_id];
> +	if (tail) {
> +		while (tail->next)
> +			tail = tail->next;
> +		tail->next = cb;
> +	} else
> +		dev->enq_cbs[qp_id] = cb;
> +
> +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> +
> +	return cb;
> +}
> +
> +int
> +rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> +				  uint16_t qp_id,
> +				  struct rte_cryptodev_enq_callback *cb) {
> +	struct rte_cryptodev *dev;
> +	struct rte_cryptodev_enq_callback **prev_cb, *curr_cb;
> +	uint16_t qp;
> +	int free_mem;
> +	int ret;
> +
> +	free_mem = 1;
> +
> +	if (!cb) {
> +		CDEV_LOG_ERR("cb is NULL");
> +		return -EINVAL;
> +	}
> +
> +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> +		return -EINVAL;
> +	}
> +
> +	dev = &rte_crypto_devices[dev_id];
> +	if (qp_id >= dev->data->nb_queue_pairs) {
> +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> +		return -EINVAL;
> +	}
> +
> +	if (!dev->qsbr) {
> +		CDEV_LOG_ERR("Rcu qsbr is NULL");
> +		return -EINVAL;
> +	}
> +
> +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> +	if (dev->enq_cbs == NULL) {
> +		rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> +		return -EINVAL;
> +	}
> +
> +	prev_cb = &dev->enq_cbs[qp_id];
> +	for (; *prev_cb != NULL; prev_cb = &curr_cb->next) {
> +		curr_cb = *prev_cb;
> +		if (curr_cb == cb) {
> +			/* Remove the user cb from the callback list. */
> +			*prev_cb = curr_cb->next;
> +			ret = 0;
> +			break;
> +		}
> +	}
> +
> +	if (!ret) {
> +		/* Call sync with invalid thread id as this is part of
> +		 * control plane API */
> +		rte_rcu_qsbr_synchronize(dev->qsbr,
> +					 RTE_QSBR_THRID_INVALID);
> +		rte_free(cb);
> +	}
> +
> +	for (qp = 0; qp < dev->data->nb_queue_pairs; qp++)
> +		if (dev->enq_cbs[qp] != NULL) {
> +			free_mem = 0;
> +			break;
> +		}
> +
> +	if (free_mem) {
> +		rte_free(dev->enq_cbs);
> +		dev->enq_cbs = NULL;
> +	}
> +
> +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> +
> +	return ret;
> +}
> +#endif
> 
>  int
>  rte_cryptodev_sym_session_init(uint8_t dev_id, diff --git
> a/lib/librte_cryptodev/rte_cryptodev.h
> b/lib/librte_cryptodev/rte_cryptodev.h
> index 7b3ebc2..2c7a47b 100644
> --- a/lib/librte_cryptodev/rte_cryptodev.h
> +++ b/lib/librte_cryptodev/rte_cryptodev.h
> @@ -530,6 +530,32 @@ struct rte_cryptodev_qp_conf {  };
> 
>  /**
> + * Function type used for pre processing crypto ops when enqueue burst
> +is
> + * called.
> + *
> + * The callback function is called on enqueue burst immediately
> + * before the crypto ops are put onto the hardware queue for processing.
> + *
> + * @param	dev_id		The identifier of the device.
> + * @param	qp_id		The index of the queue pair in which ops are
> + *				to be enqueued for processing. The value
> + *				must be in the range [0, nb_queue_pairs - 1]
> + *				previously supplied to
> + *				*rte_cryptodev_configure*.
> + * @param	ops		The address of an array of *nb_ops* pointers
> + *				to *rte_crypto_op* structures which contain
> + *				the crypto operations to be processed.
> + * @param	nb_ops		The number of operations to process.
> + * @param	user_param	The arbitrary user parameter passed in by the
> + *				application when the callback was originally
> + *				registered.
> + * @return			The number of ops to be enqueued to the
> + *				crypto device.
> + */
> +typedef uint16_t (*rte_cryptodev_enq_cb_fn)(uint16_t dev_id, uint16_t
> qp_id,
> +		struct rte_crypto_op **ops, uint16_t nb_ops, void
> *user_param);
> +
> +/**
>   * Typedef for application callback function to be registered by application
>   * software for notification of device events
>   *
> @@ -853,7 +879,6 @@ struct rte_cryptodev_config {
>  		enum rte_cryptodev_event_type event,
>  		rte_cryptodev_cb_fn cb_fn, void *cb_arg);
> 
> -
>  typedef uint16_t (*dequeue_pkt_burst_t)(void *qp,
>  		struct rte_crypto_op **ops,	uint16_t nb_ops);
>  /**< Dequeue processed packets from queue pair of a device. */ @@ -870,6
> +895,17 @@ typedef uint16_t (*enqueue_pkt_burst_t)(void *qp,
>  /** Structure to keep track of registered callbacks */
> TAILQ_HEAD(rte_cryptodev_cb_list, rte_cryptodev_callback);
> 
> +/**
> + * @internal
> + * Structure used to hold information about the callbacks to be called
> +for a
> + * queue pair on enqueue.
> + */
> +struct rte_cryptodev_enq_callback {
> +	struct rte_cryptodev_enq_callback *next;
> +	rte_cryptodev_enq_cb_fn fn;
> +	void *arg;
> +};
> +
>  /** The data structure associated with each crypto device. */  struct
> rte_cryptodev {
>  	dequeue_pkt_burst_t dequeue_burst;
> @@ -898,6 +934,14 @@ struct rte_cryptodev {
>  	__extension__
>  	uint8_t attached : 1;
>  	/**< Flag indicating the device is attached */
> +
> +#ifdef RTE_CRYPTODEV_CALLBACKS
> +	struct rte_cryptodev_enq_callback **enq_cbs;
> +	/**< User application callback for pre enqueue processing */
> +
> +	struct rte_rcu_qsbr *qsbr;
> +	/** < RCU QSBR variable for rte_cryptodev_enq_callback */ #endif
>  } __rte_cache_aligned;
> 
>  void *
> @@ -1019,6 +1063,18 @@ struct rte_cryptodev_data {
>  		struct rte_crypto_op **ops, uint16_t nb_ops)  {
>  	struct rte_cryptodev *dev = &rte_cryptodevs[dev_id];
> +#ifdef RTE_CRYPTODEV_CALLBACKS
> +	if (unlikely(dev->enq_cbs != NULL && dev->enq_cbs[qp_id] !=
> NULL)) {
> +		struct rte_cryptodev_enq_callback *cb =
> +			dev->enq_cbs[qp_id];
> +
> +		do {
> +			nb_ops = cb->fn(dev_id, qp_id, ops, nb_ops,
> +					cb->arg);
> +			cb = cb->next;
> +		} while (cb != NULL);
> +	}
> +#endif
> 
>  	rte_cryptodev_trace_enqueue_burst(dev_id, qp_id, (void **)ops,
> nb_ops);
>  	return (*dev->enqueue_burst)(
> @@ -1351,6 +1407,102 @@ struct rte_cryptodev_asym_session *
>  	struct rte_cryptodev_sym_session *sess, union rte_crypto_sym_ofs
> ofs,
>  	struct rte_crypto_sym_vec *vec);
> 
> +#ifdef RTE_CRYPTODEV_CALLBACKS
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Add a user callback for a given crypto device and queue pair which
> +will be
> + * called on crypto ops enqueue.
> + *
> + * This API configures a function to be called for each burst of crypto
> +ops
> + * received on a given crypto device queue pair. The return value is a
> +pointer
> + * that can be used later to remove the callback using
> + * rte_cryptodev_remove_enq_callback().
> + *
> + * Multiple functions are called in the order that they are added.
> + *
> + * @param	dev_id		The identifier of the device.
> + * @param	qp_id		The index of the queue pair in which ops are
> + *				to be enqueued for processing. The value
> + *				must be in the range [0, nb_queue_pairs - 1]
> + *				previously supplied to
> + *				*rte_cryptodev_configure*.
> + * @param	cb_fn		The callback function
> + * @param	cb_arg		A generic pointer parameter which will be
> passed
> + *				to each invocation of the callback function on
> + *				this crypto device and queue pair.
> + *
> + * @return
> + *   NULL on error.
> + *   On success, a pointer value which can later be used to remove the
> callback.
> + */
> +
> +__rte_experimental
> +struct rte_cryptodev_enq_callback *
> +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> +			       uint16_t qp_id,
> +			       rte_cryptodev_enq_cb_fn cb_fn,
> +			       void *cb_arg);
> +
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Remove a user callback function for given crypto device and queue pair.
> + *
> + * This function is used to removed callbacks that were added to a
> +crypto
> + * device queue pair using rte_cryptodev_add_enq_callback().
> + *
> + * Note: The callback expects a RCU QSBR to be configured to
> +synchronize
> + * to free the memory. Application is expected to configure RCU QSBR
> +after
> + * adding an enqueue callback.
> + *
> + *
> + * @param	dev_id		The identifier of the device.
> + * @param	qp_id		The index of the queue pair in which ops are
> + *				to be enqueued for processing. The value
> + *				must be in the range [0, nb_queue_pairs - 1]
> + *				previously supplied to
> + *				*rte_cryptodev_configure*.
> + * @param	cb		Pointer to user supplied callback created via
> + *				rte_cryptodev_add_enq_callback().
> + *
> + * @return
> + *   - 0: Success. Callback was removed.
> + *   - -EINVAL:  The dev_id or the qp_id is out of range, or the callback
> + *               is NULL or not found for the crypto device queue pair.
> + */
> +
> +__rte_experimental
> +int rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> +				  uint16_t qp_id,
> +				  struct rte_cryptodev_enq_callback *cb);
> +
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Associate RCU QSBR variable with a cryptodev.
> + *
> + * This function is used to add RCU QSBR to a crypto device.
> + * The purpose of RCU is to help multiple threads to synchronize
> + * with each other before initiating adding/removing callback
> + * while dataplane threads are running enqueue callbacks.
> + *
> + * @param	dev_id		The identifier of the device.
> + * @param	qsr		RCU QSBR configuration
> + * @return
> + *   On success - 0
> + *   On error - EINVAL.
> + */
> +
> +__rte_experimental
> +int rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct rte_rcu_qsbr
> +*qsbr); #endif
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_cryptodev/rte_cryptodev_version.map
> b/lib/librte_cryptodev/rte_cryptodev_version.map
> index 02f6dcf..46de3ca 100644
> --- a/lib/librte_cryptodev/rte_cryptodev_version.map
> +++ b/lib/librte_cryptodev/rte_cryptodev_version.map
> @@ -64,6 +64,7 @@ DPDK_20.0 {
>  	rte_cryptodev_sym_capability_get;
>  };
> 
> +
>  EXPERIMENTAL {
>  	global:
> 
> @@ -105,4 +106,9 @@ EXPERIMENTAL {
> 
>  	# added in 20.08
>  	rte_cryptodev_get_qp_status;
> +
> +	# added in 20.11
> +	rte_cryptodev_add_enq_callback;
> +	rte_cryptodev_remove_enq_callback;
> +	rte_cryptodev_rcu_qsbr_add;
>  };
> --
> 1.9.1
Honnappa Nagarahalli Sept. 9, 2020, 10:48 p.m. UTC | #2
<snip>

> >
> > In an eventdev world, multiple workers (with ordered queue) will be
> > working on IPsec ESP processing. The ESP header's sequence number is
> > unique and has to be sequentially incremented in an orderly manner.
> > This rises a need for incrementing sequence number in crypto stage
> > especially in event crypto adapter. By adding a user callback to
> > cryptodev at enqueue burst, the user callback will get executed in the
> > context of event crypto adapter. This helps the application to
> > increment the ESP sequence number atomically and orderly manner.
> >
> > This patch adds APIs to add/remove callback functions. The callback
> > function will be called for each burst of crypto ops received on a
> > given crypto device queue pair.
> >
> > v1->v2:
> > Moved callback related members to the end of cryptodev struct Added
> > support for RCU
> >
> > Signed-off-by: Abhinandan Gujjar <abhinandan.gujjar@intel.com>
> > ---
> >  config/common_base                             |   1 +
> >  lib/librte_cryptodev/Makefile                  |   2 +-
> >  lib/librte_cryptodev/rte_cryptodev.c           | 157
> > +++++++++++++++++++++++++
> >  lib/librte_cryptodev/rte_cryptodev.h           | 154
> > +++++++++++++++++++++++-
> >  lib/librte_cryptodev/rte_cryptodev_version.map |   6 +
> >  5 files changed, 318 insertions(+), 2 deletions(-)
> >
> > diff --git a/config/common_base b/config/common_base index
> > fbf0ee7..f5ebde4 100644
> > --- a/config/common_base
> > +++ b/config/common_base
> > @@ -599,6 +599,7 @@
> > CONFIG_RTE_LIBRTE_PMD_BBDEV_FPGA_5GNR_FEC=y
> >  #
> >  CONFIG_RTE_LIBRTE_CRYPTODEV=y
> >  CONFIG_RTE_CRYPTO_MAX_DEVS=64
> > +CONFIG_RTE_CRYPTODEV_CALLBACKS=y
> >
> >  #
> >  # Compile PMD for ARMv8 Crypto device diff --git
> > a/lib/librte_cryptodev/Makefile b/lib/librte_cryptodev/Makefile index
> > 73e77a2..514d552 100644
> > --- a/lib/librte_cryptodev/Makefile
> > +++ b/lib/librte_cryptodev/Makefile
> > @@ -10,7 +10,7 @@ LIB = librte_cryptodev.a  CFLAGS += -O3  CFLAGS +=
> > $(WERROR_FLAGS)  LDLIBS += -lrte_eal -lrte_mempool -lrte_ring
> > -lrte_mbuf -LDLIBS += -lrte_kvargs
> > +LDLIBS += -lrte_kvargs -lrte_rcu
> >
> >  # library source files
> >  SRCS-y += rte_cryptodev.c rte_cryptodev_pmd.c
> > cryptodev_trace_points.c diff --git
> > a/lib/librte_cryptodev/rte_cryptodev.c
> > b/lib/librte_cryptodev/rte_cryptodev.c
> > index 1dd795b..2fb3e35 100644
> > --- a/lib/librte_cryptodev/rte_cryptodev.c
> > +++ b/lib/librte_cryptodev/rte_cryptodev.c
> > @@ -38,6 +38,7 @@
> >  #include <rte_string_fns.h>
> >  #include <rte_compat.h>
> >  #include <rte_function_versioning.h>
> > +#include <rte_rcu_qsbr.h>
> >
> >  #include "rte_crypto.h"
> >  #include "rte_cryptodev.h"
> > @@ -499,6 +500,10 @@ struct
> > rte_cryptodev_sym_session_pool_private_data {
> >  	return 0;
> >  }
> >
> > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > +/* spinlock for crypto device enq callbacks */ static rte_spinlock_t
> > +rte_cryptodev_enq_cb_lock = RTE_SPINLOCK_INITIALIZER; #endif
Are we claiming the APIs to be thread safe? Is the spinlock required?

> >
> >  const char *
> >  rte_cryptodev_get_feature_name(uint64_t flag) @@ -1449,6 +1454,158
> @@
> > struct rte_cryptodev *
> >  	rte_spinlock_unlock(&rte_cryptodev_cb_lock);
> >  }
> >
> > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > +int
> > +rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct rte_rcu_qsbr *qsbr)
Suggest moving this API outside of RTE_CRYPTODEV_CALLBACKS as it can be used for other RCU related features in the future.

> > +{
> > +
> > +	struct rte_cryptodev *dev;
> > +
> > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > +		return -EINVAL;
> > +	}
> > +
> > +	dev = &rte_crypto_devices[dev_id];
> > +	dev->qsbr = qsbr;
> > +	return 0;
> > +}
> > +
> > +struct rte_cryptodev_enq_callback *
> > +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> > +			       uint16_t qp_id,
> > +			       rte_cryptodev_enq_cb_fn cb_fn,
> > +			       void *cb_arg)
> > +{
> > +	struct rte_cryptodev *dev;
> > +	struct rte_cryptodev_enq_callback *cb, *tail;
> > +
> > +	if (!cb_fn)
> > +		return NULL;
> > +
> > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > +		return NULL;
> > +	}
> > +
> > +	dev = &rte_crypto_devices[dev_id];
> > +	if (qp_id >= dev->data->nb_queue_pairs) {
> > +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> > +		return NULL;
> > +	}
> > +
> > +	cb = rte_zmalloc(NULL, sizeof(*cb), 0);
> > +	if (cb == NULL) {
> > +		CDEV_LOG_ERR("Failed to allocate memory for callback on "
> > +			     "dev=%d, queue_pair_id=%d", dev_id, qp_id);
> > +		rte_errno = ENOMEM;
> > +		return NULL;
> > +	}
> > +
> > +	cb->fn = cb_fn;
> > +	cb->arg = cb_arg;
> > +
> > +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> > +	if (dev->enq_cbs == NULL) {
> > +		dev->enq_cbs = rte_zmalloc(NULL, sizeof(cb) *
> > +					   dev->data->nb_queue_pairs, 0);
> > +		if (dev->enq_cbs == NULL) {
> > +			CDEV_LOG_ERR("Failed to allocate memory for
> > callbacks");
> > +			rte_errno = ENOMEM;
> > +			rte_free(cb);
Add unlock here.

> > +			return NULL;
> > +		}
> > +	}
> > +
> > +	/* Add the callbacks in fifo order. */
> > +	tail = dev->enq_cbs[qp_id];
> > +	if (tail) {
> > +		while (tail->next)
> > +			tail = tail->next;
> > +		tail->next = cb;
> > +	} else
> > +		dev->enq_cbs[qp_id] = cb;
> > +
> > +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > +
> > +	return cb;
> > +}
> > +
> > +int
> > +rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> > +				  uint16_t qp_id,
> > +				  struct rte_cryptodev_enq_callback *cb) {
> > +	struct rte_cryptodev *dev;
> > +	struct rte_cryptodev_enq_callback **prev_cb, *curr_cb;
> > +	uint16_t qp;
> > +	int free_mem;
> > +	int ret;
> > +
> > +	free_mem = 1;
> > +
> > +	if (!cb) {
> > +		CDEV_LOG_ERR("cb is NULL");
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > +		return -EINVAL;
> > +	}
> > +
> > +	dev = &rte_crypto_devices[dev_id];
> > +	if (qp_id >= dev->data->nb_queue_pairs) {
> > +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (!dev->qsbr) {
> > +		CDEV_LOG_ERR("Rcu qsbr is NULL");
> > +		return -EINVAL;
> > +	}
I see that this check is not done in 'rte_cryptodev_add_enq_callback'. I think this is ok, it avoids enforcing an order for API calls.

> > +
> > +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> > +	if (dev->enq_cbs == NULL) {
> > +		rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > +		return -EINVAL;
> > +	}
> > +
> > +	prev_cb = &dev->enq_cbs[qp_id];
> > +	for (; *prev_cb != NULL; prev_cb = &curr_cb->next) {
> > +		curr_cb = *prev_cb;
> > +		if (curr_cb == cb) {
> > +			/* Remove the user cb from the callback list. */
> > +			*prev_cb = curr_cb->next;
> > +			ret = 0;
> > +			break;
> > +		}
> > +	}
> > +
> > +	if (!ret) {
> > +		/* Call sync with invalid thread id as this is part of
> > +		 * control plane API */
> > +		rte_rcu_qsbr_synchronize(dev->qsbr,
> > +					 RTE_QSBR_THRID_INVALID);
> > +		rte_free(cb);
> > +	}
> > +
Nit, IMO, it is more readable if 'free_mem = 1' is moved here.

> > +	for (qp = 0; qp < dev->data->nb_queue_pairs; qp++)
> > +		if (dev->enq_cbs[qp] != NULL) {
> > +			free_mem = 0;
> > +			break;
> > +		}
> > +
> > +	if (free_mem) {
> > +		rte_free(dev->enq_cbs);
> > +		dev->enq_cbs = NULL;
> > +	}
> > +
> > +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > +
> > +	return ret;
> > +}
> > +#endif
> >
> >  int
> >  rte_cryptodev_sym_session_init(uint8_t dev_id, diff --git
> > a/lib/librte_cryptodev/rte_cryptodev.h
> > b/lib/librte_cryptodev/rte_cryptodev.h
> > index 7b3ebc2..2c7a47b 100644
> > --- a/lib/librte_cryptodev/rte_cryptodev.h
> > +++ b/lib/librte_cryptodev/rte_cryptodev.h
> > @@ -530,6 +530,32 @@ struct rte_cryptodev_qp_conf {  };
> >
> >  /**
> > + * Function type used for pre processing crypto ops when enqueue
> > +burst is
> > + * called.
> > + *
> > + * The callback function is called on enqueue burst immediately
> > + * before the crypto ops are put onto the hardware queue for processing.
> > + *
> > + * @param	dev_id		The identifier of the device.
> > + * @param	qp_id		The index of the queue pair in which ops are
> > + *				to be enqueued for processing. The value
> > + *				must be in the range [0, nb_queue_pairs - 1]
> > + *				previously supplied to
> > + *				*rte_cryptodev_configure*.
> > + * @param	ops		The address of an array of *nb_ops* pointers
> > + *				to *rte_crypto_op* structures which contain
> > + *				the crypto operations to be processed.
> > + * @param	nb_ops		The number of operations to process.
> > + * @param	user_param	The arbitrary user parameter passed in by the
> > + *				application when the callback was originally
> > + *				registered.
> > + * @return			The number of ops to be enqueued to the
> > + *				crypto device.
> > + */
> > +typedef uint16_t (*rte_cryptodev_enq_cb_fn)(uint16_t dev_id, uint16_t
> > qp_id,
> > +		struct rte_crypto_op **ops, uint16_t nb_ops, void
> > *user_param);
> > +
> > +/**
> >   * Typedef for application callback function to be registered by application
> >   * software for notification of device events
> >   *
> > @@ -853,7 +879,6 @@ struct rte_cryptodev_config {
> >  		enum rte_cryptodev_event_type event,
> >  		rte_cryptodev_cb_fn cb_fn, void *cb_arg);
> >
> > -
> >  typedef uint16_t (*dequeue_pkt_burst_t)(void *qp,
> >  		struct rte_crypto_op **ops,	uint16_t nb_ops);
> >  /**< Dequeue processed packets from queue pair of a device. */ @@
> > -870,6
> > +895,17 @@ typedef uint16_t (*enqueue_pkt_burst_t)(void *qp,
> >  /** Structure to keep track of registered callbacks */
> > TAILQ_HEAD(rte_cryptodev_cb_list, rte_cryptodev_callback);
> >
> > +/**
> > + * @internal
> > + * Structure used to hold information about the callbacks to be
> > +called for a
> > + * queue pair on enqueue.
> > + */
> > +struct rte_cryptodev_enq_callback {
> > +	struct rte_cryptodev_enq_callback *next;
> > +	rte_cryptodev_enq_cb_fn fn;
> > +	void *arg;
> > +};
> > +
> >  /** The data structure associated with each crypto device. */  struct
> > rte_cryptodev {
> >  	dequeue_pkt_burst_t dequeue_burst;
> > @@ -898,6 +934,14 @@ struct rte_cryptodev {
> >  	__extension__
> >  	uint8_t attached : 1;
> >  	/**< Flag indicating the device is attached */
> > +
> > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > +	struct rte_cryptodev_enq_callback **enq_cbs;
> > +	/**< User application callback for pre enqueue processing */
> > +
> > +	struct rte_rcu_qsbr *qsbr;
Would be good to move this out of RTE_CRYPTODEV_CALLBACKS so that it can be used for other features in the future.

> > +	/** < RCU QSBR variable for rte_cryptodev_enq_callback */ #endif
> >  } __rte_cache_aligned;
> >
> >  void *
> > @@ -1019,6 +1063,18 @@ struct rte_cryptodev_data {
> >  		struct rte_crypto_op **ops, uint16_t nb_ops)  {
> >  	struct rte_cryptodev *dev = &rte_cryptodevs[dev_id];
> > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > +	if (unlikely(dev->enq_cbs != NULL && dev->enq_cbs[qp_id] !=
> > NULL)) {
> > +		struct rte_cryptodev_enq_callback *cb =
> > +			dev->enq_cbs[qp_id];
> > +
> > +		do {
> > +			nb_ops = cb->fn(dev_id, qp_id, ops, nb_ops,
> > +					cb->arg);
> > +			cb = cb->next;
> > +		} while (cb != NULL);
> > +	}
> > +#endif
> >
> >  	rte_cryptodev_trace_enqueue_burst(dev_id, qp_id, (void **)ops,
> > nb_ops);
> >  	return (*dev->enqueue_burst)(
> > @@ -1351,6 +1407,102 @@ struct rte_cryptodev_asym_session *
> >  	struct rte_cryptodev_sym_session *sess, union rte_crypto_sym_ofs
> > ofs,
> >  	struct rte_crypto_sym_vec *vec);
> >
> > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * Add a user callback for a given crypto device and queue pair which
> > +will be
> > + * called on crypto ops enqueue.
> > + *
> > + * This API configures a function to be called for each burst of
> > +crypto ops
> > + * received on a given crypto device queue pair. The return value is
> > +a pointer
> > + * that can be used later to remove the callback using
> > + * rte_cryptodev_remove_enq_callback().
> > + *
> > + * Multiple functions are called in the order that they are added.
> > + *
> > + * @param	dev_id		The identifier of the device.
> > + * @param	qp_id		The index of the queue pair in which ops are
> > + *				to be enqueued for processing. The value
> > + *				must be in the range [0, nb_queue_pairs - 1]
> > + *				previously supplied to
> > + *				*rte_cryptodev_configure*.
> > + * @param	cb_fn		The callback function
> > + * @param	cb_arg		A generic pointer parameter which will be
> > passed
> > + *				to each invocation of the callback function on
> > + *				this crypto device and queue pair.
> > + *
> > + * @return
> > + *   NULL on error.
> > + *   On success, a pointer value which can later be used to remove the
> > callback.
> > + */
> > +
> > +__rte_experimental
> > +struct rte_cryptodev_enq_callback *
> > +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> > +			       uint16_t qp_id,
> > +			       rte_cryptodev_enq_cb_fn cb_fn,
> > +			       void *cb_arg);
> > +
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * Remove a user callback function for given crypto device and queue pair.
> > + *
> > + * This function is used to removed callbacks that were added to a
> > +crypto
> > + * device queue pair using rte_cryptodev_add_enq_callback().
> > + *
> > + * Note: The callback expects a RCU QSBR to be configured to
> > +synchronize
> > + * to free the memory. Application is expected to configure RCU QSBR
> > +after
> > + * adding an enqueue callback.
The application could configure RCU QSBR any time before 'rte_cryptodev_remove_enq_callback' is called. It has no relation to 'rte_cryptodev_add_enq_callback'

> > + *
> > + *
> > + * @param	dev_id		The identifier of the device.
> > + * @param	qp_id		The index of the queue pair in which ops are
> > + *				to be enqueued for processing. The value
> > + *				must be in the range [0, nb_queue_pairs - 1]
> > + *				previously supplied to
> > + *				*rte_cryptodev_configure*.
> > + * @param	cb		Pointer to user supplied callback created via
> > + *				rte_cryptodev_add_enq_callback().
> > + *
> > + * @return
> > + *   - 0: Success. Callback was removed.
> > + *   - -EINVAL:  The dev_id or the qp_id is out of range, or the callback
> > + *               is NULL or not found for the crypto device queue pair.
Add RCU variable not configured.

> > + */
> > +
> > +__rte_experimental
> > +int rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> > +				  uint16_t qp_id,
> > +				  struct rte_cryptodev_enq_callback *cb);
> > +
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * Associate RCU QSBR variable with a cryptodev.
> > + *
> > + * This function is used to add RCU QSBR to a crypto device.
> > + * The purpose of RCU is to help multiple threads to synchronize
> > + * with each other before initiating adding/removing callback
> > + * while dataplane threads are running enqueue callbacks.
How about the following to replace the last sentence above?
"RCU is used to safely reclaim the memory shared with the data plane threads."

> > + *
> > + * @param	dev_id		The identifier of the device.
> > + * @param	qsr		RCU QSBR configuration
                            ^^^ qsbr                                  ^^^^^^^^^^^ 'variable' is a better choice
> > + * @return
> > + *   On success - 0
> > + *   On error - EINVAL.
> > + */
> > +
> > +__rte_experimental
> > +int rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct rte_rcu_qsbr
> > +*qsbr); #endif
> > +
> >  #ifdef __cplusplus
> >  }
> >  #endif
> > diff --git a/lib/librte_cryptodev/rte_cryptodev_version.map
> > b/lib/librte_cryptodev/rte_cryptodev_version.map
> > index 02f6dcf..46de3ca 100644
> > --- a/lib/librte_cryptodev/rte_cryptodev_version.map
> > +++ b/lib/librte_cryptodev/rte_cryptodev_version.map
> > @@ -64,6 +64,7 @@ DPDK_20.0 {
> >  	rte_cryptodev_sym_capability_get;
> >  };
> >
> > +
> >  EXPERIMENTAL {
> >  	global:
> >
> > @@ -105,4 +106,9 @@ EXPERIMENTAL {
> >
> >  	# added in 20.08
> >  	rte_cryptodev_get_qp_status;
> > +
> > +	# added in 20.11
> > +	rte_cryptodev_add_enq_callback;
> > +	rte_cryptodev_remove_enq_callback;
> > +	rte_cryptodev_rcu_qsbr_add;
> >  };
> > --
> > 1.9.1
Abhinandan Gujjar Sept. 11, 2020, 8:33 a.m. UTC | #3
> -----Original Message-----
> From: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Sent: Thursday, September 10, 2020 4:19 AM
> To: Gujjar, Abhinandan S <abhinandan.gujjar@intel.com>; dev@dpdk.org
> Cc: Doherty, Declan <declan.doherty@intel.com>; jerinj@marvell.com;
> Akhil.goyal@nxp.com; Vangati, Narender <narender.vangati@intel.com>;
> Ananyev, Konstantin <konstantin.ananyev@intel.com>; nd <nd@arm.com>;
> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; nd
> <nd@arm.com>
> Subject: RE: [v2 1/2] cryptodev: support enqueue callback functions
> 
> <snip>
> 
> > >
> > > In an eventdev world, multiple workers (with ordered queue) will be
> > > working on IPsec ESP processing. The ESP header's sequence number is
> > > unique and has to be sequentially incremented in an orderly manner.
> > > This rises a need for incrementing sequence number in crypto stage
> > > especially in event crypto adapter. By adding a user callback to
> > > cryptodev at enqueue burst, the user callback will get executed in
> > > the context of event crypto adapter. This helps the application to
> > > increment the ESP sequence number atomically and orderly manner.
> > >
> > > This patch adds APIs to add/remove callback functions. The callback
> > > function will be called for each burst of crypto ops received on a
> > > given crypto device queue pair.
> > >
> > > v1->v2:
> > > Moved callback related members to the end of cryptodev struct Added
> > > support for RCU
> > >
> > > Signed-off-by: Abhinandan Gujjar <abhinandan.gujjar@intel.com>
> > > ---
> > >  config/common_base                             |   1 +
> > >  lib/librte_cryptodev/Makefile                  |   2 +-
> > >  lib/librte_cryptodev/rte_cryptodev.c           | 157
> > > +++++++++++++++++++++++++
> > >  lib/librte_cryptodev/rte_cryptodev.h           | 154
> > > +++++++++++++++++++++++-
> > >  lib/librte_cryptodev/rte_cryptodev_version.map |   6 +
> > >  5 files changed, 318 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/config/common_base b/config/common_base index
> > > fbf0ee7..f5ebde4 100644
> > > --- a/config/common_base
> > > +++ b/config/common_base
> > > @@ -599,6 +599,7 @@
> > > CONFIG_RTE_LIBRTE_PMD_BBDEV_FPGA_5GNR_FEC=y
> > >  #
> > >  CONFIG_RTE_LIBRTE_CRYPTODEV=y
> > >  CONFIG_RTE_CRYPTO_MAX_DEVS=64
> > > +CONFIG_RTE_CRYPTODEV_CALLBACKS=y
> > >
> > >  #
> > >  # Compile PMD for ARMv8 Crypto device diff --git
> > > a/lib/librte_cryptodev/Makefile b/lib/librte_cryptodev/Makefile
> > > index
> > > 73e77a2..514d552 100644
> > > --- a/lib/librte_cryptodev/Makefile
> > > +++ b/lib/librte_cryptodev/Makefile
> > > @@ -10,7 +10,7 @@ LIB = librte_cryptodev.a  CFLAGS += -O3  CFLAGS +=
> > > $(WERROR_FLAGS)  LDLIBS += -lrte_eal -lrte_mempool -lrte_ring
> > > -lrte_mbuf -LDLIBS += -lrte_kvargs
> > > +LDLIBS += -lrte_kvargs -lrte_rcu
> > >
> > >  # library source files
> > >  SRCS-y += rte_cryptodev.c rte_cryptodev_pmd.c
> > > cryptodev_trace_points.c diff --git
> > > a/lib/librte_cryptodev/rte_cryptodev.c
> > > b/lib/librte_cryptodev/rte_cryptodev.c
> > > index 1dd795b..2fb3e35 100644
> > > --- a/lib/librte_cryptodev/rte_cryptodev.c
> > > +++ b/lib/librte_cryptodev/rte_cryptodev.c
> > > @@ -38,6 +38,7 @@
> > >  #include <rte_string_fns.h>
> > >  #include <rte_compat.h>
> > >  #include <rte_function_versioning.h>
> > > +#include <rte_rcu_qsbr.h>
> > >
> > >  #include "rte_crypto.h"
> > >  #include "rte_cryptodev.h"
> > > @@ -499,6 +500,10 @@ struct
> > > rte_cryptodev_sym_session_pool_private_data {
> > >  	return 0;
> > >  }
> > >
> > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > +/* spinlock for crypto device enq callbacks */ static
> > > +rte_spinlock_t rte_cryptodev_enq_cb_lock =
> > > +RTE_SPINLOCK_INITIALIZER; #endif
> Are we claiming the APIs to be thread safe? Is the spinlock required?
Yes. E.g. without spinlock, two threads can still try to add callbacks simultaneously.  
> 
> > >
> > >  const char *
> > >  rte_cryptodev_get_feature_name(uint64_t flag) @@ -1449,6 +1454,158
> > @@
> > > struct rte_cryptodev *
> > >  	rte_spinlock_unlock(&rte_cryptodev_cb_lock);
> > >  }
> > >
> > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > +int
> > > +rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct rte_rcu_qsbr
> > > +*qsbr)
> Suggest moving this API outside of RTE_CRYPTODEV_CALLBACKS as it can be
> used for other RCU related features in the future.
ok
> 
> > > +{
> > > +
> > > +	struct rte_cryptodev *dev;
> > > +
> > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > +		return -EINVAL;
> > > +	}
> > > +
> > > +	dev = &rte_crypto_devices[dev_id];
> > > +	dev->qsbr = qsbr;
> > > +	return 0;
> > > +}
> > > +
> > > +struct rte_cryptodev_enq_callback *
> > > +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> > > +			       uint16_t qp_id,
> > > +			       rte_cryptodev_enq_cb_fn cb_fn,
> > > +			       void *cb_arg)
> > > +{
> > > +	struct rte_cryptodev *dev;
> > > +	struct rte_cryptodev_enq_callback *cb, *tail;
> > > +
> > > +	if (!cb_fn)
> > > +		return NULL;
> > > +
> > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > +		return NULL;
> > > +	}
> > > +
> > > +	dev = &rte_crypto_devices[dev_id];
> > > +	if (qp_id >= dev->data->nb_queue_pairs) {
> > > +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> > > +		return NULL;
> > > +	}
> > > +
> > > +	cb = rte_zmalloc(NULL, sizeof(*cb), 0);
> > > +	if (cb == NULL) {
> > > +		CDEV_LOG_ERR("Failed to allocate memory for callback on "
> > > +			     "dev=%d, queue_pair_id=%d", dev_id, qp_id);
> > > +		rte_errno = ENOMEM;
> > > +		return NULL;
> > > +	}
> > > +
> > > +	cb->fn = cb_fn;
> > > +	cb->arg = cb_arg;
> > > +
> > > +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> > > +	if (dev->enq_cbs == NULL) {
> > > +		dev->enq_cbs = rte_zmalloc(NULL, sizeof(cb) *
> > > +					   dev->data->nb_queue_pairs, 0);
> > > +		if (dev->enq_cbs == NULL) {
> > > +			CDEV_LOG_ERR("Failed to allocate memory for
> > > callbacks");
> > > +			rte_errno = ENOMEM;
> > > +			rte_free(cb);
> Add unlock here.
ok
> 
> > > +			return NULL;
> > > +		}
> > > +	}
> > > +
> > > +	/* Add the callbacks in fifo order. */
> > > +	tail = dev->enq_cbs[qp_id];
> > > +	if (tail) {
> > > +		while (tail->next)
> > > +			tail = tail->next;
> > > +		tail->next = cb;
> > > +	} else
> > > +		dev->enq_cbs[qp_id] = cb;
> > > +
> > > +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > > +
> > > +	return cb;
> > > +}
> > > +
> > > +int
> > > +rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> > > +				  uint16_t qp_id,
> > > +				  struct rte_cryptodev_enq_callback *cb) {
> > > +	struct rte_cryptodev *dev;
> > > +	struct rte_cryptodev_enq_callback **prev_cb, *curr_cb;
> > > +	uint16_t qp;
> > > +	int free_mem;
> > > +	int ret;
> > > +
> > > +	free_mem = 1;
> > > +
> > > +	if (!cb) {
> > > +		CDEV_LOG_ERR("cb is NULL");
> > > +		return -EINVAL;
> > > +	}
> > > +
> > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > +		return -EINVAL;
> > > +	}
> > > +
> > > +	dev = &rte_crypto_devices[dev_id];
> > > +	if (qp_id >= dev->data->nb_queue_pairs) {
> > > +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> > > +		return -EINVAL;
> > > +	}
> > > +
> > > +	if (!dev->qsbr) {
> > > +		CDEV_LOG_ERR("Rcu qsbr is NULL");
> > > +		return -EINVAL;
> > > +	}
> I see that this check is not done in 'rte_cryptodev_add_enq_callback'. I think
> this is ok, it avoids enforcing an order for API calls.
> 
> > > +
> > > +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> > > +	if (dev->enq_cbs == NULL) {
> > > +		rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > > +		return -EINVAL;
> > > +	}
> > > +
> > > +	prev_cb = &dev->enq_cbs[qp_id];
> > > +	for (; *prev_cb != NULL; prev_cb = &curr_cb->next) {
> > > +		curr_cb = *prev_cb;
> > > +		if (curr_cb == cb) {
> > > +			/* Remove the user cb from the callback list. */
> > > +			*prev_cb = curr_cb->next;
> > > +			ret = 0;
> > > +			break;
> > > +		}
> > > +	}
> > > +
> > > +	if (!ret) {
> > > +		/* Call sync with invalid thread id as this is part of
> > > +		 * control plane API */
> > > +		rte_rcu_qsbr_synchronize(dev->qsbr,
> > > +					 RTE_QSBR_THRID_INVALID);
> > > +		rte_free(cb);
> > > +	}
> > > +
> Nit, IMO, it is more readable if 'free_mem = 1' is moved here.
> 
> > > +	for (qp = 0; qp < dev->data->nb_queue_pairs; qp++)
> > > +		if (dev->enq_cbs[qp] != NULL) {
> > > +			free_mem = 0;
> > > +			break;
> > > +		}
> > > +
> > > +	if (free_mem) {
> > > +		rte_free(dev->enq_cbs);
> > > +		dev->enq_cbs = NULL;
> > > +	}
> > > +
> > > +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > > +
> > > +	return ret;
> > > +}
> > > +#endif
> > >
> > >  int
> > >  rte_cryptodev_sym_session_init(uint8_t dev_id, diff --git
> > > a/lib/librte_cryptodev/rte_cryptodev.h
> > > b/lib/librte_cryptodev/rte_cryptodev.h
> > > index 7b3ebc2..2c7a47b 100644
> > > --- a/lib/librte_cryptodev/rte_cryptodev.h
> > > +++ b/lib/librte_cryptodev/rte_cryptodev.h
> > > @@ -530,6 +530,32 @@ struct rte_cryptodev_qp_conf {  };
> > >
> > >  /**
> > > + * Function type used for pre processing crypto ops when enqueue
> > > +burst is
> > > + * called.
> > > + *
> > > + * The callback function is called on enqueue burst immediately
> > > + * before the crypto ops are put onto the hardware queue for
> processing.
> > > + *
> > > + * @param	dev_id		The identifier of the device.
> > > + * @param	qp_id		The index of the queue pair in which
> ops are
> > > + *				to be enqueued for processing. The value
> > > + *				must be in the range [0, nb_queue_pairs - 1]
> > > + *				previously supplied to
> > > + *				*rte_cryptodev_configure*.
> > > + * @param	ops		The address of an array of *nb_ops*
> pointers
> > > + *				to *rte_crypto_op* structures which contain
> > > + *				the crypto operations to be processed.
> > > + * @param	nb_ops		The number of operations to process.
> > > + * @param	user_param	The arbitrary user parameter passed
> in by the
> > > + *				application when the callback was originally
> > > + *				registered.
> > > + * @return			The number of ops to be enqueued
> to the
> > > + *				crypto device.
> > > + */
> > > +typedef uint16_t (*rte_cryptodev_enq_cb_fn)(uint16_t dev_id,
> > > +uint16_t
> > > qp_id,
> > > +		struct rte_crypto_op **ops, uint16_t nb_ops, void
> > > *user_param);
> > > +
> > > +/**
> > >   * Typedef for application callback function to be registered by
> application
> > >   * software for notification of device events
> > >   *
> > > @@ -853,7 +879,6 @@ struct rte_cryptodev_config {
> > >  		enum rte_cryptodev_event_type event,
> > >  		rte_cryptodev_cb_fn cb_fn, void *cb_arg);
> > >
> > > -
> > >  typedef uint16_t (*dequeue_pkt_burst_t)(void *qp,
> > >  		struct rte_crypto_op **ops,	uint16_t nb_ops);
> > >  /**< Dequeue processed packets from queue pair of a device. */ @@
> > > -870,6
> > > +895,17 @@ typedef uint16_t (*enqueue_pkt_burst_t)(void *qp,
> > >  /** Structure to keep track of registered callbacks */
> > > TAILQ_HEAD(rte_cryptodev_cb_list, rte_cryptodev_callback);
> > >
> > > +/**
> > > + * @internal
> > > + * Structure used to hold information about the callbacks to be
> > > +called for a
> > > + * queue pair on enqueue.
> > > + */
> > > +struct rte_cryptodev_enq_callback {
> > > +	struct rte_cryptodev_enq_callback *next;
> > > +	rte_cryptodev_enq_cb_fn fn;
> > > +	void *arg;
> > > +};
> > > +
> > >  /** The data structure associated with each crypto device. */
> > > struct rte_cryptodev {
> > >  	dequeue_pkt_burst_t dequeue_burst; @@ -898,6 +934,14 @@
> struct
> > > rte_cryptodev {
> > >  	__extension__
> > >  	uint8_t attached : 1;
> > >  	/**< Flag indicating the device is attached */
> > > +
> > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > +	struct rte_cryptodev_enq_callback **enq_cbs;
> > > +	/**< User application callback for pre enqueue processing */
> > > +
> > > +	struct rte_rcu_qsbr *qsbr;
> Would be good to move this out of RTE_CRYPTODEV_CALLBACKS so that it
> can be used for other features in the future.
ok
> 
> > > +	/** < RCU QSBR variable for rte_cryptodev_enq_callback */ #endif
> > >  } __rte_cache_aligned;
> > >
> > >  void *
> > > @@ -1019,6 +1063,18 @@ struct rte_cryptodev_data {
> > >  		struct rte_crypto_op **ops, uint16_t nb_ops)  {
> > >  	struct rte_cryptodev *dev = &rte_cryptodevs[dev_id];
> > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > +	if (unlikely(dev->enq_cbs != NULL && dev->enq_cbs[qp_id] !=
> > > NULL)) {
> > > +		struct rte_cryptodev_enq_callback *cb =
> > > +			dev->enq_cbs[qp_id];
> > > +
> > > +		do {
> > > +			nb_ops = cb->fn(dev_id, qp_id, ops, nb_ops,
> > > +					cb->arg);
> > > +			cb = cb->next;
> > > +		} while (cb != NULL);
> > > +	}
> > > +#endif
> > >
> > >  	rte_cryptodev_trace_enqueue_burst(dev_id, qp_id, (void **)ops,
> > > nb_ops);
> > >  	return (*dev->enqueue_burst)(
> > > @@ -1351,6 +1407,102 @@ struct rte_cryptodev_asym_session *
> > >  	struct rte_cryptodev_sym_session *sess, union rte_crypto_sym_ofs
> > > ofs,
> > >  	struct rte_crypto_sym_vec *vec);
> > >
> > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > + *
> > > + * Add a user callback for a given crypto device and queue pair
> > > +which will be
> > > + * called on crypto ops enqueue.
> > > + *
> > > + * This API configures a function to be called for each burst of
> > > +crypto ops
> > > + * received on a given crypto device queue pair. The return value
> > > +is a pointer
> > > + * that can be used later to remove the callback using
> > > + * rte_cryptodev_remove_enq_callback().
> > > + *
> > > + * Multiple functions are called in the order that they are added.
> > > + *
> > > + * @param	dev_id		The identifier of the device.
> > > + * @param	qp_id		The index of the queue pair in which
> ops are
> > > + *				to be enqueued for processing. The value
> > > + *				must be in the range [0, nb_queue_pairs - 1]
> > > + *				previously supplied to
> > > + *				*rte_cryptodev_configure*.
> > > + * @param	cb_fn		The callback function
> > > + * @param	cb_arg		A generic pointer parameter which
> will be
> > > passed
> > > + *				to each invocation of the callback function on
> > > + *				this crypto device and queue pair.
> > > + *
> > > + * @return
> > > + *   NULL on error.
> > > + *   On success, a pointer value which can later be used to remove the
> > > callback.
> > > + */
> > > +
> > > +__rte_experimental
> > > +struct rte_cryptodev_enq_callback *
> > > +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> > > +			       uint16_t qp_id,
> > > +			       rte_cryptodev_enq_cb_fn cb_fn,
> > > +			       void *cb_arg);
> > > +
> > > +
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > + *
> > > + * Remove a user callback function for given crypto device and queue
> pair.
> > > + *
> > > + * This function is used to removed callbacks that were added to a
> > > +crypto
> > > + * device queue pair using rte_cryptodev_add_enq_callback().
> > > + *
> > > + * Note: The callback expects a RCU QSBR to be configured to
> > > +synchronize
> > > + * to free the memory. Application is expected to configure RCU
> > > +QSBR after
> > > + * adding an enqueue callback.
> The application could configure RCU QSBR any time before
> 'rte_cryptodev_remove_enq_callback' is called. It has no relation to
> 'rte_cryptodev_add_enq_callback'
Ok. I will update it.
> 
> > > + *
> > > + *
> > > + * @param	dev_id		The identifier of the device.
> > > + * @param	qp_id		The index of the queue pair in which
> ops are
> > > + *				to be enqueued for processing. The value
> > > + *				must be in the range [0, nb_queue_pairs - 1]
> > > + *				previously supplied to
> > > + *				*rte_cryptodev_configure*.
> > > + * @param	cb		Pointer to user supplied callback
> created via
> > > + *				rte_cryptodev_add_enq_callback().
> > > + *
> > > + * @return
> > > + *   - 0: Success. Callback was removed.
> > > + *   - -EINVAL:  The dev_id or the qp_id is out of range, or the callback
> > > + *               is NULL or not found for the crypto device queue pair.
> Add RCU variable not configured.
ok
> 
> > > + */
> > > +
> > > +__rte_experimental
> > > +int rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> > > +				  uint16_t qp_id,
> > > +				  struct rte_cryptodev_enq_callback *cb);
> > > +
> > > +
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > + *
> > > + * Associate RCU QSBR variable with a cryptodev.
> > > + *
> > > + * This function is used to add RCU QSBR to a crypto device.
> > > + * The purpose of RCU is to help multiple threads to synchronize
> > > + * with each other before initiating adding/removing callback
> > > + * while dataplane threads are running enqueue callbacks.
> How about the following to replace the last sentence above?
> "RCU is used to safely reclaim the memory shared with the data plane
> threads."
Sure
> 
> > > + *
> > > + * @param	dev_id		The identifier of the device.
> > > + * @param	qsr		RCU QSBR configuration
>                             ^^^ qsbr                                  ^^^^^^^^^^^ 'variable' is a better
> choice
> > > + * @return
> > > + *   On success - 0
> > > + *   On error - EINVAL.
> > > + */
> > > +
> > > +__rte_experimental
> > > +int rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct rte_rcu_qsbr
> > > +*qsbr); #endif
> > > +
> > >  #ifdef __cplusplus
> > >  }
> > >  #endif
> > > diff --git a/lib/librte_cryptodev/rte_cryptodev_version.map
> > > b/lib/librte_cryptodev/rte_cryptodev_version.map
> > > index 02f6dcf..46de3ca 100644
> > > --- a/lib/librte_cryptodev/rte_cryptodev_version.map
> > > +++ b/lib/librte_cryptodev/rte_cryptodev_version.map
> > > @@ -64,6 +64,7 @@ DPDK_20.0 {
> > >  	rte_cryptodev_sym_capability_get;
> > >  };
> > >
> > > +
> > >  EXPERIMENTAL {
> > >  	global:
> > >
> > > @@ -105,4 +106,9 @@ EXPERIMENTAL {
> > >
> > >  	# added in 20.08
> > >  	rte_cryptodev_get_qp_status;
> > > +
> > > +	# added in 20.11
> > > +	rte_cryptodev_add_enq_callback;
> > > +	rte_cryptodev_remove_enq_callback;
> > > +	rte_cryptodev_rcu_qsbr_add;
> > >  };
> > > --
> > > 1.9.1
Ananyev, Konstantin Sept. 16, 2020, 1:39 p.m. UTC | #4
Hi Guijjar,

> 
> In an eventdev world, multiple workers (with ordered queue) will be
> working on IPsec ESP processing. The ESP header's sequence number is
> unique and has to be sequentially incremented in an orderly manner.
> This rises a need for incrementing sequence number in crypto stage
> especially in event crypto adapter. By adding a user callback to
> cryptodev at enqueue burst, the user callback will get executed
> in the context of event crypto adapter. This helps the application
> to increment the ESP sequence number atomically and orderly manner.
> 
> This patch adds APIs to add/remove callback functions. The callback
> function will be called for each burst of crypto ops received on a
> given crypto device queue pair.
> 
> v1->v2:
> Moved callback related members to the end of cryptodev struct
> Added support for RCU
> 
> Signed-off-by: Abhinandan Gujjar <abhinandan.gujjar@intel.com>
> ---
>  config/common_base                             |   1 +
>  lib/librte_cryptodev/Makefile                  |   2 +-
>  lib/librte_cryptodev/rte_cryptodev.c           | 157 +++++++++++++++++++++++++
>  lib/librte_cryptodev/rte_cryptodev.h           | 154 +++++++++++++++++++++++-
>  lib/librte_cryptodev/rte_cryptodev_version.map |   6 +
>  5 files changed, 318 insertions(+), 2 deletions(-)
> 
> diff --git a/config/common_base b/config/common_base
> index fbf0ee7..f5ebde4 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -599,6 +599,7 @@ CONFIG_RTE_LIBRTE_PMD_BBDEV_FPGA_5GNR_FEC=y
>  #
>  CONFIG_RTE_LIBRTE_CRYPTODEV=y
>  CONFIG_RTE_CRYPTO_MAX_DEVS=64
> +CONFIG_RTE_CRYPTODEV_CALLBACKS=y
> 
>  #
>  # Compile PMD for ARMv8 Crypto device
> diff --git a/lib/librte_cryptodev/Makefile b/lib/librte_cryptodev/Makefile
> index 73e77a2..514d552 100644
> --- a/lib/librte_cryptodev/Makefile
> +++ b/lib/librte_cryptodev/Makefile
> @@ -10,7 +10,7 @@ LIB = librte_cryptodev.a
>  CFLAGS += -O3
>  CFLAGS += $(WERROR_FLAGS)
>  LDLIBS += -lrte_eal -lrte_mempool -lrte_ring -lrte_mbuf
> -LDLIBS += -lrte_kvargs
> +LDLIBS += -lrte_kvargs -lrte_rcu
> 
>  # library source files
>  SRCS-y += rte_cryptodev.c rte_cryptodev_pmd.c cryptodev_trace_points.c
> diff --git a/lib/librte_cryptodev/rte_cryptodev.c b/lib/librte_cryptodev/rte_cryptodev.c
> index 1dd795b..2fb3e35 100644
> --- a/lib/librte_cryptodev/rte_cryptodev.c
> +++ b/lib/librte_cryptodev/rte_cryptodev.c
> @@ -38,6 +38,7 @@
>  #include <rte_string_fns.h>
>  #include <rte_compat.h>
>  #include <rte_function_versioning.h>
> +#include <rte_rcu_qsbr.h>
> 
>  #include "rte_crypto.h"
>  #include "rte_cryptodev.h"
> @@ -499,6 +500,10 @@ struct rte_cryptodev_sym_session_pool_private_data {
>  	return 0;
>  }
> 
> +#ifdef RTE_CRYPTODEV_CALLBACKS
> +/* spinlock for crypto device enq callbacks */
> +static rte_spinlock_t rte_cryptodev_enq_cb_lock = RTE_SPINLOCK_INITIALIZER;
> +#endif
> 
>  const char *
>  rte_cryptodev_get_feature_name(uint64_t flag)
> @@ -1449,6 +1454,158 @@ struct rte_cryptodev *
>  	rte_spinlock_unlock(&rte_cryptodev_cb_lock);
>  }
> 
> +#ifdef RTE_CRYPTODEV_CALLBACKS
> +int
> +rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct rte_rcu_qsbr *qsbr)
> +{
> +
> +	struct rte_cryptodev *dev;
> +
> +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> +		return -EINVAL;
> +	}
> +
> +	dev = &rte_crypto_devices[dev_id];
> +	dev->qsbr = qsbr;
> +	return 0;
> +}

So if I understand your patch correctly you propose a new working model
for crypto-devs:
1. Control-plane has to allocate/setup rcu_qsbr and do rte_cryptodev_rcu_qsbr_add().
2. Data-plane has somehow to obtain pointer to that rcu_qsbr and wrap cryptodev_enqueue()
   with rcu_qsbr_quiescent()  or rcu_qsbr_online()/rcu_qsbr_offline().

That seems quite a big change and I don't think it is acceptable for most users.
From my perspective adding/installing call-backs to the dev has to be opaque to the data-plane code.
Also note that different callbacks can be installed by different entities (libs) and might have no idea
about each other. 
That's why I thought it would be better to make all this RCU stuff internal inside cryptodev:
    hide all this rcu_qsbr allocation/setup inside cryptodev init/queue setup
    invoke rcu_qsbr_online()/rcu_qsbr_offline() inside cryptodev_enqueue().

> +
> +struct rte_cryptodev_enq_callback *
> +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> +			       uint16_t qp_id,
> +			       rte_cryptodev_enq_cb_fn cb_fn,
> +			       void *cb_arg)
> +{
> +	struct rte_cryptodev *dev;
> +	struct rte_cryptodev_enq_callback *cb, *tail;
> +
> +	if (!cb_fn)
> +		return NULL;
> +
> +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> +		return NULL;
> +	}
> +
> +	dev = &rte_crypto_devices[dev_id];
> +	if (qp_id >= dev->data->nb_queue_pairs) {
> +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> +		return NULL;
> +	}
> +
> +	cb = rte_zmalloc(NULL, sizeof(*cb), 0);
> +	if (cb == NULL) {
> +		CDEV_LOG_ERR("Failed to allocate memory for callback on "
> +			     "dev=%d, queue_pair_id=%d", dev_id, qp_id);
> +		rte_errno = ENOMEM;
> +		return NULL;
> +	}
> +
> +	cb->fn = cb_fn;
> +	cb->arg = cb_arg;
> +
> +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> +	if (dev->enq_cbs == NULL) {
> +		dev->enq_cbs = rte_zmalloc(NULL, sizeof(cb) *
> +					   dev->data->nb_queue_pairs, 0);
> +		if (dev->enq_cbs == NULL) {
> +			CDEV_LOG_ERR("Failed to allocate memory for callbacks");
> +			rte_errno = ENOMEM;
> +			rte_free(cb);
> +			return NULL;
> +		}
> +	}
> +
> +	/* Add the callbacks in fifo order. */
> +	tail = dev->enq_cbs[qp_id];
> +	if (tail) {
> +		while (tail->next)
> +			tail = tail->next;
> +		tail->next = cb;
> +	} else
> +		dev->enq_cbs[qp_id] = cb;
> +
> +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> +
> +	return cb;
> +}
> +
> +int
> +rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> +				  uint16_t qp_id,
> +				  struct rte_cryptodev_enq_callback *cb)
> +{
> +	struct rte_cryptodev *dev;
> +	struct rte_cryptodev_enq_callback **prev_cb, *curr_cb;
> +	uint16_t qp;
> +	int free_mem;
> +	int ret;
> +
> +	free_mem = 1;
> +
> +	if (!cb) {
> +		CDEV_LOG_ERR("cb is NULL");
> +		return -EINVAL;
> +	}
> +
> +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> +		return -EINVAL;
> +	}
> +
> +	dev = &rte_crypto_devices[dev_id];
> +	if (qp_id >= dev->data->nb_queue_pairs) {
> +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> +		return -EINVAL;
> +	}
> +
> +	if (!dev->qsbr) {
> +		CDEV_LOG_ERR("Rcu qsbr is NULL");
> +		return -EINVAL;
> +	}
> +
> +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> +	if (dev->enq_cbs == NULL) {
> +		rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> +		return -EINVAL;
> +	}
> +
> +	prev_cb = &dev->enq_cbs[qp_id];
> +	for (; *prev_cb != NULL; prev_cb = &curr_cb->next) {
> +		curr_cb = *prev_cb;
> +		if (curr_cb == cb) {
> +			/* Remove the user cb from the callback list. */
> +			*prev_cb = curr_cb->next;
> +			ret = 0;
> +			break;
> +		}
> +	}
> +
> +	if (!ret) {
> +		/* Call sync with invalid thread id as this is part of
> +		 * control plane API */
> +		rte_rcu_qsbr_synchronize(dev->qsbr,
> +					 RTE_QSBR_THRID_INVALID);
> +		rte_free(cb);
> +	}
> +
> +	for (qp = 0; qp < dev->data->nb_queue_pairs; qp++)
> +		if (dev->enq_cbs[qp] != NULL) {

Some reference count (number of callbacks) seems like a better approach here.

> +			free_mem = 0;
> +			break;
> +		}
> +
> +	if (free_mem) {
> +		rte_free(dev->enq_cbs);
> +		dev->enq_cbs = NULL;
> +	}
> +
> +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> +
> +	return ret;
> +}
> +#endif
> 
>  int
>  rte_cryptodev_sym_session_init(uint8_t dev_id,
> diff --git a/lib/librte_cryptodev/rte_cryptodev.h b/lib/librte_cryptodev/rte_cryptodev.h
> index 7b3ebc2..2c7a47b 100644
> --- a/lib/librte_cryptodev/rte_cryptodev.h
> +++ b/lib/librte_cryptodev/rte_cryptodev.h
> @@ -530,6 +530,32 @@ struct rte_cryptodev_qp_conf {
>  };
> 
>  /**
> + * Function type used for pre processing crypto ops when enqueue burst is
> + * called.
> + *
> + * The callback function is called on enqueue burst immediately
> + * before the crypto ops are put onto the hardware queue for processing.
> + *
> + * @param	dev_id		The identifier of the device.
> + * @param	qp_id		The index of the queue pair in which ops are
> + *				to be enqueued for processing. The value
> + *				must be in the range [0, nb_queue_pairs - 1]
> + *				previously supplied to
> + *				*rte_cryptodev_configure*.
> + * @param	ops		The address of an array of *nb_ops* pointers
> + *				to *rte_crypto_op* structures which contain
> + *				the crypto operations to be processed.
> + * @param	nb_ops		The number of operations to process.
> + * @param	user_param	The arbitrary user parameter passed in by the
> + *				application when the callback was originally
> + *				registered.
> + * @return			The number of ops to be enqueued to the
> + *				crypto device.
> + */
> +typedef uint16_t (*rte_cryptodev_enq_cb_fn)(uint16_t dev_id, uint16_t qp_id,
> +		struct rte_crypto_op **ops, uint16_t nb_ops, void *user_param);
> +
> +/**
>   * Typedef for application callback function to be registered by application
>   * software for notification of device events
>   *
> @@ -853,7 +879,6 @@ struct rte_cryptodev_config {
>  		enum rte_cryptodev_event_type event,
>  		rte_cryptodev_cb_fn cb_fn, void *cb_arg);
> 
> -
>  typedef uint16_t (*dequeue_pkt_burst_t)(void *qp,
>  		struct rte_crypto_op **ops,	uint16_t nb_ops);
>  /**< Dequeue processed packets from queue pair of a device. */
> @@ -870,6 +895,17 @@ typedef uint16_t (*enqueue_pkt_burst_t)(void *qp,
>  /** Structure to keep track of registered callbacks */
>  TAILQ_HEAD(rte_cryptodev_cb_list, rte_cryptodev_callback);
> 
> +/**
> + * @internal
> + * Structure used to hold information about the callbacks to be called for a
> + * queue pair on enqueue.
> + */
> +struct rte_cryptodev_enq_callback {
> +	struct rte_cryptodev_enq_callback *next;
> +	rte_cryptodev_enq_cb_fn fn;
> +	void *arg;
> +};
> +
>  /** The data structure associated with each crypto device. */
>  struct rte_cryptodev {
>  	dequeue_pkt_burst_t dequeue_burst;
> @@ -898,6 +934,14 @@ struct rte_cryptodev {
>  	__extension__
>  	uint8_t attached : 1;
>  	/**< Flag indicating the device is attached */
> +
> +#ifdef RTE_CRYPTODEV_CALLBACKS
> +	struct rte_cryptodev_enq_callback **enq_cbs;
> +	/**< User application callback for pre enqueue processing */
> +
> +	struct rte_rcu_qsbr *qsbr;
> +	/** < RCU QSBR variable for rte_cryptodev_enq_callback */

Probably better to have both these fields per queue.
Space for them can be allocated at dev_configure() or so. 
BTW, wouldn't it make sense to have ability to add callback for dequeue too?

> +#endif
>  } __rte_cache_aligned;
> 
>  void *
> @@ -1019,6 +1063,18 @@ struct rte_cryptodev_data {
>  		struct rte_crypto_op **ops, uint16_t nb_ops)
>  {
>  	struct rte_cryptodev *dev = &rte_cryptodevs[dev_id];
> +#ifdef RTE_CRYPTODEV_CALLBACKS
> +	if (unlikely(dev->enq_cbs != NULL && dev->enq_cbs[qp_id] != NULL)) {
> +		struct rte_cryptodev_enq_callback *cb =
> +			dev->enq_cbs[qp_id];
> +
> +		do {
> +			nb_ops = cb->fn(dev_id, qp_id, ops, nb_ops,
> +					cb->arg);
> +			cb = cb->next;
> +		} while (cb != NULL);
> +	}
> +#endif
> 
>  	rte_cryptodev_trace_enqueue_burst(dev_id, qp_id, (void **)ops, nb_ops);
>  	return (*dev->enqueue_burst)(
> @@ -1351,6 +1407,102 @@ struct rte_cryptodev_asym_session *
>  	struct rte_cryptodev_sym_session *sess, union rte_crypto_sym_ofs ofs,
>  	struct rte_crypto_sym_vec *vec);
> 
> +#ifdef RTE_CRYPTODEV_CALLBACKS
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Add a user callback for a given crypto device and queue pair which will be
> + * called on crypto ops enqueue.
> + *
> + * This API configures a function to be called for each burst of crypto ops
> + * received on a given crypto device queue pair. The return value is a pointer
> + * that can be used later to remove the callback using
> + * rte_cryptodev_remove_enq_callback().
> + *
> + * Multiple functions are called in the order that they are added.
> + *
> + * @param	dev_id		The identifier of the device.
> + * @param	qp_id		The index of the queue pair in which ops are
> + *				to be enqueued for processing. The value
> + *				must be in the range [0, nb_queue_pairs - 1]
> + *				previously supplied to
> + *				*rte_cryptodev_configure*.
> + * @param	cb_fn		The callback function
> + * @param	cb_arg		A generic pointer parameter which will be passed
> + *				to each invocation of the callback function on
> + *				this crypto device and queue pair.
> + *
> + * @return
> + *   NULL on error.
> + *   On success, a pointer value which can later be used to remove the callback.
> + */
> +
> +__rte_experimental
> +struct rte_cryptodev_enq_callback *
> +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> +			       uint16_t qp_id,
> +			       rte_cryptodev_enq_cb_fn cb_fn,
> +			       void *cb_arg);
> +
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Remove a user callback function for given crypto device and queue pair.
> + *
> + * This function is used to removed callbacks that were added to a crypto
> + * device queue pair using rte_cryptodev_add_enq_callback().
> + *
> + * Note: The callback expects a RCU QSBR to be configured to synchronize
> + * to free the memory. Application is expected to configure RCU QSBR after
> + * adding an enqueue callback.
> + *
> + *
> + * @param	dev_id		The identifier of the device.
> + * @param	qp_id		The index of the queue pair in which ops are
> + *				to be enqueued for processing. The value
> + *				must be in the range [0, nb_queue_pairs - 1]
> + *				previously supplied to
> + *				*rte_cryptodev_configure*.
> + * @param	cb		Pointer to user supplied callback created via
> + *				rte_cryptodev_add_enq_callback().
> + *
> + * @return
> + *   - 0: Success. Callback was removed.
> + *   - -EINVAL:  The dev_id or the qp_id is out of range, or the callback
> + *               is NULL or not found for the crypto device queue pair.
> + */
> +
> +__rte_experimental
> +int rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> +				  uint16_t qp_id,
> +				  struct rte_cryptodev_enq_callback *cb);
> +
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Associate RCU QSBR variable with a cryptodev.
> + *
> + * This function is used to add RCU QSBR to a crypto device.
> + * The purpose of RCU is to help multiple threads to synchronize
> + * with each other before initiating adding/removing callback
> + * while dataplane threads are running enqueue callbacks.
> + *
> + * @param	dev_id		The identifier of the device.
> + * @param	qsr		RCU QSBR configuration
> + * @return
> + *   On success - 0
> + *   On error - EINVAL.
> + */
> +
> +__rte_experimental
> +int rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct rte_rcu_qsbr *qsbr);
> +#endif
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_cryptodev/rte_cryptodev_version.map b/lib/librte_cryptodev/rte_cryptodev_version.map
> index 02f6dcf..46de3ca 100644
> --- a/lib/librte_cryptodev/rte_cryptodev_version.map
> +++ b/lib/librte_cryptodev/rte_cryptodev_version.map
> @@ -64,6 +64,7 @@ DPDK_20.0 {
>  	rte_cryptodev_sym_capability_get;
>  };
> 
> +
>  EXPERIMENTAL {
>  	global:
> 
> @@ -105,4 +106,9 @@ EXPERIMENTAL {
> 
>  	# added in 20.08
>  	rte_cryptodev_get_qp_status;
> +
> +	# added in 20.11
> +	rte_cryptodev_add_enq_callback;
> +	rte_cryptodev_remove_enq_callback;
> +	rte_cryptodev_rcu_qsbr_add;
>  };
> --
> 1.9.1
Abhinandan Gujjar Sept. 16, 2020, 3:17 p.m. UTC | #5
Hi Konstantin,

Please find some comments inline.

> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Wednesday, September 16, 2020 7:10 PM
> To: Gujjar, Abhinandan S <abhinandan.gujjar@intel.com>; dev@dpdk.org
> Cc: Doherty, Declan <declan.doherty@intel.com>; jerinj@marvell.com;
> akhil.goyal@nxp.com; Vangati, Narender <narender.vangati@intel.com>;
> Gujjar, Abhinandan S <abhinandan.gujjar@intel.com>
> Subject: RE: [dpdk-dev] [v2 1/2] cryptodev: support enqueue callback
> functions
> 
> Hi Guijjar,
> 
> >
> > In an eventdev world, multiple workers (with ordered queue) will be
> > working on IPsec ESP processing. The ESP header's sequence number is
> > unique and has to be sequentially incremented in an orderly manner.
> > This rises a need for incrementing sequence number in crypto stage
> > especially in event crypto adapter. By adding a user callback to
> > cryptodev at enqueue burst, the user callback will get executed in the
> > context of event crypto adapter. This helps the application to
> > increment the ESP sequence number atomically and orderly manner.
> >
> > This patch adds APIs to add/remove callback functions. The callback
> > function will be called for each burst of crypto ops received on a
> > given crypto device queue pair.
> >
> > v1->v2:
> > Moved callback related members to the end of cryptodev struct Added
> > support for RCU
> >
> > Signed-off-by: Abhinandan Gujjar <abhinandan.gujjar@intel.com>
> > ---
> >  config/common_base                             |   1 +
> >  lib/librte_cryptodev/Makefile                  |   2 +-
> >  lib/librte_cryptodev/rte_cryptodev.c           | 157
> +++++++++++++++++++++++++
> >  lib/librte_cryptodev/rte_cryptodev.h           | 154
> +++++++++++++++++++++++-
> >  lib/librte_cryptodev/rte_cryptodev_version.map |   6 +
> >  5 files changed, 318 insertions(+), 2 deletions(-)
> >
> > diff --git a/config/common_base b/config/common_base index
> > fbf0ee7..f5ebde4 100644
> > --- a/config/common_base
> > +++ b/config/common_base
> > @@ -599,6 +599,7 @@
> CONFIG_RTE_LIBRTE_PMD_BBDEV_FPGA_5GNR_FEC=y
> >  #
> >  CONFIG_RTE_LIBRTE_CRYPTODEV=y
> >  CONFIG_RTE_CRYPTO_MAX_DEVS=64
> > +CONFIG_RTE_CRYPTODEV_CALLBACKS=y
> >
> >  #
> >  # Compile PMD for ARMv8 Crypto device diff --git
> > a/lib/librte_cryptodev/Makefile b/lib/librte_cryptodev/Makefile index
> > 73e77a2..514d552 100644
> > --- a/lib/librte_cryptodev/Makefile
> > +++ b/lib/librte_cryptodev/Makefile
> > @@ -10,7 +10,7 @@ LIB = librte_cryptodev.a  CFLAGS += -O3  CFLAGS +=
> > $(WERROR_FLAGS)  LDLIBS += -lrte_eal -lrte_mempool -lrte_ring
> > -lrte_mbuf -LDLIBS += -lrte_kvargs
> > +LDLIBS += -lrte_kvargs -lrte_rcu
> >
> >  # library source files
> >  SRCS-y += rte_cryptodev.c rte_cryptodev_pmd.c
> > cryptodev_trace_points.c diff --git
> > a/lib/librte_cryptodev/rte_cryptodev.c
> > b/lib/librte_cryptodev/rte_cryptodev.c
> > index 1dd795b..2fb3e35 100644
> > --- a/lib/librte_cryptodev/rte_cryptodev.c
> > +++ b/lib/librte_cryptodev/rte_cryptodev.c
> > @@ -38,6 +38,7 @@
> >  #include <rte_string_fns.h>
> >  #include <rte_compat.h>
> >  #include <rte_function_versioning.h>
> > +#include <rte_rcu_qsbr.h>
> >
> >  #include "rte_crypto.h"
> >  #include "rte_cryptodev.h"
> > @@ -499,6 +500,10 @@ struct
> rte_cryptodev_sym_session_pool_private_data {
> >  	return 0;
> >  }
> >
> > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > +/* spinlock for crypto device enq callbacks */ static rte_spinlock_t
> > +rte_cryptodev_enq_cb_lock = RTE_SPINLOCK_INITIALIZER; #endif
> >
> >  const char *
> >  rte_cryptodev_get_feature_name(uint64_t flag) @@ -1449,6 +1454,158
> @@
> > struct rte_cryptodev *
> >  	rte_spinlock_unlock(&rte_cryptodev_cb_lock);
> >  }
> >
> > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > +int
> > +rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct rte_rcu_qsbr *qsbr)
> > +{
> > +
> > +	struct rte_cryptodev *dev;
> > +
> > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > +		return -EINVAL;
> > +	}
> > +
> > +	dev = &rte_crypto_devices[dev_id];
> > +	dev->qsbr = qsbr;
> > +	return 0;
> > +}
> 
> So if I understand your patch correctly you propose a new working model for
> crypto-devs:
> 1. Control-plane has to allocate/setup rcu_qsbr and do
> rte_cryptodev_rcu_qsbr_add().
> 2. Data-plane has somehow to obtain pointer to that rcu_qsbr and wrap
> cryptodev_enqueue()
>    with rcu_qsbr_quiescent()  or rcu_qsbr_online()/rcu_qsbr_offline().
Yes. I think, it is not a new model. It is same as RCU integration with LPM.
Please refer: https://patches.dpdk.org/cover/73673/

> 
> That seems quite a big change and I don't think it is acceptable for most
> users.
> From my perspective adding/installing call-backs to the dev has to be opaque
> to the data-plane code.
> Also note that different callbacks can be installed by different entities (libs)
> and might have no idea about each other.
> That's why I thought it would be better to make all this RCU stuff internal
> inside cryptodev:
>     hide all this rcu_qsbr allocation/setup inside cryptodev init/queue setup
>     invoke rcu_qsbr_online()/rcu_qsbr_offline() inside cryptodev_enqueue().
I have already tried exploring above stuffs. There are too many constraints.
The changes don't fit in, as per RCU design.
Moreover, having rcu api under enqueue_burst() will affect the performance.
The changes are more on control plane side, which is one time.
The data plane changes are minimal.

> 
> > +
> > +struct rte_cryptodev_enq_callback *
> > +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> > +			       uint16_t qp_id,
> > +			       rte_cryptodev_enq_cb_fn cb_fn,
> > +			       void *cb_arg)
> > +{
> > +	struct rte_cryptodev *dev;
> > +	struct rte_cryptodev_enq_callback *cb, *tail;
> > +
> > +	if (!cb_fn)
> > +		return NULL;
> > +
> > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > +		return NULL;
> > +	}
> > +
> > +	dev = &rte_crypto_devices[dev_id];
> > +	if (qp_id >= dev->data->nb_queue_pairs) {
> > +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> > +		return NULL;
> > +	}
> > +
> > +	cb = rte_zmalloc(NULL, sizeof(*cb), 0);
> > +	if (cb == NULL) {
> > +		CDEV_LOG_ERR("Failed to allocate memory for callback on "
> > +			     "dev=%d, queue_pair_id=%d", dev_id, qp_id);
> > +		rte_errno = ENOMEM;
> > +		return NULL;
> > +	}
> > +
> > +	cb->fn = cb_fn;
> > +	cb->arg = cb_arg;
> > +
> > +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> > +	if (dev->enq_cbs == NULL) {
> > +		dev->enq_cbs = rte_zmalloc(NULL, sizeof(cb) *
> > +					   dev->data->nb_queue_pairs, 0);
> > +		if (dev->enq_cbs == NULL) {
> > +			CDEV_LOG_ERR("Failed to allocate memory for
> callbacks");
> > +			rte_errno = ENOMEM;
> > +			rte_free(cb);
> > +			return NULL;
> > +		}
> > +	}
> > +
> > +	/* Add the callbacks in fifo order. */
> > +	tail = dev->enq_cbs[qp_id];
> > +	if (tail) {
> > +		while (tail->next)
> > +			tail = tail->next;
> > +		tail->next = cb;
> > +	} else
> > +		dev->enq_cbs[qp_id] = cb;
> > +
> > +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > +
> > +	return cb;
> > +}
> > +
> > +int
> > +rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> > +				  uint16_t qp_id,
> > +				  struct rte_cryptodev_enq_callback *cb) {
> > +	struct rte_cryptodev *dev;
> > +	struct rte_cryptodev_enq_callback **prev_cb, *curr_cb;
> > +	uint16_t qp;
> > +	int free_mem;
> > +	int ret;
> > +
> > +	free_mem = 1;
> > +
> > +	if (!cb) {
> > +		CDEV_LOG_ERR("cb is NULL");
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > +		return -EINVAL;
> > +	}
> > +
> > +	dev = &rte_crypto_devices[dev_id];
> > +	if (qp_id >= dev->data->nb_queue_pairs) {
> > +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> > +		return -EINVAL;
> > +	}
> > +
> > +	if (!dev->qsbr) {
> > +		CDEV_LOG_ERR("Rcu qsbr is NULL");
> > +		return -EINVAL;
> > +	}
> > +
> > +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> > +	if (dev->enq_cbs == NULL) {
> > +		rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > +		return -EINVAL;
> > +	}
> > +
> > +	prev_cb = &dev->enq_cbs[qp_id];
> > +	for (; *prev_cb != NULL; prev_cb = &curr_cb->next) {
> > +		curr_cb = *prev_cb;
> > +		if (curr_cb == cb) {
> > +			/* Remove the user cb from the callback list. */
> > +			*prev_cb = curr_cb->next;
> > +			ret = 0;
> > +			break;
> > +		}
> > +	}
> > +
> > +	if (!ret) {
> > +		/* Call sync with invalid thread id as this is part of
> > +		 * control plane API */
> > +		rte_rcu_qsbr_synchronize(dev->qsbr,
> > +					 RTE_QSBR_THRID_INVALID);
> > +		rte_free(cb);
> > +	}
> > +
> > +	for (qp = 0; qp < dev->data->nb_queue_pairs; qp++)
> > +		if (dev->enq_cbs[qp] != NULL) {
> 
> Some reference count (number of callbacks) seems like a better approach
> here.
Ok.
> 
> > +			free_mem = 0;
> > +			break;
> > +		}
> > +
> > +	if (free_mem) {
> > +		rte_free(dev->enq_cbs);
> > +		dev->enq_cbs = NULL;
> > +	}
> > +
> > +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > +
> > +	return ret;
> > +}
> > +#endif
> >
> >  int
> >  rte_cryptodev_sym_session_init(uint8_t dev_id, diff --git
> > a/lib/librte_cryptodev/rte_cryptodev.h
> > b/lib/librte_cryptodev/rte_cryptodev.h
> > index 7b3ebc2..2c7a47b 100644
> > --- a/lib/librte_cryptodev/rte_cryptodev.h
> > +++ b/lib/librte_cryptodev/rte_cryptodev.h
> > @@ -530,6 +530,32 @@ struct rte_cryptodev_qp_conf {  };
> >
> >  /**
> > + * Function type used for pre processing crypto ops when enqueue
> > +burst is
> > + * called.
> > + *
> > + * The callback function is called on enqueue burst immediately
> > + * before the crypto ops are put onto the hardware queue for processing.
> > + *
> > + * @param	dev_id		The identifier of the device.
> > + * @param	qp_id		The index of the queue pair in which ops are
> > + *				to be enqueued for processing. The value
> > + *				must be in the range [0, nb_queue_pairs - 1]
> > + *				previously supplied to
> > + *				*rte_cryptodev_configure*.
> > + * @param	ops		The address of an array of *nb_ops* pointers
> > + *				to *rte_crypto_op* structures which contain
> > + *				the crypto operations to be processed.
> > + * @param	nb_ops		The number of operations to process.
> > + * @param	user_param	The arbitrary user parameter passed in by the
> > + *				application when the callback was originally
> > + *				registered.
> > + * @return			The number of ops to be enqueued to the
> > + *				crypto device.
> > + */
> > +typedef uint16_t (*rte_cryptodev_enq_cb_fn)(uint16_t dev_id, uint16_t
> qp_id,
> > +		struct rte_crypto_op **ops, uint16_t nb_ops, void
> *user_param);
> > +
> > +/**
> >   * Typedef for application callback function to be registered by application
> >   * software for notification of device events
> >   *
> > @@ -853,7 +879,6 @@ struct rte_cryptodev_config {
> >  		enum rte_cryptodev_event_type event,
> >  		rte_cryptodev_cb_fn cb_fn, void *cb_arg);
> >
> > -
> >  typedef uint16_t (*dequeue_pkt_burst_t)(void *qp,
> >  		struct rte_crypto_op **ops,	uint16_t nb_ops);
> >  /**< Dequeue processed packets from queue pair of a device. */ @@
> > -870,6 +895,17 @@ typedef uint16_t (*enqueue_pkt_burst_t)(void *qp,
> >  /** Structure to keep track of registered callbacks */
> > TAILQ_HEAD(rte_cryptodev_cb_list, rte_cryptodev_callback);
> >
> > +/**
> > + * @internal
> > + * Structure used to hold information about the callbacks to be
> > +called for a
> > + * queue pair on enqueue.
> > + */
> > +struct rte_cryptodev_enq_callback {
> > +	struct rte_cryptodev_enq_callback *next;
> > +	rte_cryptodev_enq_cb_fn fn;
> > +	void *arg;
> > +};
> > +
> >  /** The data structure associated with each crypto device. */  struct
> > rte_cryptodev {
> >  	dequeue_pkt_burst_t dequeue_burst;
> > @@ -898,6 +934,14 @@ struct rte_cryptodev {
> >  	__extension__
> >  	uint8_t attached : 1;
> >  	/**< Flag indicating the device is attached */
> > +
> > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > +	struct rte_cryptodev_enq_callback **enq_cbs;
> > +	/**< User application callback for pre enqueue processing */
> > +
> > +	struct rte_rcu_qsbr *qsbr;
> > +	/** < RCU QSBR variable for rte_cryptodev_enq_callback */
> 
> Probably better to have both these fields per queue.
> Space for them can be allocated at dev_configure() or so.
enq_cbs is allocated during callback add.
Unlike ethdev, each cryptodev have their own max queue pair. There is no macro for that.
I think, single RCU should be good enough, as it has mechanism to track all its reporting threads.

> BTW, wouldn't it make sense to have ability to add callback for dequeue too?
As mentioned in the commit message, this patch was driven by a requirement.
If required, callback for the dequeue can be added in a separate patch. 
> 
> > +#endif
> >  } __rte_cache_aligned;
> >
> >  void *
> > @@ -1019,6 +1063,18 @@ struct rte_cryptodev_data {
> >  		struct rte_crypto_op **ops, uint16_t nb_ops)  {
> >  	struct rte_cryptodev *dev = &rte_cryptodevs[dev_id];
> > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > +	if (unlikely(dev->enq_cbs != NULL && dev->enq_cbs[qp_id] !=
> NULL)) {
> > +		struct rte_cryptodev_enq_callback *cb =
> > +			dev->enq_cbs[qp_id];
> > +
> > +		do {
> > +			nb_ops = cb->fn(dev_id, qp_id, ops, nb_ops,
> > +					cb->arg);
> > +			cb = cb->next;
> > +		} while (cb != NULL);
> > +	}
> > +#endif
> >
> >  	rte_cryptodev_trace_enqueue_burst(dev_id, qp_id, (void **)ops,
> nb_ops);
> >  	return (*dev->enqueue_burst)(
> > @@ -1351,6 +1407,102 @@ struct rte_cryptodev_asym_session *
> >  	struct rte_cryptodev_sym_session *sess, union rte_crypto_sym_ofs
> ofs,
> >  	struct rte_crypto_sym_vec *vec);
> >
> > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * Add a user callback for a given crypto device and queue pair which
> > +will be
> > + * called on crypto ops enqueue.
> > + *
> > + * This API configures a function to be called for each burst of
> > +crypto ops
> > + * received on a given crypto device queue pair. The return value is
> > +a pointer
> > + * that can be used later to remove the callback using
> > + * rte_cryptodev_remove_enq_callback().
> > + *
> > + * Multiple functions are called in the order that they are added.
> > + *
> > + * @param	dev_id		The identifier of the device.
> > + * @param	qp_id		The index of the queue pair in which ops are
> > + *				to be enqueued for processing. The value
> > + *				must be in the range [0, nb_queue_pairs - 1]
> > + *				previously supplied to
> > + *				*rte_cryptodev_configure*.
> > + * @param	cb_fn		The callback function
> > + * @param	cb_arg		A generic pointer parameter which will be
> passed
> > + *				to each invocation of the callback function on
> > + *				this crypto device and queue pair.
> > + *
> > + * @return
> > + *   NULL on error.
> > + *   On success, a pointer value which can later be used to remove the
> callback.
> > + */
> > +
> > +__rte_experimental
> > +struct rte_cryptodev_enq_callback *
> > +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> > +			       uint16_t qp_id,
> > +			       rte_cryptodev_enq_cb_fn cb_fn,
> > +			       void *cb_arg);
> > +
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * Remove a user callback function for given crypto device and queue pair.
> > + *
> > + * This function is used to removed callbacks that were added to a
> > +crypto
> > + * device queue pair using rte_cryptodev_add_enq_callback().
> > + *
> > + * Note: The callback expects a RCU QSBR to be configured to
> > +synchronize
> > + * to free the memory. Application is expected to configure RCU QSBR
> > +after
> > + * adding an enqueue callback.
> > + *
> > + *
> > + * @param	dev_id		The identifier of the device.
> > + * @param	qp_id		The index of the queue pair in which ops are
> > + *				to be enqueued for processing. The value
> > + *				must be in the range [0, nb_queue_pairs - 1]
> > + *				previously supplied to
> > + *				*rte_cryptodev_configure*.
> > + * @param	cb		Pointer to user supplied callback created via
> > + *				rte_cryptodev_add_enq_callback().
> > + *
> > + * @return
> > + *   - 0: Success. Callback was removed.
> > + *   - -EINVAL:  The dev_id or the qp_id is out of range, or the callback
> > + *               is NULL or not found for the crypto device queue pair.
> > + */
> > +
> > +__rte_experimental
> > +int rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> > +				  uint16_t qp_id,
> > +				  struct rte_cryptodev_enq_callback *cb);
> > +
> > +
> > +/**
> > + * @warning
> > + * @b EXPERIMENTAL: this API may change without prior notice
> > + *
> > + * Associate RCU QSBR variable with a cryptodev.
> > + *
> > + * This function is used to add RCU QSBR to a crypto device.
> > + * The purpose of RCU is to help multiple threads to synchronize
> > + * with each other before initiating adding/removing callback
> > + * while dataplane threads are running enqueue callbacks.
> > + *
> > + * @param	dev_id		The identifier of the device.
> > + * @param	qsr		RCU QSBR configuration
> > + * @return
> > + *   On success - 0
> > + *   On error - EINVAL.
> > + */
> > +
> > +__rte_experimental
> > +int rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct rte_rcu_qsbr
> > +*qsbr); #endif
> > +
> >  #ifdef __cplusplus
> >  }
> >  #endif
> > diff --git a/lib/librte_cryptodev/rte_cryptodev_version.map
> > b/lib/librte_cryptodev/rte_cryptodev_version.map
> > index 02f6dcf..46de3ca 100644
> > --- a/lib/librte_cryptodev/rte_cryptodev_version.map
> > +++ b/lib/librte_cryptodev/rte_cryptodev_version.map
> > @@ -64,6 +64,7 @@ DPDK_20.0 {
> >  	rte_cryptodev_sym_capability_get;
> >  };
> >
> > +
> >  EXPERIMENTAL {
> >  	global:
> >
> > @@ -105,4 +106,9 @@ EXPERIMENTAL {
> >
> >  	# added in 20.08
> >  	rte_cryptodev_get_qp_status;
> > +
> > +	# added in 20.11
> > +	rte_cryptodev_add_enq_callback;
> > +	rte_cryptodev_remove_enq_callback;
> > +	rte_cryptodev_rcu_qsbr_add;
> >  };
> > --
> > 1.9.1
Ananyev, Konstantin Sept. 17, 2020, 1:24 p.m. UTC | #6
> > >
> > > In an eventdev world, multiple workers (with ordered queue) will be
> > > working on IPsec ESP processing. The ESP header's sequence number is
> > > unique and has to be sequentially incremented in an orderly manner.
> > > This rises a need for incrementing sequence number in crypto stage
> > > especially in event crypto adapter. By adding a user callback to
> > > cryptodev at enqueue burst, the user callback will get executed in the
> > > context of event crypto adapter. This helps the application to
> > > increment the ESP sequence number atomically and orderly manner.
> > >
> > > This patch adds APIs to add/remove callback functions. The callback
> > > function will be called for each burst of crypto ops received on a
> > > given crypto device queue pair.
> > >
> > > v1->v2:
> > > Moved callback related members to the end of cryptodev struct Added
> > > support for RCU
> > >
> > > Signed-off-by: Abhinandan Gujjar <abhinandan.gujjar@intel.com>
> > > ---
> > >  config/common_base                             |   1 +
> > >  lib/librte_cryptodev/Makefile                  |   2 +-
> > >  lib/librte_cryptodev/rte_cryptodev.c           | 157
> > +++++++++++++++++++++++++
> > >  lib/librte_cryptodev/rte_cryptodev.h           | 154
> > +++++++++++++++++++++++-
> > >  lib/librte_cryptodev/rte_cryptodev_version.map |   6 +
> > >  5 files changed, 318 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/config/common_base b/config/common_base index
> > > fbf0ee7..f5ebde4 100644
> > > --- a/config/common_base
> > > +++ b/config/common_base
> > > @@ -599,6 +599,7 @@
> > CONFIG_RTE_LIBRTE_PMD_BBDEV_FPGA_5GNR_FEC=y
> > >  #
> > >  CONFIG_RTE_LIBRTE_CRYPTODEV=y
> > >  CONFIG_RTE_CRYPTO_MAX_DEVS=64
> > > +CONFIG_RTE_CRYPTODEV_CALLBACKS=y
> > >
> > >  #
> > >  # Compile PMD for ARMv8 Crypto device diff --git
> > > a/lib/librte_cryptodev/Makefile b/lib/librte_cryptodev/Makefile index
> > > 73e77a2..514d552 100644
> > > --- a/lib/librte_cryptodev/Makefile
> > > +++ b/lib/librte_cryptodev/Makefile
> > > @@ -10,7 +10,7 @@ LIB = librte_cryptodev.a  CFLAGS += -O3  CFLAGS +=
> > > $(WERROR_FLAGS)  LDLIBS += -lrte_eal -lrte_mempool -lrte_ring
> > > -lrte_mbuf -LDLIBS += -lrte_kvargs
> > > +LDLIBS += -lrte_kvargs -lrte_rcu
> > >
> > >  # library source files
> > >  SRCS-y += rte_cryptodev.c rte_cryptodev_pmd.c
> > > cryptodev_trace_points.c diff --git
> > > a/lib/librte_cryptodev/rte_cryptodev.c
> > > b/lib/librte_cryptodev/rte_cryptodev.c
> > > index 1dd795b..2fb3e35 100644
> > > --- a/lib/librte_cryptodev/rte_cryptodev.c
> > > +++ b/lib/librte_cryptodev/rte_cryptodev.c
> > > @@ -38,6 +38,7 @@
> > >  #include <rte_string_fns.h>
> > >  #include <rte_compat.h>
> > >  #include <rte_function_versioning.h>
> > > +#include <rte_rcu_qsbr.h>
> > >
> > >  #include "rte_crypto.h"
> > >  #include "rte_cryptodev.h"
> > > @@ -499,6 +500,10 @@ struct
> > rte_cryptodev_sym_session_pool_private_data {
> > >  	return 0;
> > >  }
> > >
> > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > +/* spinlock for crypto device enq callbacks */ static rte_spinlock_t
> > > +rte_cryptodev_enq_cb_lock = RTE_SPINLOCK_INITIALIZER; #endif
> > >
> > >  const char *
> > >  rte_cryptodev_get_feature_name(uint64_t flag) @@ -1449,6 +1454,158
> > @@
> > > struct rte_cryptodev *
> > >  	rte_spinlock_unlock(&rte_cryptodev_cb_lock);
> > >  }
> > >
> > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > +int
> > > +rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct rte_rcu_qsbr *qsbr)
> > > +{
> > > +
> > > +	struct rte_cryptodev *dev;
> > > +
> > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > +		return -EINVAL;
> > > +	}
> > > +
> > > +	dev = &rte_crypto_devices[dev_id];
> > > +	dev->qsbr = qsbr;
> > > +	return 0;
> > > +}
> >
> > So if I understand your patch correctly you propose a new working model for
> > crypto-devs:
> > 1. Control-plane has to allocate/setup rcu_qsbr and do
> > rte_cryptodev_rcu_qsbr_add().
> > 2. Data-plane has somehow to obtain pointer to that rcu_qsbr and wrap
> > cryptodev_enqueue()
> >    with rcu_qsbr_quiescent()  or rcu_qsbr_online()/rcu_qsbr_offline().
> Yes. I think, it is not a new model. It is same as RCU integration with LPM.
> Please refer: https://patches.dpdk.org/cover/73673/

I am talking about new working model for crypto-dev enqueue/dequeue.
As I said above now it becomes data-plane thread responsibility to:
 -somehow to obtain pointer to that rcu_qsbr for each cryptodev it is using.
 -call rcu sync functions (quiescent/online/offline) on a regular basis.
Note that now data-plane thread would have to do that always -
even if there are now callbacks installed for that cryptodev queue right now.  
All that changes behaviour of existing apps and I presume would reduce
adoption of  that fature.  
I still think all this callback mechanism should be totally opaque to data-plane
threads - user shouldn't change his app code depending on would some
enqueue/dequeue callbacks be installed or not.

> 
> >
> > That seems quite a big change and I don't think it is acceptable for most
> > users.
> > From my perspective adding/installing call-backs to the dev has to be opaque
> > to the data-plane code.
> > Also note that different callbacks can be installed by different entities (libs)
> > and might have no idea about each other.
> > That's why I thought it would be better to make all this RCU stuff internal
> > inside cryptodev:
> >     hide all this rcu_qsbr allocation/setup inside cryptod somehow to obtain pointer to that rcu_qsbr ev init/queue setup
> >     invoke rcu_qsbr_online()/rcu_qsbr_offline() inside cryptodev_enqueue().
> I have already tried exploring above stuffs. There are too many constraints.
> The changes don't fit in, as per RCU design.

Hmm could you be more specific here - what constraints are you referring to?

> Moreover, having rcu api under enqueue_burst() will affect the performance.

It most likely will. Though my expectation it will affect performance only when
some callbacks are installed. My thought here:
callback function by itself will affect cryptdev_enqueue performance anyway,
so adding extra overhead for sync is probably ok here.
Though for situation when no callbacks are installed - perfomance should be
left unaffected (or impact should be as small as possible).   

> The changes are more on control plane side, which is one time.
> The data plane changes are minimal.

I still think upper layer data-plane code should stay unaffected (zero changes). 
 
> >
> > > +
> > > +struct rte_cryptodev_enq_callback *
> > > +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> > > +			       uint16_t qp_id,
> > > +			       rte_cryptodev_enq_cb_fn cb_fn,
> > > +			       void *cb_arg)
> > > +{
> > > +	struct rte_cryptodev *dev;
> > > +	struct rte_cryptodev_enq_callback *cb, *tail;
> > > +
> > > +	if (!cb_fn)
> > > +		return NULL;
> > > +
> > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > +		return NULL;
> > > +	}
> > > +
> > > +	dev = &rte_crypto_devices[dev_id];
> > > +	if (qp_id >= dev->data->nb_queue_pairs) {
> > > +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> > > +		return NULL;
> > > +	}
> > > +
> > > +	cb = rte_zmalloc(NULL, sizeof(*cb), 0);
> > > +	if (cb == NULL) {
> > > +		CDEV_LOG_ERR("Failed to allocate memory for callback on "
> > > +			     "dev=%d, queue_pair_id=%d", dev_id, qp_id);
> > > +		rte_errno = ENOMEM;
> > > +		return NULL;
> > > +	}
> > > +
> > > +	cb->fn = cb_fn;
> > > +	cb->arg = cb_arg;
> > > +
> > > +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> > > +	if (dev->enq_cbs == NULL) {
> > > +		dev->enq_cbs = rte_zmalloc(NULL, sizeof(cb) *
> > > +					   dev->data->nb_queue_pairs, 0);
> > > +		if (dev->enq_cbs == NULL) {
> > > +			CDEV_LOG_ERR("Failed to allocate memory for
> > callbacks");
> > > +			rte_errno = ENOMEM;
> > > +			rte_free(cb);
> > > +			return NULL;
> > > +		}
> > > +	}
> > > +
> > > +	/* Add the callbacks in fifo order. */
> > > +	tail = dev->enq_cbs[qp_id];
> > > +	if (tail) {
> > > +		while (tail->next)
> > > +			tail = tail->next;
> > > +		tail->next = cb;
> > > +	} else
> > > +		dev->enq_cbs[qp_id] = cb;
> > > +
> > > +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > > +
> > > +	return cb;
> > > +}
> > > +
> > > +int
> > > +rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> > > +				  uint16_t qp_id,
> > > +				  struct rte_cryptodev_enq_callback *cb) {
> > > +	struct rte_cryptodev *dev;
> > > +	struct rte_cryptodev_enq_callback **prev_cb, *curr_cb;
> > > +	uint16_t qp;
> > > +	int free_mem;
> > > +	int ret;
> > > +
> > > +	free_mem = 1;
> > > +
> > > +	if (!cb) {
> > > +		CDEV_LOG_ERR("cb is NULL");
> > > +		return -EINVAL;
> > > +	}
> > > +
> > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > +		return -EINVAL;
> > > +	}
> > > +
> > > +	dev = &rte_crypto_devices[dev_id];
> > > +	if (qp_id >= dev->data->nb_queue_pairs) {
> > > +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> > > +		return -EINVAL;
> > > +	}
> > > +
> > > +	if (!dev->qsbr) {
> > > +		CDEV_LOG_ERR("Rcu qsbr is NULL");
> > > +		return -EINVAL;
> > > +	}
> > > +
> > > +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> > > +	if (dev->enq_cbs == NULL) {
> > > +		rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > > +		return -EINVAL;
> > > +	}
> > > +
> > > +	prev_cb = &dev->enq_cbs[qp_id];
> > > +	for (; *prev_cb != NULL; prev_cb = &curr_cb->next) {
> > > +		curr_cb = *prev_cb;
> > > +		if (curr_cb == cb) {
> > > +			/* Remove the user cb from the callback list. */
> > > +			*prev_cb = curr_cb->next;
> > > +			ret = 0;
> > > +			break;
> > > +		}
> > > +	}
> > > +
> > > +	if (!ret) {
> > > +		/* Call sync with invalid thread id as this is part of
> > > +		 * control plane API */
> > > +		rte_rcu_qsbr_synchronize(dev->qsbr,
> > > +					 RTE_QSBR_THRID_INVALID);
> > > +		rte_free(cb);
> > > +	}
> > > +
> > > +	for (qp = 0; qp < dev->data->nb_queue_pairs; qp++)
> > > +		if (dev->enq_cbs[qp] != NULL) {
> >
> > Some reference count (number of callbacks) seems like a better approach
> > here.
> Ok.
> >
> > > +			free_mem = 0;
> > > +			break;
> > > +		}
> > > +
> > > +	if (free_mem) {
> > > +		rte_free(dev->enq_cbs);
> > > +		dev->enq_cbs = NULL;
> > > +	}
> > > +
> > > +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > > +
> > > +	return ret;
> > > +}
> > > +#endif
> > >
> > >  int
> > >  rte_cryptodev_sym_session_init(uint8_t dev_id, diff --git
> > > a/lib/librte_cryptodev/rte_cryptodev.h
> > > b/lib/librte_cryptodev/rte_cryptodev.h
> > > index 7b3ebc2..2c7a47b 100644
> > > --- a/lib/librte_cryptodev/rte_cryptodev.h
> > > +++ b/lib/librte_cryptodev/rte_cryptodev.h
> > > @@ -530,6 +530,32 @@ struct rte_cryptodev_qp_conf {  };
> > >
> > >  /**
> > > + * Function type used for pre processing crypto ops when enqueue
> > > +burst is
> > > + * called.
> > > + *
> > > + * The callback function is called on enqueue burst immediately
> > > + * before the crypto ops are put onto the hardware queue for processing.
> > > + *
> > > + * @param	dev_id		The identifier of the device.
> > > + * @param	qp_id		The index of the queue pair in which ops are
> > > + *				to be enqueued for processing. The value
> > > + *				must be in the range [0, nb_queue_pairs - 1]
> > > + *				previously supplied to
> > > + *				*rte_cryptodev_configure*.
> > > + * @param	ops		The address of an array of *nb_ops* pointers
> > > + *				to *rte_crypto_op* structures which contain
> > > + *				the crypto operations to be processed.
> > > + * @param	nb_ops		The number of operations to process.
> > > + * @param	user_param	The arbitrary user parameter passed in by the
> > > + *				application when the callback was originally
> > > + *				registered.
> > > + * @return			The number of ops to be enqueued to the
> > > + *				crypto device.
> > > + */
> > > +typedef uint16_t (*rte_cryptodev_enq_cb_fn)(uint16_t dev_id, uint16_t
> > qp_id,
> > > +		struct rte_crypto_op **ops, uint16_t nb_ops, void
> > *user_param);
> > > +
> > > +/**
> > >   * Typedef for application callback function to be registered by application
> > >   * software for notification of device events
> > >   *
> > > @@ -853,7 +879,6 @@ struct rte_cryptodev_config {
> > >  		enum rte_cryptodev_event_type event,
> > >  		rte_cryptodev_cb_fn cb_fn, void *cb_arg);
> > >
> > > -
> > >  typedef uint16_t (*dequeue_pkt_burst_t)(void *qp,
> > >  		struct rte_crypto_op **ops,	uint16_t nb_ops);
> > >  /**< Dequeue processed packets from queue pair of a device. */ @@
> > > -870,6 +895,17 @@ typedef uint16_t (*enqueue_pkt_burst_t)(void *qp,
> > >  /** Structure to keep track of registered callbacks */
> > > TAILQ_HEAD(rte_cryptodev_cb_list, rte_cryptodev_callback);
> > >
> > > +/**
> > > + * @internal
> > > + * Structure used to hold information about the callbacks to be
> > > +called for a
> > > + * queue pair on enqueue.
> > > + */
> > > +struct rte_cryptodev_enq_callback {
> > > +	struct rte_cryptodev_enq_callback *next;
> > > +	rte_cryptodev_enq_cb_fn fn;
> > > +	void *arg;
> > > +};
> > > +
> > >  /** The data structure associated with each crypto device. */  struct
> > > rte_cryptodev {
> > >  	dequeue_pkt_burst_t dequeue_burst;
> > > @@ -898,6 +934,14 @@ struct rte_cryptodev {
> > >  	__extension__
> > >  	uint8_t attached : 1;
> > >  	/**< Flag indicating the device is attached */
> > > +
> > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > +	struct rte_cryptodev_enq_callback **enq_cbs;
> > > +	/**< User application callback for pre enqueue processing */
> > > +
> > > +	struct rte_rcu_qsbr *qsbr;
> > > +	/** < RCU QSBR variable for rte_cryptodev_enq_callback */
> >
> > Probably better to have both these fields per queue.
> > Space for them can be allocated at dev_configure() or so.
> enq_cbs is allocated during callback add.
> Unlike ethdev, each cryptodev have their own max queue pair. There is no macro for that.
> I think, single RCU should be good enough, as it has mechanism to track all its reporting threads.
> 
> > BTW, wouldn't it make sense to have ability to add callback for dequeue too?
> As mentioned in the commit message, this patch was driven by a requirement.
> If required, callback for the dequeue can be added in a separate patch.
> >
> > > +#endif
> > >  } __rte_cache_aligned;
> > >
> > >  void *
> > > @@ -1019,6 +1063,18 @@ struct rte_cryptodev_data {
> > >  		struct rte_crypto_op **ops, uint16_t nb_ops)  {
> > >  	struct rte_cryptodev *dev = &rte_cryptodevs[dev_id];
> > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > +	if (unlikely(dev->enq_cbs != NULL && dev->enq_cbs[qp_id] !=
> > NULL)) {
> > > +		struct rte_cryptodev_enq_callback *cb =
> > > +			dev->enq_cbs[qp_id];
> > > +
> > > +		do {
> > > +			nb_ops = cb->fn(dev_id, qp_id, ops, nb_ops,
> > > +					cb->arg);
> > > +			cb = cb->next;
> > > +		} while (cb != NULL);
> > > +	}
> > > +#endif
> > >
> > >  	rte_cryptodev_trace_enqueue_burst(dev_id, qp_id, (void **)ops,
> > nb_ops);
> > >  	return (*dev->enqueue_burst)(
> > > @@ -1351,6 +1407,102 @@ struct rte_cryptodev_asym_session *
> > >  	struct rte_cryptodev_sym_session *sess, union rte_crypto_sym_ofs
> > ofs,
> > >  	struct rte_crypto_sym_vec *vec);
> > >
> > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > + *
> > > + * Add a user callback for a given crypto device and queue pair which
> > > +will be
> > > + * called on crypto ops enqueue.
> > > + *
> > > + * This API configures a function to be called for each burst of
> > > +crypto ops
> > > + * received on a given crypto device queue pair. The return value is
> > > +a pointer
> > > + * that can be used later to remove the callback using
> > > + * rte_cryptodev_remove_enq_callback().
> > > + *
> > > + * Multiple functions are called in the order that they are added.
> > > + *
> > > + * @param	dev_id		The identifier of the device.
> > > + * @param	qp_id		The index of the queue pair in which ops are
> > > + *				to be enqueued for processing. The value
> > > + *				must be in the range [0, nb_queue_pairs - 1]
> > > + *				previously supplied to
> > > + *				*rte_cryptodev_configure*.
> > > + * @param	cb_fn		The callback function
> > > + * @param	cb_arg		A generic pointer parameter which will be
> > passed
> > > + *				to each invocation of the callback function on
> > > + *				this crypto device and queue pair.
> > > + *
> > > + * @return
> > > + *   NULL on error.
> > > + *   On success, a pointer value which can later be used to remove the
> > callback.
> > > + */
> > > +
> > > +__rte_experimental
> > > +struct rte_cryptodev_enq_callback *
> > > +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> > > +			       uint16_t qp_id,
> > > +			       rte_cryptodev_enq_cb_fn cb_fn,
> > > +			       void *cb_arg);
> > > +
> > > +
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > + *
> > > + * Remove a user callback function for given crypto device and queue pair.
> > > + *
> > > + * This function is used to removed callbacks that were added to a
> > > +crypto
> > > + * device queue pair using rte_cryptodev_add_enq_callback().
> > > + *
> > > + * Note: The callback expects a RCU QSBR to be configured to
> > > +synchronize
> > > + * to free the memory. Application is expected to configure RCU QSBR
> > > +after
> > > + * adding an enqueue callback.
> > > + *
> > > + *
> > > + * @param	dev_id		The identifier of the device.
> > > + * @param	qp_id		The index of the queue pair in which ops are
> > > + *				to be enqueued for processing. The value
> > > + *				must be in the range [0, nb_queue_pairs - 1]
> > > + *				previously supplied to
> > > + *				*rte_cryptodev_configure*.
> > > + * @param	cb		Pointer to user supplied callback created via
> > > + *				rte_cryptodev_add_enq_callback().
> > > + *
> > > + * @return
> > > + *   - 0: Success. Callback was removed.
> > > + *   - -EINVAL:  The dev_id or the qp_id is out of range, or the callback
> > > + *               is NULL or not found for the crypto device queue pair.
> > > + */
> > > +
> > > +__rte_experimental
> > > +int rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> > > +				  uint16_t qp_id,
> > > +				  struct rte_cryptodev_enq_callback *cb);
> > > +
> > > +
> > > +/**
> > > + * @warning
> > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > + *
> > > + * Associate RCU QSBR variable with a cryptodev.
> > > + *
> > > + * This function is used to add RCU QSBR to a crypto device.
> > > + * The purpose of RCU is to help multiple threads to synchronize
> > > + * with each other before initiating adding/removing callback
> > > + * while dataplane threads are running enqueue callbacks.
> > > + *
> > > + * @param	dev_id		The identifier of the device.
> > > + * @param	qsr		RCU QSBR configuration
> > > + * @return
> > > + *   On success - 0
> > > + *   On error - EINVAL.
> > > + */
> > > +
> > > +__rte_experimental
> > > +int rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct rte_rcu_qsbr
> > > +*qsbr); #endif
> > > +
> > >  #ifdef __cplusplus
> > >  }
> > >  #endif
> > > diff --git a/lib/librte_cryptodev/rte_cryptodev_version.map
> > > b/lib/librte_cryptodev/rte_cryptodev_version.map
> > > index 02f6dcf..46de3ca 100644
> > > --- a/lib/librte_cryptodev/rte_cryptodev_version.map
> > > +++ b/lib/librte_cryptodev/rte_cryptodev_version.map
> > > @@ -64,6 +64,7 @@ DPDK_20.0 {
> > >  	rte_cryptodev_sym_capability_get;
> > >  };
> > >
> > > +
> > >  EXPERIMENTAL {
> > >  	global:
> > >
> > > @@ -105,4 +106,9 @@ EXPERIMENTAL {
> > >
> > >  	# added in 20.08
> > >  	rte_cryptodev_get_qp_status;
> > > +
> > > +	# added in 20.11
> > > +	rte_cryptodev_add_enq_callback;
> > > +	rte_cryptodev_remove_enq_callback;
> > > +	rte_cryptodev_rcu_qsbr_add;
> > >  };
> > > --
> > > 1.9.1
Abhinandan Gujjar Sept. 21, 2020, 10:59 a.m. UTC | #7
Hi Konstantin,

Please find comments inline.

@Doherty, Declan & @Honnappa Nagarahalli,

Can you please chime in?

Regards
Abhinandan

> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Thursday, September 17, 2020 6:55 PM
> To: Gujjar, Abhinandan S <abhinandan.gujjar@intel.com>; dev@dpdk.org
> Cc: Doherty, Declan <declan.doherty@intel.com>; jerinj@marvell.com;
> akhil.goyal@nxp.com; Vangati, Narender <narender.vangati@intel.com>
> Subject: RE: [dpdk-dev] [v2 1/2] cryptodev: support enqueue callback
> functions
> 
> 
> 
> > > >
> > > > In an eventdev world, multiple workers (with ordered queue) will
> > > > be working on IPsec ESP processing. The ESP header's sequence
> > > > number is unique and has to be sequentially incremented in an orderly
> manner.
> > > > This rises a need for incrementing sequence number in crypto stage
> > > > especially in event crypto adapter. By adding a user callback to
> > > > cryptodev at enqueue burst, the user callback will get executed in
> > > > the context of event crypto adapter. This helps the application to
> > > > increment the ESP sequence number atomically and orderly manner.
> > > >
> > > > This patch adds APIs to add/remove callback functions. The
> > > > callback function will be called for each burst of crypto ops
> > > > received on a given crypto device queue pair.
> > > >
> > > > v1->v2:
> > > > Moved callback related members to the end of cryptodev struct
> > > > Added support for RCU
> > > >
> > > > Signed-off-by: Abhinandan Gujjar <abhinandan.gujjar@intel.com>
> > > > ---
> > > >  config/common_base                             |   1 +
> > > >  lib/librte_cryptodev/Makefile                  |   2 +-
> > > >  lib/librte_cryptodev/rte_cryptodev.c           | 157
> > > +++++++++++++++++++++++++
> > > >  lib/librte_cryptodev/rte_cryptodev.h           | 154
> > > +++++++++++++++++++++++-
> > > >  lib/librte_cryptodev/rte_cryptodev_version.map |   6 +
> > > >  5 files changed, 318 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/config/common_base b/config/common_base index
> > > > fbf0ee7..f5ebde4 100644
> > > > --- a/config/common_base
> > > > +++ b/config/common_base
> > > > @@ -599,6 +599,7 @@
> > > CONFIG_RTE_LIBRTE_PMD_BBDEV_FPGA_5GNR_FEC=y
> > > >  #
> > > >  CONFIG_RTE_LIBRTE_CRYPTODEV=y
> > > >  CONFIG_RTE_CRYPTO_MAX_DEVS=64
> > > > +CONFIG_RTE_CRYPTODEV_CALLBACKS=y
> > > >
> > > >  #
> > > >  # Compile PMD for ARMv8 Crypto device diff --git
> > > > a/lib/librte_cryptodev/Makefile b/lib/librte_cryptodev/Makefile
> > > > index
> > > > 73e77a2..514d552 100644
> > > > --- a/lib/librte_cryptodev/Makefile
> > > > +++ b/lib/librte_cryptodev/Makefile
> > > > @@ -10,7 +10,7 @@ LIB = librte_cryptodev.a  CFLAGS += -O3  CFLAGS
> > > > +=
> > > > $(WERROR_FLAGS)  LDLIBS += -lrte_eal -lrte_mempool -lrte_ring
> > > > -lrte_mbuf -LDLIBS += -lrte_kvargs
> > > > +LDLIBS += -lrte_kvargs -lrte_rcu
> > > >
> > > >  # library source files
> > > >  SRCS-y += rte_cryptodev.c rte_cryptodev_pmd.c
> > > > cryptodev_trace_points.c diff --git
> > > > a/lib/librte_cryptodev/rte_cryptodev.c
> > > > b/lib/librte_cryptodev/rte_cryptodev.c
> > > > index 1dd795b..2fb3e35 100644
> > > > --- a/lib/librte_cryptodev/rte_cryptodev.c
> > > > +++ b/lib/librte_cryptodev/rte_cryptodev.c
> > > > @@ -38,6 +38,7 @@
> > > >  #include <rte_string_fns.h>
> > > >  #include <rte_compat.h>
> > > >  #include <rte_function_versioning.h>
> > > > +#include <rte_rcu_qsbr.h>
> > > >
> > > >  #include "rte_crypto.h"
> > > >  #include "rte_cryptodev.h"
> > > > @@ -499,6 +500,10 @@ struct
> > > rte_cryptodev_sym_session_pool_private_data {
> > > >  	return 0;
> > > >  }
> > > >
> > > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > > +/* spinlock for crypto device enq callbacks */ static
> > > > +rte_spinlock_t rte_cryptodev_enq_cb_lock =
> > > > +RTE_SPINLOCK_INITIALIZER; #endif
> > > >
> > > >  const char *
> > > >  rte_cryptodev_get_feature_name(uint64_t flag) @@ -1449,6
> > > > +1454,158
> > > @@
> > > > struct rte_cryptodev *
> > > >  	rte_spinlock_unlock(&rte_cryptodev_cb_lock);
> > > >  }
> > > >
> > > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > > +int
> > > > +rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct rte_rcu_qsbr
> > > > +*qsbr) {
> > > > +
> > > > +	struct rte_cryptodev *dev;
> > > > +
> > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > > +		return -EINVAL;
> > > > +	}
> > > > +
> > > > +	dev = &rte_crypto_devices[dev_id];
> > > > +	dev->qsbr = qsbr;
> > > > +	return 0;
> > > > +}
> > >
> > > So if I understand your patch correctly you propose a new working
> > > model for
> > > crypto-devs:
> > > 1. Control-plane has to allocate/setup rcu_qsbr and do
> > > rte_cryptodev_rcu_qsbr_add().
> > > 2. Data-plane has somehow to obtain pointer to that rcu_qsbr and
> > > wrap
> > > cryptodev_enqueue()
> > >    with rcu_qsbr_quiescent()  or rcu_qsbr_online()/rcu_qsbr_offline().
> > Yes. I think, it is not a new model. It is same as RCU integration with LPM.
> > Please refer: https://patches.dpdk.org/cover/73673/
> 
> I am talking about new working model for crypto-dev enqueue/dequeue.
> As I said above now it becomes data-plane thread responsibility to:
>  -somehow to obtain pointer to that rcu_qsbr for each cryptodev it is using.
>  -call rcu sync functions (quiescent/online/offline) on a regular basis.
It is not on regular basis. When data plane comes up, they report online.
They report quiescent when they are done with critical section or shared structure.
All though, there is some dataplane changes involved here, I don't think, it is major.
 
> Note that now data-plane thread would have to do that always - even if
> there are now callbacks installed for that cryptodev queue right now.
> All that changes behaviour of existing apps and I presume would reduce
> adoption of  that fature.
There is always trade off involved!
In the previous patch, you suggested that some lazy app may not free up the
memory allocated by add cb. For such apps, this patch has sync mechanism
with some additional cost of CP & DP changes.

> I still think all this callback mechanism should be totally opaque to data-plane
> threads - user shouldn't change his app code depending on would some
> enqueue/dequeue callbacks be installed or not.
I am not sure, how that can be implemented with existing RCU design.
@Honnappa Nagarahalli, Do you have any suggestions?

> 
> >
> > >
> > > That seems quite a big change and I don't think it is acceptable for
> > > most users.
> > > From my perspective adding/installing call-backs to the dev has to
> > > be opaque to the data-plane code.
> > > Also note that different callbacks can be installed by different
> > > entities (libs) and might have no idea about each other.
> > > That's why I thought it would be better to make all this RCU stuff
> > > internal inside cryptodev:
> > >     hide all this rcu_qsbr allocation/setup inside cryptod somehow to
> obtain pointer to that rcu_qsbr ev init/queue setup
> > >     invoke rcu_qsbr_online()/rcu_qsbr_offline() inside
> cryptodev_enqueue().
> > I have already tried exploring above stuffs. There are too many constraints.
> > The changes don't fit in, as per RCU design.
> 
> Hmm could you be more specific here - what constraints are you referring
> to?
> 
> > Moreover, having rcu api under enqueue_burst() will affect the
> performance.
> 
> It most likely will. Though my expectation it will affect performance only
> when some callbacks are installed. My thought here:
> callback function by itself will affect cryptdev_enqueue performance
> anyway,
With existing callback design, I have measured the performance(with crypto perf test) on xeon.
It was almost negligible and same was shared with Declan.
That is one of the reasons, I didn't want to add to many stuffs in to the callback.
The best part of existing design is crypto lib is not much modified.
The changes are either pushed to CP or DP.

so adding extra overhead for sync is probably ok here.
> Though for situation when no callbacks are installed - perfomance should be
> left unaffected (or impact should be as small as possible).
> 
> > The changes are more on control plane side, which is one time.
> > The data plane changes are minimal.
> 
> I still think upper layer data-plane code should stay unaffected (zero
> changes).
> 
> > >
> > > > +
> > > > +struct rte_cryptodev_enq_callback *
> > > > +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> > > > +			       uint16_t qp_id,
> > > > +			       rte_cryptodev_enq_cb_fn cb_fn,
> > > > +			       void *cb_arg)
> > > > +{
> > > > +	struct rte_cryptodev *dev;
> > > > +	struct rte_cryptodev_enq_callback *cb, *tail;
> > > > +
> > > > +	if (!cb_fn)
> > > > +		return NULL;
> > > > +
> > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > > +		return NULL;
> > > > +	}
> > > > +
> > > > +	dev = &rte_crypto_devices[dev_id];
> > > > +	if (qp_id >= dev->data->nb_queue_pairs) {
> > > > +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> > > > +		return NULL;
> > > > +	}
> > > > +
> > > > +	cb = rte_zmalloc(NULL, sizeof(*cb), 0);
> > > > +	if (cb == NULL) {
> > > > +		CDEV_LOG_ERR("Failed to allocate memory for callback on "
> > > > +			     "dev=%d, queue_pair_id=%d", dev_id, qp_id);
> > > > +		rte_errno = ENOMEM;
> > > > +		return NULL;
> > > > +	}
> > > > +
> > > > +	cb->fn = cb_fn;
> > > > +	cb->arg = cb_arg;
> > > > +
> > > > +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> > > > +	if (dev->enq_cbs == NULL) {
> > > > +		dev->enq_cbs = rte_zmalloc(NULL, sizeof(cb) *
> > > > +					   dev->data->nb_queue_pairs, 0);
> > > > +		if (dev->enq_cbs == NULL) {
> > > > +			CDEV_LOG_ERR("Failed to allocate memory for
> > > callbacks");
> > > > +			rte_errno = ENOMEM;
> > > > +			rte_free(cb);
> > > > +			return NULL;
> > > > +		}
> > > > +	}
> > > > +
> > > > +	/* Add the callbacks in fifo order. */
> > > > +	tail = dev->enq_cbs[qp_id];
> > > > +	if (tail) {
> > > > +		while (tail->next)
> > > > +			tail = tail->next;
> > > > +		tail->next = cb;
> > > > +	} else
> > > > +		dev->enq_cbs[qp_id] = cb;
> > > > +
> > > > +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > > > +
> > > > +	return cb;
> > > > +}
> > > > +
> > > > +int
> > > > +rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> > > > +				  uint16_t qp_id,
> > > > +				  struct rte_cryptodev_enq_callback *cb) {
> > > > +	struct rte_cryptodev *dev;
> > > > +	struct rte_cryptodev_enq_callback **prev_cb, *curr_cb;
> > > > +	uint16_t qp;
> > > > +	int free_mem;
> > > > +	int ret;
> > > > +
> > > > +	free_mem = 1;
> > > > +
> > > > +	if (!cb) {
> > > > +		CDEV_LOG_ERR("cb is NULL");
> > > > +		return -EINVAL;
> > > > +	}
> > > > +
> > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > > +		return -EINVAL;
> > > > +	}
> > > > +
> > > > +	dev = &rte_crypto_devices[dev_id];
> > > > +	if (qp_id >= dev->data->nb_queue_pairs) {
> > > > +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> > > > +		return -EINVAL;
> > > > +	}
> > > > +
> > > > +	if (!dev->qsbr) {
> > > > +		CDEV_LOG_ERR("Rcu qsbr is NULL");
> > > > +		return -EINVAL;
> > > > +	}
> > > > +
> > > > +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> > > > +	if (dev->enq_cbs == NULL) {
> > > > +		rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > > > +		return -EINVAL;
> > > > +	}
> > > > +
> > > > +	prev_cb = &dev->enq_cbs[qp_id];
> > > > +	for (; *prev_cb != NULL; prev_cb = &curr_cb->next) {
> > > > +		curr_cb = *prev_cb;
> > > > +		if (curr_cb == cb) {
> > > > +			/* Remove the user cb from the callback list. */
> > > > +			*prev_cb = curr_cb->next;
> > > > +			ret = 0;
> > > > +			break;
> > > > +		}
> > > > +	}
> > > > +
> > > > +	if (!ret) {
> > > > +		/* Call sync with invalid thread id as this is part of
> > > > +		 * control plane API */
> > > > +		rte_rcu_qsbr_synchronize(dev->qsbr,
> > > > +					 RTE_QSBR_THRID_INVALID);
> > > > +		rte_free(cb);
> > > > +	}
> > > > +
> > > > +	for (qp = 0; qp < dev->data->nb_queue_pairs; qp++)
> > > > +		if (dev->enq_cbs[qp] != NULL) {
> > >
> > > Some reference count (number of callbacks) seems like a better
> > > approach here.
> > Ok.
> > >
> > > > +			free_mem = 0;
> > > > +			break;
> > > > +		}
> > > > +
> > > > +	if (free_mem) {
> > > > +		rte_free(dev->enq_cbs);
> > > > +		dev->enq_cbs = NULL;
> > > > +	}
> > > > +
> > > > +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > > > +
> > > > +	return ret;
> > > > +}
> > > > +#endif
> > > >
> > > >  int
> > > >  rte_cryptodev_sym_session_init(uint8_t dev_id, diff --git
> > > > a/lib/librte_cryptodev/rte_cryptodev.h
> > > > b/lib/librte_cryptodev/rte_cryptodev.h
> > > > index 7b3ebc2..2c7a47b 100644
> > > > --- a/lib/librte_cryptodev/rte_cryptodev.h
> > > > +++ b/lib/librte_cryptodev/rte_cryptodev.h
> > > > @@ -530,6 +530,32 @@ struct rte_cryptodev_qp_conf {  };
> > > >
> > > >  /**
> > > > + * Function type used for pre processing crypto ops when enqueue
> > > > +burst is
> > > > + * called.
> > > > + *
> > > > + * The callback function is called on enqueue burst immediately
> > > > + * before the crypto ops are put onto the hardware queue for
> processing.
> > > > + *
> > > > + * @param	dev_id		The identifier of the device.
> > > > + * @param	qp_id		The index of the queue pair in which
> ops are
> > > > + *				to be enqueued for processing. The
> value
> > > > + *				must be in the range [0,
> nb_queue_pairs - 1]
> > > > + *				previously supplied to
> > > > + *				*rte_cryptodev_configure*.
> > > > + * @param	ops		The address of an array of *nb_ops*
> pointers
> > > > + *				to *rte_crypto_op* structures which
> contain
> > > > + *				the crypto operations to be
> processed.
> > > > + * @param	nb_ops		The number of operations to process.
> > > > + * @param	user_param	The arbitrary user parameter passed
> in by the
> > > > + *				application when the callback was
> originally
> > > > + *				registered.
> > > > + * @return			The number of ops to be enqueued
> to the
> > > > + *				crypto device.
> > > > + */
> > > > +typedef uint16_t (*rte_cryptodev_enq_cb_fn)(uint16_t dev_id,
> > > > +uint16_t
> > > qp_id,
> > > > +		struct rte_crypto_op **ops, uint16_t nb_ops, void
> > > *user_param);
> > > > +
> > > > +/**
> > > >   * Typedef for application callback function to be registered by
> application
> > > >   * software for notification of device events
> > > >   *
> > > > @@ -853,7 +879,6 @@ struct rte_cryptodev_config {
> > > >  		enum rte_cryptodev_event_type event,
> > > >  		rte_cryptodev_cb_fn cb_fn, void *cb_arg);
> > > >
> > > > -
> > > >  typedef uint16_t (*dequeue_pkt_burst_t)(void *qp,
> > > >  		struct rte_crypto_op **ops,	uint16_t nb_ops);
> > > >  /**< Dequeue processed packets from queue pair of a device. */ @@
> > > > -870,6 +895,17 @@ typedef uint16_t (*enqueue_pkt_burst_t)(void
> > > > *qp,
> > > >  /** Structure to keep track of registered callbacks */
> > > > TAILQ_HEAD(rte_cryptodev_cb_list, rte_cryptodev_callback);
> > > >
> > > > +/**
> > > > + * @internal
> > > > + * Structure used to hold information about the callbacks to be
> > > > +called for a
> > > > + * queue pair on enqueue.
> > > > + */
> > > > +struct rte_cryptodev_enq_callback {
> > > > +	struct rte_cryptodev_enq_callback *next;
> > > > +	rte_cryptodev_enq_cb_fn fn;
> > > > +	void *arg;
> > > > +};
> > > > +
> > > >  /** The data structure associated with each crypto device. */
> > > > struct rte_cryptodev {
> > > >  	dequeue_pkt_burst_t dequeue_burst; @@ -898,6 +934,14 @@
> struct
> > > > rte_cryptodev {
> > > >  	__extension__
> > > >  	uint8_t attached : 1;
> > > >  	/**< Flag indicating the device is attached */
> > > > +
> > > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > > +	struct rte_cryptodev_enq_callback **enq_cbs;
> > > > +	/**< User application callback for pre enqueue processing */
> > > > +
> > > > +	struct rte_rcu_qsbr *qsbr;
> > > > +	/** < RCU QSBR variable for rte_cryptodev_enq_callback */
> > >
> > > Probably better to have both these fields per queue.
> > > Space for them can be allocated at dev_configure() or so.
> > enq_cbs is allocated during callback add.
> > Unlike ethdev, each cryptodev have their own max queue pair. There is no
> macro for that.
> > I think, single RCU should be good enough, as it has mechanism to track all
> its reporting threads.
> >
> > > BTW, wouldn't it make sense to have ability to add callback for dequeue
> too?
> > As mentioned in the commit message, this patch was driven by a
> requirement.
> > If required, callback for the dequeue can be added in a separate patch.
> > >
> > > > +#endif
> > > >  } __rte_cache_aligned;
> > > >
> > > >  void *
> > > > @@ -1019,6 +1063,18 @@ struct rte_cryptodev_data {
> > > >  		struct rte_crypto_op **ops, uint16_t nb_ops)  {
> > > >  	struct rte_cryptodev *dev = &rte_cryptodevs[dev_id];
> > > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > > +	if (unlikely(dev->enq_cbs != NULL && dev->enq_cbs[qp_id] !=
> > > NULL)) {
> > > > +		struct rte_cryptodev_enq_callback *cb =
> > > > +			dev->enq_cbs[qp_id];
> > > > +
> > > > +		do {
> > > > +			nb_ops = cb->fn(dev_id, qp_id, ops, nb_ops,
> > > > +					cb->arg);
> > > > +			cb = cb->next;
> > > > +		} while (cb != NULL);
> > > > +	}
> > > > +#endif
> > > >
> > > >  	rte_cryptodev_trace_enqueue_burst(dev_id, qp_id, (void **)ops,
> > > nb_ops);
> > > >  	return (*dev->enqueue_burst)(
> > > > @@ -1351,6 +1407,102 @@ struct rte_cryptodev_asym_session *
> > > >  	struct rte_cryptodev_sym_session *sess, union rte_crypto_sym_ofs
> > > ofs,
> > > >  	struct rte_crypto_sym_vec *vec);
> > > >
> > > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > > +/**
> > > > + * @warning
> > > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > > + *
> > > > + * Add a user callback for a given crypto device and queue pair
> > > > +which will be
> > > > + * called on crypto ops enqueue.
> > > > + *
> > > > + * This API configures a function to be called for each burst of
> > > > +crypto ops
> > > > + * received on a given crypto device queue pair. The return value
> > > > +is a pointer
> > > > + * that can be used later to remove the callback using
> > > > + * rte_cryptodev_remove_enq_callback().
> > > > + *
> > > > + * Multiple functions are called in the order that they are added.
> > > > + *
> > > > + * @param	dev_id		The identifier of the device.
> > > > + * @param	qp_id		The index of the queue pair in which
> ops are
> > > > + *				to be enqueued for processing. The
> value
> > > > + *				must be in the range [0,
> nb_queue_pairs - 1]
> > > > + *				previously supplied to
> > > > + *				*rte_cryptodev_configure*.
> > > > + * @param	cb_fn		The callback function
> > > > + * @param	cb_arg		A generic pointer parameter which
> will be
> > > passed
> > > > + *				to each invocation of the callback
> function on
> > > > + *				this crypto device and queue pair.
> > > > + *
> > > > + * @return
> > > > + *   NULL on error.
> > > > + *   On success, a pointer value which can later be used to remove the
> > > callback.
> > > > + */
> > > > +
> > > > +__rte_experimental
> > > > +struct rte_cryptodev_enq_callback *
> > > > +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> > > > +			       uint16_t qp_id,
> > > > +			       rte_cryptodev_enq_cb_fn cb_fn,
> > > > +			       void *cb_arg);
> > > > +
> > > > +
> > > > +/**
> > > > + * @warning
> > > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > > + *
> > > > + * Remove a user callback function for given crypto device and queue
> pair.
> > > > + *
> > > > + * This function is used to removed callbacks that were added to
> > > > +a crypto
> > > > + * device queue pair using rte_cryptodev_add_enq_callback().
> > > > + *
> > > > + * Note: The callback expects a RCU QSBR to be configured to
> > > > +synchronize
> > > > + * to free the memory. Application is expected to configure RCU
> > > > +QSBR after
> > > > + * adding an enqueue callback.
> > > > + *
> > > > + *
> > > > + * @param	dev_id		The identifier of the device.
> > > > + * @param	qp_id		The index of the queue pair in which
> ops are
> > > > + *				to be enqueued for processing. The
> value
> > > > + *				must be in the range [0,
> nb_queue_pairs - 1]
> > > > + *				previously supplied to
> > > > + *				*rte_cryptodev_configure*.
> > > > + * @param	cb		Pointer to user supplied callback
> created via
> > > > + *				rte_cryptodev_add_enq_callback().
> > > > + *
> > > > + * @return
> > > > + *   - 0: Success. Callback was removed.
> > > > + *   - -EINVAL:  The dev_id or the qp_id is out of range, or the callback
> > > > + *               is NULL or not found for the crypto device queue pair.
> > > > + */
> > > > +
> > > > +__rte_experimental
> > > > +int rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> > > > +				  uint16_t qp_id,
> > > > +				  struct rte_cryptodev_enq_callback *cb);
> > > > +
> > > > +
> > > > +/**
> > > > + * @warning
> > > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > > + *
> > > > + * Associate RCU QSBR variable with a cryptodev.
> > > > + *
> > > > + * This function is used to add RCU QSBR to a crypto device.
> > > > + * The purpose of RCU is to help multiple threads to synchronize
> > > > + * with each other before initiating adding/removing callback
> > > > + * while dataplane threads are running enqueue callbacks.
> > > > + *
> > > > + * @param	dev_id		The identifier of the device.
> > > > + * @param	qsr		RCU QSBR configuration
> > > > + * @return
> > > > + *   On success - 0
> > > > + *   On error - EINVAL.
> > > > + */
> > > > +
> > > > +__rte_experimental
> > > > +int rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct
> > > > +rte_rcu_qsbr *qsbr); #endif
> > > > +
> > > >  #ifdef __cplusplus
> > > >  }
> > > >  #endif
> > > > diff --git a/lib/librte_cryptodev/rte_cryptodev_version.map
> > > > b/lib/librte_cryptodev/rte_cryptodev_version.map
> > > > index 02f6dcf..46de3ca 100644
> > > > --- a/lib/librte_cryptodev/rte_cryptodev_version.map
> > > > +++ b/lib/librte_cryptodev/rte_cryptodev_version.map
> > > > @@ -64,6 +64,7 @@ DPDK_20.0 {
> > > >  	rte_cryptodev_sym_capability_get;  };
> > > >
> > > > +
> > > >  EXPERIMENTAL {
> > > >  	global:
> > > >
> > > > @@ -105,4 +106,9 @@ EXPERIMENTAL {
> > > >
> > > >  	# added in 20.08
> > > >  	rte_cryptodev_get_qp_status;
> > > > +
> > > > +	# added in 20.11
> > > > +	rte_cryptodev_add_enq_callback;
> > > > +	rte_cryptodev_remove_enq_callback;
> > > > +	rte_cryptodev_rcu_qsbr_add;
> > > >  };
> > > > --
> > > > 1.9.1
Ananyev, Konstantin Sept. 21, 2020, 11:45 a.m. UTC | #8
> > > > >
> > > > > In an eventdev world, multiple workers (with ordered queue) will
> > > > > be working on IPsec ESP processing. The ESP header's sequence
> > > > > number is unique and has to be sequentially incremented in an orderly
> > manner.
> > > > > This rises a need for incrementing sequence number in crypto stage
> > > > > especially in event crypto adapter. By adding a user callback to
> > > > > cryptodev at enqueue burst, the user callback will get executed in
> > > > > the context of event crypto adapter. This helps the application to
> > > > > increment the ESP sequence number atomically and orderly manner.
> > > > >
> > > > > This patch adds APIs to add/remove callback functions. The
> > > > > callback function will be called for each burst of crypto ops
> > > > > received on a given crypto device queue pair.
> > > > >
> > > > > v1->v2:
> > > > > Moved callback related members to the end of cryptodev struct
> > > > > Added support for RCU
> > > > >
> > > > > Signed-off-by: Abhinandan Gujjar <abhinandan.gujjar@intel.com>
> > > > > ---
> > > > >  config/common_base                             |   1 +
> > > > >  lib/librte_cryptodev/Makefile                  |   2 +-
> > > > >  lib/librte_cryptodev/rte_cryptodev.c           | 157
> > > > +++++++++++++++++++++++++
> > > > >  lib/librte_cryptodev/rte_cryptodev.h           | 154
> > > > +++++++++++++++++++++++-
> > > > >  lib/librte_cryptodev/rte_cryptodev_version.map |   6 +
> > > > >  5 files changed, 318 insertions(+), 2 deletions(-)
> > > > >
> > > > > diff --git a/config/common_base b/config/common_base index
> > > > > fbf0ee7..f5ebde4 100644
> > > > > --- a/config/common_base
> > > > > +++ b/config/common_base
> > > > > @@ -599,6 +599,7 @@
> > > > CONFIG_RTE_LIBRTE_PMD_BBDEV_FPGA_5GNR_FEC=y
> > > > >  #
> > > > >  CONFIG_RTE_LIBRTE_CRYPTODEV=y
> > > > >  CONFIG_RTE_CRYPTO_MAX_DEVS=64
> > > > > +CONFIG_RTE_CRYPTODEV_CALLBACKS=y
> > > > >
> > > > >  #
> > > > >  # Compile PMD for ARMv8 Crypto device diff --git
> > > > > a/lib/librte_cryptodev/Makefile b/lib/librte_cryptodev/Makefile
> > > > > index
> > > > > 73e77a2..514d552 100644
> > > > > --- a/lib/librte_cryptodev/Makefile
> > > > > +++ b/lib/librte_cryptodev/Makefile
> > > > > @@ -10,7 +10,7 @@ LIB = librte_cryptodev.a  CFLAGS += -O3  CFLAGS
> > > > > +=
> > > > > $(WERROR_FLAGS)  LDLIBS += -lrte_eal -lrte_mempool -lrte_ring
> > > > > -lrte_mbuf -LDLIBS += -lrte_kvargs
> > > > > +LDLIBS += -lrte_kvargs -lrte_rcu
> > > > >
> > > > >  # library source files
> > > > >  SRCS-y += rte_cryptodev.c rte_cryptodev_pmd.c
> > > > > cryptodev_trace_points.c diff --git
> > > > > a/lib/librte_cryptodev/rte_cryptodev.c
> > > > > b/lib/librte_cryptodev/rte_cryptodev.c
> > > > > index 1dd795b..2fb3e35 100644
> > > > > --- a/lib/librte_cryptodev/rte_cryptodev.c
> > > > > +++ b/lib/librte_cryptodev/rte_cryptodev.c
> > > > > @@ -38,6 +38,7 @@
> > > > >  #include <rte_string_fns.h>
> > > > >  #include <rte_compat.h>
> > > > >  #include <rte_function_versioning.h>
> > > > > +#include <rte_rcu_qsbr.h>
> > > > >
> > > > >  #include "rte_crypto.h"
> > > > >  #include "rte_cryptodev.h"
> > > > > @@ -499,6 +500,10 @@ struct
> > > > rte_cryptodev_sym_session_pool_private_data {
> > > > >  	return 0;
> > > > >  }
> > > > >
> > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > > > +/* spinlock for crypto device enq callbacks */ static
> > > > > +rte_spinlock_t rte_cryptodev_enq_cb_lock =
> > > > > +RTE_SPINLOCK_INITIALIZER; #endif
> > > > >
> > > > >  const char *
> > > > >  rte_cryptodev_get_feature_name(uint64_t flag) @@ -1449,6
> > > > > +1454,158
> > > > @@
> > > > > struct rte_cryptodev *
> > > > >  	rte_spinlock_unlock(&rte_cryptodev_cb_lock);
> > > > >  }
> > > > >
> > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > > > +int
> > > > > +rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct rte_rcu_qsbr
> > > > > +*qsbr) {
> > > > > +
> > > > > +	struct rte_cryptodev *dev;
> > > > > +
> > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > > > +		return -EINVAL;
> > > > > +	}
> > > > > +
> > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > +	dev->qsbr = qsbr;
> > > > > +	return 0;
> > > > > +}
> > > >
> > > > So if I understand your patch correctly you propose a new working
> > > > model for
> > > > crypto-devs:
> > > > 1. Control-plane has to allocate/setup rcu_qsbr and do
> > > > rte_cryptodev_rcu_qsbr_add().
> > > > 2. Data-plane has somehow to obtain pointer to that rcu_qsbr and
> > > > wrap
> > > > cryptodev_enqueue()
> > > >    with rcu_qsbr_quiescent()  or rcu_qsbr_online()/rcu_qsbr_offline().
> > > Yes. I think, it is not a new model. It is same as RCU integration with LPM.
> > > Please refer: https://patches.dpdk.org/cover/73673/
> >
> > I am talking about new working model for crypto-dev enqueue/dequeue.
> > As I said above now it becomes data-plane thread responsibility to:
> >  -somehow to obtain pointer to that rcu_qsbr for each cryptodev it is using.
> >  -call rcu sync functions (quiescent/online/offline) on a regular basis.
> It is not on regular basis. When data plane comes up, they report online.
> They report quiescent when they are done with critical section or shared structure.

I understand that, but it means all existing apps have to be changed that way.

> All though, there is some dataplane changes involved here, I don't think, it is major.

I still think our goal here should be to make no visible changes to the dataplane.
I.E. all necessary data-plane changes need to be hidden inside CB invocation part.  

> 
> > Note that now data-plane thread would have to do that always - even if
> > there are now callbacks installed for that cryptodev queue right now.
> > All that changes behaviour of existing apps and I presume would reduce
> > adoption of  that fature.
> There is always trade off involved!
> In the previous patch, you suggested that some lazy app may not free up the
> memory allocated by add cb. For such apps, this patch has sync mechanism
> with some additional cost of CP & DP changes.

Sigh, it is not about laziness of the app.
The problem with current ethedev cb mechanism and yours V1 (which was just a clone of it) -
CP doesn't know when it is safe after CB removal to free related memory.

> > I still think all this callback mechanism should be totally opaque to data-plane
> > threads - user shouldn't change his app code depending on would some
> > enqueue/dequeue callbacks be installed or not.
> I am not sure, how that can be implemented with existing RCU design.

As I said below the simplest way - with calling rcu onine/offline inside CB invocation block.
That's why I asked you - did you try that approach and what is the perf numbers?
I presume with no callbacks installed the perf change should be nearly zero.

> @Honnappa Nagarahalli, Do you have any suggestions?
> 
> >
> > >
> > > >
> > > > That seems quite a big change and I don't think it is acceptable for
> > > > most users.
> > > > From my perspective adding/installing call-backs to the dev has to
> > > > be opaque to the data-plane code.
> > > > Also note that different callbacks can be installed by different
> > > > entities (libs) and might have no idea about each other.
> > > > That's why I thought it would be better to make all this RCU stuff
> > > > internal inside cryptodev:
> > > >     hide all this rcu_qsbr allocation/setup inside cryptod somehow to
> > obtain pointer to that rcu_qsbr ev init/queue setup
> > > >     invoke rcu_qsbr_online()/rcu_qsbr_offline() inside
> > cryptodev_enqueue().
> > > I have already tried exploring above stuffs. There are too many constraints.
> > > The changes don't fit in, as per RCU design.
> >
> > Hmm could you be more specific here - what constraints are you referring
> > to?
> >
> > > Moreover, having rcu api under enqueue_burst() will affect the
> > performance.
> >
> > It most likely will. Though my expectation it will affect performance only
> > when some callbacks are installed. My thought here:
> > callback function by itself will affect cryptdev_enqueue performance
> > anyway,
> With existing callback design, I have measured the performance(with crypto perf test) on xeon.
> It was almost negligible and same was shared with Declan.

I am asking about different thing: did you try alternate approach I described,
that wouldn't require changes in the user data-plane code.

> That is one of the reasons, I didn't want to add to many stuffs in to the callback.
> The best part of existing design is crypto lib is not much modified.
> The changes are either pushed to CP or DP.
> 
> so adding extra overhead for sync is probably ok here.

I think that extra overhead when callbacks are present is expected and
probably acceptable.
Changes in the upper-layer data-plane code - probably not.

> > Though for situation when no callbacks are installed - perfomance should be
> > left unaffected (or impact should be as small as possible).
> >
> > > The changes are more on control plane side, which is one time.
> > > The data plane changes are minimal.
> >
> > I still think upper layer data-plane code should stay unaffected (zero
> > changes).
> >
> > > >
> > > > > +
> > > > > +struct rte_cryptodev_enq_callback *
> > > > > +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> > > > > +			       uint16_t qp_id,
> > > > > +			       rte_cryptodev_enq_cb_fn cb_fn,
> > > > > +			       void *cb_arg)
> > > > > +{
> > > > > +	struct rte_cryptodev *dev;
> > > > > +	struct rte_cryptodev_enq_callback *cb, *tail;
> > > > > +
> > > > > +	if (!cb_fn)
> > > > > +		return NULL;
> > > > > +
> > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > > > +		return NULL;
> > > > > +	}
> > > > > +
> > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > +	if (qp_id >= dev->data->nb_queue_pairs) {
> > > > > +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> > > > > +		return NULL;
> > > > > +	}
> > > > > +
> > > > > +	cb = rte_zmalloc(NULL, sizeof(*cb), 0);
> > > > > +	if (cb == NULL) {
> > > > > +		CDEV_LOG_ERR("Failed to allocate memory for callback on "
> > > > > +			     "dev=%d, queue_pair_id=%d", dev_id, qp_id);
> > > > > +		rte_errno = ENOMEM;
> > > > > +		return NULL;
> > > > > +	}
> > > > > +
> > > > > +	cb->fn = cb_fn;
> > > > > +	cb->arg = cb_arg;
> > > > > +
> > > > > +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> > > > > +	if (dev->enq_cbs == NULL) {
> > > > > +		dev->enq_cbs = rte_zmalloc(NULL, sizeof(cb) *
> > > > > +					   dev->data->nb_queue_pairs, 0);
> > > > > +		if (dev->enq_cbs == NULL) {
> > > > > +			CDEV_LOG_ERR("Failed to allocate memory for
> > > > callbacks");
> > > > > +			rte_errno = ENOMEM;
> > > > > +			rte_free(cb);
> > > > > +			return NULL;
> > > > > +		}
> > > > > +	}
> > > > > +
> > > > > +	/* Add the callbacks in fifo order. */
> > > > > +	tail = dev->enq_cbs[qp_id];
> > > > > +	if (tail) {
> > > > > +		while (tail->next)
> > > > > +			tail = tail->next;
> > > > > +		tail->next = cb;
> > > > > +	} else
> > > > > +		dev->enq_cbs[qp_id] = cb;
> > > > > +
> > > > > +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > > > > +
> > > > > +	return cb;
> > > > > +}
> > > > > +
> > > > > +int
> > > > > +rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> > > > > +				  uint16_t qp_id,
> > > > > +				  struct rte_cryptodev_enq_callback *cb) {
> > > > > +	struct rte_cryptodev *dev;
> > > > > +	struct rte_cryptodev_enq_callback **prev_cb, *curr_cb;
> > > > > +	uint16_t qp;
> > > > > +	int free_mem;
> > > > > +	int ret;
> > > > > +
> > > > > +	free_mem = 1;
> > > > > +
> > > > > +	if (!cb) {
> > > > > +		CDEV_LOG_ERR("cb is NULL");
> > > > > +		return -EINVAL;
> > > > > +	}
> > > > > +
> > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > > > +		return -EINVAL;
> > > > > +	}
> > > > > +
> > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > +	if (qp_id >= dev->data->nb_queue_pairs) {
> > > > > +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> > > > > +		return -EINVAL;
> > > > > +	}
> > > > > +
> > > > > +	if (!dev->qsbr) {
> > > > > +		CDEV_LOG_ERR("Rcu qsbr is NULL");
> > > > > +		return -EINVAL;
> > > > > +	}
> > > > > +
> > > > > +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> > > > > +	if (dev->enq_cbs == NULL) {
> > > > > +		rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > > > > +		return -EINVAL;
> > > > > +	}
> > > > > +
> > > > > +	prev_cb = &dev->enq_cbs[qp_id];
> > > > > +	for (; *prev_cb != NULL; prev_cb = &curr_cb->next) {
> > > > > +		curr_cb = *prev_cb;
> > > > > +		if (curr_cb == cb) {
> > > > > +			/* Remove the user cb from the callback list. */
> > > > > +			*prev_cb = curr_cb->next;
> > > > > +			ret = 0;
> > > > > +			break;
> > > > > +		}
> > > > > +	}
> > > > > +
> > > > > +	if (!ret) {
> > > > > +		/* Call sync with invalid thread id as this is part of
> > > > > +		 * control plane API */
> > > > > +		rte_rcu_qsbr_synchronize(dev->qsbr,
> > > > > +					 RTE_QSBR_THRID_INVALID);
> > > > > +		rte_free(cb);
> > > > > +	}
> > > > > +
> > > > > +	for (qp = 0; qp < dev->data->nb_queue_pairs; qp++)
> > > > > +		if (dev->enq_cbs[qp] != NULL) {
> > > >
> > > > Some reference count (number of callbacks) seems like a better
> > > > approach here.
> > > Ok.
> > > >
> > > > > +			free_mem = 0;
> > > > > +			break;
> > > > > +		}
> > > > > +
> > > > > +	if (free_mem) {
> > > > > +		rte_free(dev->enq_cbs);
> > > > > +		dev->enq_cbs = NULL;
> > > > > +	}
> > > > > +
> > > > > +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > > > > +
> > > > > +	return ret;
> > > > > +}
> > > > > +#endif
> > > > >
> > > > >  int
> > > > >  rte_cryptodev_sym_session_init(uint8_t dev_id, diff --git
> > > > > a/lib/librte_cryptodev/rte_cryptodev.h
> > > > > b/lib/librte_cryptodev/rte_cryptodev.h
> > > > > index 7b3ebc2..2c7a47b 100644
> > > > > --- a/lib/librte_cryptodev/rte_cryptodev.h
> > > > > +++ b/lib/librte_cryptodev/rte_cryptodev.h
> > > > > @@ -530,6 +530,32 @@ struct rte_cryptodev_qp_conf {  };
> > > > >
> > > > >  /**
> > > > > + * Function type used for pre processing crypto ops when enqueue
> > > > > +burst is
> > > > > + * called.
> > > > > + *
> > > > > + * The callback function is called on enqueue burst immediately
> > > > > + * before the crypto ops are put onto the hardware queue for
> > processing.
> > > > > + *
> > > > > + * @param	dev_id		The identifier of the device.
> > > > > + * @param	qp_id		The index of the queue pair in which
> > ops are
> > > > > + *				to be enqueued for processing. The
> > value
> > > > > + *				must be in the range [0,
> > nb_queue_pairs - 1]
> > > > > + *				previously supplied to
> > > > > + *				*rte_cryptodev_configure*.
> > > > > + * @param	ops		The address of an array of *nb_ops*
> > pointers
> > > > > + *				to *rte_crypto_op* structures which
> > contain
> > > > > + *				the crypto operations to be
> > processed.
> > > > > + * @param	nb_ops		The number of operations to process.
> > > > > + * @param	user_param	The arbitrary user parameter passed
> > in by the
> > > > > + *				application when the callback was
> > originally
> > > > > + *				registered.
> > > > > + * @return			The number of ops to be enqueued
> > to the
> > > > > + *				crypto device.
> > > > > + */
> > > > > +typedef uint16_t (*rte_cryptodev_enq_cb_fn)(uint16_t dev_id,
> > > > > +uint16_t
> > > > qp_id,
> > > > > +		struct rte_crypto_op **ops, uint16_t nb_ops, void
> > > > *user_param);
> > > > > +
> > > > > +/**
> > > > >   * Typedef for application callback function to be registered by
> > application
> > > > >   * software for notification of device events
> > > > >   *
> > > > > @@ -853,7 +879,6 @@ struct rte_cryptodev_config {
> > > > >  		enum rte_cryptodev_event_type event,
> > > > >  		rte_cryptodev_cb_fn cb_fn, void *cb_arg);
> > > > >
> > > > > -
> > > > >  typedef uint16_t (*dequeue_pkt_burst_t)(void *qp,
> > > > >  		struct rte_crypto_op **ops,	uint16_t nb_ops);
> > > > >  /**< Dequeue processed packets from queue pair of a device. */ @@
> > > > > -870,6 +895,17 @@ typedef uint16_t (*enqueue_pkt_burst_t)(void
> > > > > *qp,
> > > > >  /** Structure to keep track of registered callbacks */
> > > > > TAILQ_HEAD(rte_cryptodev_cb_list, rte_cryptodev_callback);
> > > > >
> > > > > +/**
> > > > > + * @internal
> > > > > + * Structure used to hold information about the callbacks to be
> > > > > +called for a
> > > > > + * queue pair on enqueue.
> > > > > + */
> > > > > +struct rte_cryptodev_enq_callback {
> > > > > +	struct rte_cryptodev_enq_callback *next;
> > > > > +	rte_cryptodev_enq_cb_fn fn;
> > > > > +	void *arg;
> > > > > +};
> > > > > +
> > > > >  /** The data structure associated with each crypto device. */
> > > > > struct rte_cryptodev {
> > > > >  	dequeue_pkt_burst_t dequeue_burst; @@ -898,6 +934,14 @@
> > struct
> > > > > rte_cryptodev {
> > > > >  	__extension__
> > > > >  	uint8_t attached : 1;
> > > > >  	/**< Flag indicating the device is attached */
> > > > > +
> > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > > > +	struct rte_cryptodev_enq_callback **enq_cbs;
> > > > > +	/**< User application callback for pre enqueue processing */
> > > > > +
> > > > > +	struct rte_rcu_qsbr *qsbr;
> > > > > +	/** < RCU QSBR variable for rte_cryptodev_enq_callback */
> > > >
> > > > Probably better to have both these fields per queue.
> > > > Space for them can be allocated at dev_configure() or so.
> > > enq_cbs is allocated during callback add.
> > > Unlike ethdev, each cryptodev have their own max queue pair. There is no
> > macro for that.
> > > I think, single RCU should be good enough, as it has mechanism to track all
> > its reporting threads.
> > >
> > > > BTW, wouldn't it make sense to have ability to add callback for dequeue
> > too?
> > > As mentioned in the commit message, this patch was driven by a
> > requirement.
> > > If required, callback for the dequeue can be added in a separate patch.
> > > >
> > > > > +#endif
> > > > >  } __rte_cache_aligned;
> > > > >
> > > > >  void *
> > > > > @@ -1019,6 +1063,18 @@ struct rte_cryptodev_data {
> > > > >  		struct rte_crypto_op **ops, uint16_t nb_ops)  {
> > > > >  	struct rte_cryptodev *dev = &rte_cryptodevs[dev_id];
> > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > > > +	if (unlikely(dev->enq_cbs != NULL && dev->enq_cbs[qp_id] !=
> > > > NULL)) {
> > > > > +		struct rte_cryptodev_enq_callback *cb =
> > > > > +			dev->enq_cbs[qp_id];
> > > > > +
> > > > > +		do {
> > > > > +			nb_ops = cb->fn(dev_id, qp_id, ops, nb_ops,
> > > > > +					cb->arg);
> > > > > +			cb = cb->next;
> > > > > +		} while (cb != NULL);
> > > > > +	}
> > > > > +#endif
> > > > >
> > > > >  	rte_cryptodev_trace_enqueue_burst(dev_id, qp_id, (void **)ops,
> > > > nb_ops);
> > > > >  	return (*dev->enqueue_burst)(
> > > > > @@ -1351,6 +1407,102 @@ struct rte_cryptodev_asym_session *
> > > > >  	struct rte_cryptodev_sym_session *sess, union rte_crypto_sym_ofs
> > > > ofs,
> > > > >  	struct rte_crypto_sym_vec *vec);
> > > > >
> > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > > > +/**
> > > > > + * @warning
> > > > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > > > + *
> > > > > + * Add a user callback for a given crypto device and queue pair
> > > > > +which will be
> > > > > + * called on crypto ops enqueue.
> > > > > + *
> > > > > + * This API configures a function to be called for each burst of
> > > > > +crypto ops
> > > > > + * received on a given crypto device queue pair. The return value
> > > > > +is a pointer
> > > > > + * that can be used later to remove the callback using
> > > > > + * rte_cryptodev_remove_enq_callback().
> > > > > + *
> > > > > + * Multiple functions are called in the order that they are added.
> > > > > + *
> > > > > + * @param	dev_id		The identifier of the device.
> > > > > + * @param	qp_id		The index of the queue pair in which
> > ops are
> > > > > + *				to be enqueued for processing. The
> > value
> > > > > + *				must be in the range [0,
> > nb_queue_pairs - 1]
> > > > > + *				previously supplied to
> > > > > + *				*rte_cryptodev_configure*.
> > > > > + * @param	cb_fn		The callback function
> > > > > + * @param	cb_arg		A generic pointer parameter which
> > will be
> > > > passed
> > > > > + *				to each invocation of the callback
> > function on
> > > > > + *				this crypto device and queue pair.
> > > > > + *
> > > > > + * @return
> > > > > + *   NULL on error.
> > > > > + *   On success, a pointer value which can later be used to remove the
> > > > callback.
> > > > > + */
> > > > > +
> > > > > +__rte_experimental
> > > > > +struct rte_cryptodev_enq_callback *
> > > > > +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> > > > > +			       uint16_t qp_id,
> > > > > +			       rte_cryptodev_enq_cb_fn cb_fn,
> > > > > +			       void *cb_arg);
> > > > > +
> > > > > +
> > > > > +/**
> > > > > + * @warning
> > > > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > > > + *
> > > > > + * Remove a user callback function for given crypto device and queue
> > pair.
> > > > > + *
> > > > > + * This function is used to removed callbacks that were added to
> > > > > +a crypto
> > > > > + * device queue pair using rte_cryptodev_add_enq_callback().
> > > > > + *
> > > > > + * Note: The callback expects a RCU QSBR to be configured to
> > > > > +synchronize
> > > > > + * to free the memory. Application is expected to configure RCU
> > > > > +QSBR after
> > > > > + * adding an enqueue callback.
> > > > > + *
> > > > > + *
> > > > > + * @param	dev_id		The identifier of the device.
> > > > > + * @param	qp_id		The index of the queue pair in which
> > ops are
> > > > > + *				to be enqueued for processing. The
> > value
> > > > > + *				must be in the range [0,
> > nb_queue_pairs - 1]
> > > > > + *				previously supplied to
> > > > > + *				*rte_cryptodev_configure*.
> > > > > + * @param	cb		Pointer to user supplied callback
> > created via
> > > > > + *				rte_cryptodev_add_enq_callback().
> > > > > + *
> > > > > + * @return
> > > > > + *   - 0: Success. Callback was removed.
> > > > > + *   - -EINVAL:  The dev_id or the qp_id is out of range, or the callback
> > > > > + *               is NULL or not found for the crypto device queue pair.
> > > > > + */
> > > > > +
> > > > > +__rte_experimental
> > > > > +int rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> > > > > +				  uint16_t qp_id,
> > > > > +				  struct rte_cryptodev_enq_callback *cb);
> > > > > +
> > > > > +
> > > > > +/**
> > > > > + * @warning
> > > > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > > > + *
> > > > > + * Associate RCU QSBR variable with a cryptodev.
> > > > > + *
> > > > > + * This function is used to add RCU QSBR to a crypto device.
> > > > > + * The purpose of RCU is to help multiple threads to synchronize
> > > > > + * with each other before initiating adding/removing callback
> > > > > + * while dataplane threads are running enqueue callbacks.
> > > > > + *
> > > > > + * @param	dev_id		The identifier of the device.
> > > > > + * @param	qsr		RCU QSBR configuration
> > > > > + * @return
> > > > > + *   On success - 0
> > > > > + *   On error - EINVAL.
> > > > > + */
> > > > > +
> > > > > +__rte_experimental
> > > > > +int rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct
> > > > > +rte_rcu_qsbr *qsbr); #endif
> > > > > +
> > > > >  #ifdef __cplusplus
> > > > >  }
> > > > >  #endif
> > > > > diff --git a/lib/librte_cryptodev/rte_cryptodev_version.map
> > > > > b/lib/librte_cryptodev/rte_cryptodev_version.map
> > > > > index 02f6dcf..46de3ca 100644
> > > > > --- a/lib/librte_cryptodev/rte_cryptodev_version.map
> > > > > +++ b/lib/librte_cryptodev/rte_cryptodev_version.map
> > > > > @@ -64,6 +64,7 @@ DPDK_20.0 {
> > > > >  	rte_cryptodev_sym_capability_get;  };
> > > > >
> > > > > +
> > > > >  EXPERIMENTAL {
> > > > >  	global:
> > > > >
> > > > > @@ -105,4 +106,9 @@ EXPERIMENTAL {
> > > > >
> > > > >  	# added in 20.08
> > > > >  	rte_cryptodev_get_qp_status;
> > > > > +
> > > > > +	# added in 20.11
> > > > > +	rte_cryptodev_add_enq_callback;
> > > > > +	rte_cryptodev_remove_enq_callback;
> > > > > +	rte_cryptodev_rcu_qsbr_add;
> > > > >  };
> > > > > --
> > > > > 1.9.1
Honnappa Nagarahalli Sept. 23, 2020, 3:25 a.m. UTC | #9
<snip>

> 
> > > > > >
> > > > > > In an eventdev world, multiple workers (with ordered queue)
> > > > > > will be working on IPsec ESP processing. The ESP header's
> > > > > > sequence number is unique and has to be sequentially
> > > > > > incremented in an orderly
> > > manner.
> > > > > > This rises a need for incrementing sequence number in crypto
> > > > > > stage especially in event crypto adapter. By adding a user
> > > > > > callback to cryptodev at enqueue burst, the user callback will
> > > > > > get executed in the context of event crypto adapter. This
> > > > > > helps the application to increment the ESP sequence number
> atomically and orderly manner.
> > > > > >
> > > > > > This patch adds APIs to add/remove callback functions. The
> > > > > > callback function will be called for each burst of crypto ops
> > > > > > received on a given crypto device queue pair.
> > > > > >
> > > > > > v1->v2:
> > > > > > Moved callback related members to the end of cryptodev struct
> > > > > > Added support for RCU
> > > > > >
> > > > > > Signed-off-by: Abhinandan Gujjar <abhinandan.gujjar@intel.com>
> > > > > > ---
> > > > > >  config/common_base                             |   1 +
> > > > > >  lib/librte_cryptodev/Makefile                  |   2 +-
> > > > > >  lib/librte_cryptodev/rte_cryptodev.c           | 157
> > > > > +++++++++++++++++++++++++
> > > > > >  lib/librte_cryptodev/rte_cryptodev.h           | 154
> > > > > +++++++++++++++++++++++-
> > > > > >  lib/librte_cryptodev/rte_cryptodev_version.map |   6 +
> > > > > >  5 files changed, 318 insertions(+), 2 deletions(-)
> > > > > >
> > > > > > diff --git a/config/common_base b/config/common_base index
> > > > > > fbf0ee7..f5ebde4 100644
> > > > > > --- a/config/common_base
> > > > > > +++ b/config/common_base
> > > > > > @@ -599,6 +599,7 @@
> > > > > CONFIG_RTE_LIBRTE_PMD_BBDEV_FPGA_5GNR_FEC=y
> > > > > >  #
> > > > > >  CONFIG_RTE_LIBRTE_CRYPTODEV=y
> > > > > >  CONFIG_RTE_CRYPTO_MAX_DEVS=64
> > > > > > +CONFIG_RTE_CRYPTODEV_CALLBACKS=y
> > > > > >
> > > > > >  #
> > > > > >  # Compile PMD for ARMv8 Crypto device diff --git
> > > > > > a/lib/librte_cryptodev/Makefile
> > > > > > b/lib/librte_cryptodev/Makefile index
> > > > > > 73e77a2..514d552 100644
> > > > > > --- a/lib/librte_cryptodev/Makefile
> > > > > > +++ b/lib/librte_cryptodev/Makefile
> > > > > > @@ -10,7 +10,7 @@ LIB = librte_cryptodev.a  CFLAGS += -O3
> > > > > > CFLAGS
> > > > > > +=
> > > > > > $(WERROR_FLAGS)  LDLIBS += -lrte_eal -lrte_mempool -lrte_ring
> > > > > > -lrte_mbuf -LDLIBS += -lrte_kvargs
> > > > > > +LDLIBS += -lrte_kvargs -lrte_rcu
> > > > > >
> > > > > >  # library source files
> > > > > >  SRCS-y += rte_cryptodev.c rte_cryptodev_pmd.c
> > > > > > cryptodev_trace_points.c diff --git
> > > > > > a/lib/librte_cryptodev/rte_cryptodev.c
> > > > > > b/lib/librte_cryptodev/rte_cryptodev.c
> > > > > > index 1dd795b..2fb3e35 100644
> > > > > > --- a/lib/librte_cryptodev/rte_cryptodev.c
> > > > > > +++ b/lib/librte_cryptodev/rte_cryptodev.c
> > > > > > @@ -38,6 +38,7 @@
> > > > > >  #include <rte_string_fns.h>
> > > > > >  #include <rte_compat.h>
> > > > > >  #include <rte_function_versioning.h>
> > > > > > +#include <rte_rcu_qsbr.h>
> > > > > >
> > > > > >  #include "rte_crypto.h"
> > > > > >  #include "rte_cryptodev.h"
> > > > > > @@ -499,6 +500,10 @@ struct
> > > > > rte_cryptodev_sym_session_pool_private_data {
> > > > > >  	return 0;
> > > > > >  }
> > > > > >
> > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > > > > +/* spinlock for crypto device enq callbacks */ static
> > > > > > +rte_spinlock_t rte_cryptodev_enq_cb_lock =
> > > > > > +RTE_SPINLOCK_INITIALIZER; #endif
> > > > > >
> > > > > >  const char *
> > > > > >  rte_cryptodev_get_feature_name(uint64_t flag) @@ -1449,6
> > > > > > +1454,158
> > > > > @@
> > > > > > struct rte_cryptodev *
> > > > > >  	rte_spinlock_unlock(&rte_cryptodev_cb_lock);
> > > > > >  }
> > > > > >
> > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS int
> > > > > > +rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct
> > > > > > +rte_rcu_qsbr
> > > > > > +*qsbr) {
> > > > > > +
> > > > > > +	struct rte_cryptodev *dev;
> > > > > > +
> > > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > > > > +		return -EINVAL;
> > > > > > +	}
> > > > > > +
> > > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > > +	dev->qsbr = qsbr;
> > > > > > +	return 0;
> > > > > > +}
> > > > >
> > > > > So if I understand your patch correctly you propose a new
> > > > > working model for
> > > > > crypto-devs:
> > > > > 1. Control-plane has to allocate/setup rcu_qsbr and do
> > > > > rte_cryptodev_rcu_qsbr_add().
> > > > > 2. Data-plane has somehow to obtain pointer to that rcu_qsbr and
> > > > > wrap
> > > > > cryptodev_enqueue()
> > > > >    with rcu_qsbr_quiescent()  or rcu_qsbr_online()/rcu_qsbr_offline().
> > > > Yes. I think, it is not a new model. It is same as RCU integration with
> LPM.
> > > > Please refer: https://patches.dpdk.org/cover/73673/
> > >
> > > I am talking about new working model for crypto-dev enqueue/dequeue.
> > > As I said above now it becomes data-plane thread responsibility to:
> > >  -somehow to obtain pointer to that rcu_qsbr for each cryptodev it is
> using.
> > >  -call rcu sync functions (quiescent/online/offline) on a regular basis.
> > It is not on regular basis. When data plane comes up, they report online.
> > They report quiescent when they are done with critical section or shared
> structure.
> 
> I understand that, but it means all existing apps have to be changed that way.
> 
> > All though, there is some dataplane changes involved here, I don't think, it
> is major.
> 
> I still think our goal here should be to make no visible changes to the
> dataplane.
> I.E. all necessary data-plane changes need to be hidden inside CB invocation
> part.
Please note that this is being implemented using the memory reclamation framework documented at https://doc.dpdk.org/guides/prog_guide/rcu_lib.html#resource-reclamation-framework-for-dpdk

While using RCU there are couple of trade-offs that applications have to consider:
1) Performance - reporting the quiescent state too often results in performance impact on data plane
2) Amount of outstanding memory to reclaim - reporting less often results in more outstanding memory to reclaim

Hence, the quiescent state reporting is left to the application. The application decides how often it reports the quiescent state and has control over the data plane performance and the outstanding memory to reclaim.

When you say "new working model for crypto-dev enqueue/dequeue",

1) are you comparing these with existing crypto-dev enqueue/dequeue APIs? If yes, these are new APIs, it is not breaking anything.
2) are you comparing these with existing call back functions in ethdev enqueue/dequeue APIs? If yes, agree that this is a new model. But, it is possible to support what ethdev supports along with the RCU method used in this patch.

> 
> >
> > > Note that now data-plane thread would have to do that always - even
> > > if there are now callbacks installed for that cryptodev queue right now.
> > > All that changes behaviour of existing apps and I presume would
> > > reduce adoption of  that fature.
If I understand this correct, you are talking about a case where in the application might be registering/unregistering multiple times during its lifetime. In this case, yes, the application might be reporting the quiescent state even when it has not registered the call backs. But, it has the flexibility to not report it if it implements additional logic.
Note that we are assuming that the application has to report quiescent state only for using callback functions. Most probably the application has other requirements to use RCU.
Why not support what is done for ethdev call back functions along with providing RCU method? 

> > There is always trade off involved!
> > In the previous patch, you suggested that some lazy app may not free
> > up the memory allocated by add cb. For such apps, this patch has sync
> > mechanism with some additional cost of CP & DP changes.
> 
> Sigh, it is not about laziness of the app.
> The problem with current ethedev cb mechanism and yours V1 (which was
> just a clone of it) - CP doesn't know when it is safe after CB removal to free
> related memory.
> 
> > > I still think all this callback mechanism should be totally opaque
> > > to data-plane threads - user shouldn't change his app code depending
> > > on would some enqueue/dequeue callbacks be installed or not.
> > I am not sure, how that can be implemented with existing RCU design.
> 
> As I said below the simplest way - with calling rcu onine/offline inside CB
> invocation block.
> That's why I asked you - did you try that approach and what is the perf
> numbers?
> I presume with no callbacks installed the perf change should be nearly zero.
> 
> > @Honnappa Nagarahalli, Do you have any suggestions?
Reporting quiescent state in the call back functions has several disadvantages:
1) it will have performance impacts and the impacts will increase as the number of data plane threads increase.
2) It will require additional configuration parameters to control how often the quiescent state is reported to control the performance impact.
3) Does not take advantage of the fact that most probably the application is using RCU already
4) There are few difficulties as well, please see below.

> >
> > >
> > > >
> > > > >
> > > > > That seems quite a big change and I don't think it is acceptable
> > > > > for most users.
> > > > > From my perspective adding/installing call-backs to the dev has
> > > > > to be opaque to the data-plane code.
> > > > > Also note that different callbacks can be installed by different
> > > > > entities (libs) and might have no idea about each other.
> > > > > That's why I thought it would be better to make all this RCU
> > > > > stuff internal inside cryptodev:
> > > > >     hide all this rcu_qsbr allocation/setup inside cryptod
> > > > > somehow to
> > > obtain pointer to that rcu_qsbr ev init/queue setup
> > > > >     invoke rcu_qsbr_online()/rcu_qsbr_offline() inside
> > > cryptodev_enqueue().
This will bring in the application related information such as the thread ID into the library. If the same API calls are being made from multiple data plane threads, you need a way to configure that information to the library. So, it is better to leave those details for the application to handle.

> > > > I have already tried exploring above stuffs. There are too many
> constraints.
> > > > The changes don't fit in, as per RCU design.
> > >
> > > Hmm could you be more specific here - what constraints are you
> > > referring to?
> > >
> > > > Moreover, having rcu api under enqueue_burst() will affect the
> > > performance.
> > >
> > > It most likely will. Though my expectation it will affect
> > > performance only when some callbacks are installed. My thought here:
> > > callback function by itself will affect cryptdev_enqueue performance
> > > anyway,
> > With existing callback design, I have measured the performance(with
> crypto perf test) on xeon.
> > It was almost negligible and same was shared with Declan.
> 
> I am asking about different thing: did you try alternate approach I described,
> that wouldn't require changes in the user data-plane code.
> 
> > That is one of the reasons, I didn't want to add to many stuffs in to the
> callback.
> > The best part of existing design is crypto lib is not much modified.
> > The changes are either pushed to CP or DP.
> >
> > so adding extra overhead for sync is probably ok here.
> 
> I think that extra overhead when callbacks are present is expected and
> probably acceptable.
> Changes in the upper-layer data-plane code - probably not.
> 
> > > Though for situation when no callbacks are installed - perfomance
> > > should be left unaffected (or impact should be as small as possible).
> > >
> > > > The changes are more on control plane side, which is one time.
> > > > The data plane changes are minimal.
> > >
> > > I still think upper layer data-plane code should stay unaffected
> > > (zero changes).
> > >
> > > > >
> > > > > > +
> > > > > > +struct rte_cryptodev_enq_callback *
> > > > > > +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> > > > > > +			       uint16_t qp_id,
> > > > > > +			       rte_cryptodev_enq_cb_fn cb_fn,
> > > > > > +			       void *cb_arg)
> > > > > > +{
> > > > > > +	struct rte_cryptodev *dev;
> > > > > > +	struct rte_cryptodev_enq_callback *cb, *tail;
> > > > > > +
> > > > > > +	if (!cb_fn)
> > > > > > +		return NULL;
> > > > > > +
> > > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > > > > +		return NULL;
> > > > > > +	}
> > > > > > +
> > > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > > +	if (qp_id >= dev->data->nb_queue_pairs) {
> > > > > > +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> > > > > > +		return NULL;
> > > > > > +	}
> > > > > > +
> > > > > > +	cb = rte_zmalloc(NULL, sizeof(*cb), 0);
> > > > > > +	if (cb == NULL) {
> > > > > > +		CDEV_LOG_ERR("Failed to allocate memory for
> callback on "
> > > > > > +			     "dev=%d, queue_pair_id=%d", dev_id,
> qp_id);
> > > > > > +		rte_errno = ENOMEM;
> > > > > > +		return NULL;
> > > > > > +	}
> > > > > > +
> > > > > > +	cb->fn = cb_fn;
> > > > > > +	cb->arg = cb_arg;
> > > > > > +
> > > > > > +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> > > > > > +	if (dev->enq_cbs == NULL) {
> > > > > > +		dev->enq_cbs = rte_zmalloc(NULL, sizeof(cb) *
> > > > > > +					   dev->data-
> >nb_queue_pairs, 0);
> > > > > > +		if (dev->enq_cbs == NULL) {
> > > > > > +			CDEV_LOG_ERR("Failed to allocate memory
> for
> > > > > callbacks");
> > > > > > +			rte_errno = ENOMEM;
> > > > > > +			rte_free(cb);
> > > > > > +			return NULL;
> > > > > > +		}
> > > > > > +	}
> > > > > > +
> > > > > > +	/* Add the callbacks in fifo order. */
> > > > > > +	tail = dev->enq_cbs[qp_id];
> > > > > > +	if (tail) {
> > > > > > +		while (tail->next)
> > > > > > +			tail = tail->next;
> > > > > > +		tail->next = cb;
> > > > > > +	} else
> > > > > > +		dev->enq_cbs[qp_id] = cb;
> > > > > > +
> > > > > > +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > > > > > +
> > > > > > +	return cb;
> > > > > > +}
> > > > > > +
> > > > > > +int
> > > > > > +rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> > > > > > +				  uint16_t qp_id,
> > > > > > +				  struct rte_cryptodev_enq_callback
> *cb) {
> > > > > > +	struct rte_cryptodev *dev;
> > > > > > +	struct rte_cryptodev_enq_callback **prev_cb, *curr_cb;
> > > > > > +	uint16_t qp;
> > > > > > +	int free_mem;
> > > > > > +	int ret;
> > > > > > +
> > > > > > +	free_mem = 1;
> > > > > > +
> > > > > > +	if (!cb) {
> > > > > > +		CDEV_LOG_ERR("cb is NULL");
> > > > > > +		return -EINVAL;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > > > > +		return -EINVAL;
> > > > > > +	}
> > > > > > +
> > > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > > +	if (qp_id >= dev->data->nb_queue_pairs) {
> > > > > > +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> > > > > > +		return -EINVAL;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (!dev->qsbr) {
> > > > > > +		CDEV_LOG_ERR("Rcu qsbr is NULL");
> > > > > > +		return -EINVAL;
> > > > > > +	}
> > > > > > +
> > > > > > +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> > > > > > +	if (dev->enq_cbs == NULL) {
> > > > > > +		rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > > > > > +		return -EINVAL;
> > > > > > +	}
> > > > > > +
> > > > > > +	prev_cb = &dev->enq_cbs[qp_id];
> > > > > > +	for (; *prev_cb != NULL; prev_cb = &curr_cb->next) {
> > > > > > +		curr_cb = *prev_cb;
> > > > > > +		if (curr_cb == cb) {
> > > > > > +			/* Remove the user cb from the callback list.
> */
> > > > > > +			*prev_cb = curr_cb->next;
> > > > > > +			ret = 0;
> > > > > > +			break;
> > > > > > +		}
> > > > > > +	}
> > > > > > +
> > > > > > +	if (!ret) {
> > > > > > +		/* Call sync with invalid thread id as this is part of
> > > > > > +		 * control plane API */
> > > > > > +		rte_rcu_qsbr_synchronize(dev->qsbr,
> > > > > > +					 RTE_QSBR_THRID_INVALID);
> > > > > > +		rte_free(cb);
> > > > > > +	}
> > > > > > +
> > > > > > +	for (qp = 0; qp < dev->data->nb_queue_pairs; qp++)
> > > > > > +		if (dev->enq_cbs[qp] != NULL) {
> > > > >
> > > > > Some reference count (number of callbacks) seems like a better
> > > > > approach here.
> > > > Ok.
> > > > >
> > > > > > +			free_mem = 0;
> > > > > > +			break;
> > > > > > +		}
> > > > > > +
> > > > > > +	if (free_mem) {
> > > > > > +		rte_free(dev->enq_cbs);
> > > > > > +		dev->enq_cbs = NULL;
> > > > > > +	}
> > > > > > +
> > > > > > +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > > > > > +
> > > > > > +	return ret;
> > > > > > +}
> > > > > > +#endif
> > > > > >
> > > > > >  int
> > > > > >  rte_cryptodev_sym_session_init(uint8_t dev_id, diff --git
> > > > > > a/lib/librte_cryptodev/rte_cryptodev.h
> > > > > > b/lib/librte_cryptodev/rte_cryptodev.h
> > > > > > index 7b3ebc2..2c7a47b 100644
> > > > > > --- a/lib/librte_cryptodev/rte_cryptodev.h
> > > > > > +++ b/lib/librte_cryptodev/rte_cryptodev.h
> > > > > > @@ -530,6 +530,32 @@ struct rte_cryptodev_qp_conf {  };
> > > > > >
> > > > > >  /**
> > > > > > + * Function type used for pre processing crypto ops when
> > > > > > +enqueue burst is
> > > > > > + * called.
> > > > > > + *
> > > > > > + * The callback function is called on enqueue burst
> > > > > > +immediately
> > > > > > + * before the crypto ops are put onto the hardware queue for
> > > processing.
> > > > > > + *
> > > > > > + * @param	dev_id		The identifier of the device.
> > > > > > + * @param	qp_id		The index of the queue pair in which
> > > ops are
> > > > > > + *				to be enqueued for processing. The
> > > value
> > > > > > + *				must be in the range [0,
> > > nb_queue_pairs - 1]
> > > > > > + *				previously supplied to
> > > > > > + *				*rte_cryptodev_configure*.
> > > > > > + * @param	ops		The address of an array of *nb_ops*
> > > pointers
> > > > > > + *				to *rte_crypto_op* structures which
> > > contain
> > > > > > + *				the crypto operations to be
> > > processed.
> > > > > > + * @param	nb_ops		The number of operations to process.
> > > > > > + * @param	user_param	The arbitrary user parameter passed
> > > in by the
> > > > > > + *				application when the callback was
> > > originally
> > > > > > + *				registered.
> > > > > > + * @return			The number of ops to be enqueued
> > > to the
> > > > > > + *				crypto device.
> > > > > > + */
> > > > > > +typedef uint16_t (*rte_cryptodev_enq_cb_fn)(uint16_t dev_id,
> > > > > > +uint16_t
> > > > > qp_id,
> > > > > > +		struct rte_crypto_op **ops, uint16_t nb_ops, void
> > > > > *user_param);
> > > > > > +
> > > > > > +/**
> > > > > >   * Typedef for application callback function to be registered
> > > > > > by
> > > application
> > > > > >   * software for notification of device events
> > > > > >   *
> > > > > > @@ -853,7 +879,6 @@ struct rte_cryptodev_config {
> > > > > >  		enum rte_cryptodev_event_type event,
> > > > > >  		rte_cryptodev_cb_fn cb_fn, void *cb_arg);
> > > > > >
> > > > > > -
> > > > > >  typedef uint16_t (*dequeue_pkt_burst_t)(void *qp,
> > > > > >  		struct rte_crypto_op **ops,	uint16_t nb_ops);
> > > > > >  /**< Dequeue processed packets from queue pair of a device.
> > > > > > */ @@
> > > > > > -870,6 +895,17 @@ typedef uint16_t (*enqueue_pkt_burst_t)(void
> > > > > > *qp,
> > > > > >  /** Structure to keep track of registered callbacks */
> > > > > > TAILQ_HEAD(rte_cryptodev_cb_list, rte_cryptodev_callback);
> > > > > >
> > > > > > +/**
> > > > > > + * @internal
> > > > > > + * Structure used to hold information about the callbacks to
> > > > > > +be called for a
> > > > > > + * queue pair on enqueue.
> > > > > > + */
> > > > > > +struct rte_cryptodev_enq_callback {
> > > > > > +	struct rte_cryptodev_enq_callback *next;
> > > > > > +	rte_cryptodev_enq_cb_fn fn;
> > > > > > +	void *arg;
> > > > > > +};
> > > > > > +
> > > > > >  /** The data structure associated with each crypto device. */
> > > > > > struct rte_cryptodev {
> > > > > >  	dequeue_pkt_burst_t dequeue_burst; @@ -898,6 +934,14
> @@
> > > struct
> > > > > > rte_cryptodev {
> > > > > >  	__extension__
> > > > > >  	uint8_t attached : 1;
> > > > > >  	/**< Flag indicating the device is attached */
> > > > > > +
> > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > > > > +	struct rte_cryptodev_enq_callback **enq_cbs;
> > > > > > +	/**< User application callback for pre enqueue processing */
> > > > > > +
> > > > > > +	struct rte_rcu_qsbr *qsbr;
> > > > > > +	/** < RCU QSBR variable for rte_cryptodev_enq_callback */
> > > > >
> > > > > Probably better to have both these fields per queue.
> > > > > Space for them can be allocated at dev_configure() or so.
> > > > enq_cbs is allocated during callback add.
> > > > Unlike ethdev, each cryptodev have their own max queue pair. There
> > > > is no
> > > macro for that.
> > > > I think, single RCU should be good enough, as it has mechanism to
> > > > track all
> > > its reporting threads.
> > > >
> > > > > BTW, wouldn't it make sense to have ability to add callback for
> > > > > dequeue
> > > too?
> > > > As mentioned in the commit message, this patch was driven by a
> > > requirement.
> > > > If required, callback for the dequeue can be added in a separate patch.
> > > > >
> > > > > > +#endif
> > > > > >  } __rte_cache_aligned;
> > > > > >
> > > > > >  void *
> > > > > > @@ -1019,6 +1063,18 @@ struct rte_cryptodev_data {
> > > > > >  		struct rte_crypto_op **ops, uint16_t nb_ops)  {
> > > > > >  	struct rte_cryptodev *dev = &rte_cryptodevs[dev_id];
> > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > > > > +	if (unlikely(dev->enq_cbs != NULL && dev-
> >enq_cbs[qp_id] !=
> > > > > NULL)) {
> > > > > > +		struct rte_cryptodev_enq_callback *cb =
> > > > > > +			dev->enq_cbs[qp_id];
> > > > > > +
> > > > > > +		do {
> > > > > > +			nb_ops = cb->fn(dev_id, qp_id, ops, nb_ops,
> > > > > > +					cb->arg);
> > > > > > +			cb = cb->next;
> > > > > > +		} while (cb != NULL);
> > > > > > +	}
> > > > > > +#endif
> > > > > >
> > > > > >  	rte_cryptodev_trace_enqueue_burst(dev_id, qp_id, (void
> > > > > > **)ops,
> > > > > nb_ops);
> > > > > >  	return (*dev->enqueue_burst)( @@ -1351,6 +1407,102 @@
> struct
> > > > > > rte_cryptodev_asym_session *
> > > > > >  	struct rte_cryptodev_sym_session *sess, union
> > > > > > rte_crypto_sym_ofs
> > > > > ofs,
> > > > > >  	struct rte_crypto_sym_vec *vec);
> > > > > >
> > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > > > > +/**
> > > > > > + * @warning
> > > > > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > > > > + *
> > > > > > + * Add a user callback for a given crypto device and queue
> > > > > > +pair which will be
> > > > > > + * called on crypto ops enqueue.
> > > > > > + *
> > > > > > + * This API configures a function to be called for each burst
> > > > > > +of crypto ops
> > > > > > + * received on a given crypto device queue pair. The return
> > > > > > +value is a pointer
> > > > > > + * that can be used later to remove the callback using
> > > > > > + * rte_cryptodev_remove_enq_callback().
> > > > > > + *
> > > > > > + * Multiple functions are called in the order that they are added.
> > > > > > + *
> > > > > > + * @param	dev_id		The identifier of the device.
> > > > > > + * @param	qp_id		The index of the queue pair in which
> > > ops are
> > > > > > + *				to be enqueued for processing. The
> > > value
> > > > > > + *				must be in the range [0,
> > > nb_queue_pairs - 1]
> > > > > > + *				previously supplied to
> > > > > > + *				*rte_cryptodev_configure*.
> > > > > > + * @param	cb_fn		The callback function
> > > > > > + * @param	cb_arg		A generic pointer parameter which
> > > will be
> > > > > passed
> > > > > > + *				to each invocation of the callback
> > > function on
> > > > > > + *				this crypto device and queue pair.
> > > > > > + *
> > > > > > + * @return
> > > > > > + *   NULL on error.
> > > > > > + *   On success, a pointer value which can later be used to remove
> the
> > > > > callback.
> > > > > > + */
> > > > > > +
> > > > > > +__rte_experimental
> > > > > > +struct rte_cryptodev_enq_callback *
> > > > > > +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> > > > > > +			       uint16_t qp_id,
> > > > > > +			       rte_cryptodev_enq_cb_fn cb_fn,
> > > > > > +			       void *cb_arg);
> > > > > > +
> > > > > > +
> > > > > > +/**
> > > > > > + * @warning
> > > > > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > > > > + *
> > > > > > + * Remove a user callback function for given crypto device
> > > > > > +and queue
> > > pair.
> > > > > > + *
> > > > > > + * This function is used to removed callbacks that were added
> > > > > > +to a crypto
> > > > > > + * device queue pair using rte_cryptodev_add_enq_callback().
> > > > > > + *
> > > > > > + * Note: The callback expects a RCU QSBR to be configured to
> > > > > > +synchronize
> > > > > > + * to free the memory. Application is expected to configure
> > > > > > +RCU QSBR after
> > > > > > + * adding an enqueue callback.
> > > > > > + *
> > > > > > + *
> > > > > > + * @param	dev_id		The identifier of the device.
> > > > > > + * @param	qp_id		The index of the queue pair in which
> > > ops are
> > > > > > + *				to be enqueued for processing. The
> > > value
> > > > > > + *				must be in the range [0,
> > > nb_queue_pairs - 1]
> > > > > > + *				previously supplied to
> > > > > > + *				*rte_cryptodev_configure*.
> > > > > > + * @param	cb		Pointer to user supplied callback
> > > created via
> > > > > > + *				rte_cryptodev_add_enq_callback().
> > > > > > + *
> > > > > > + * @return
> > > > > > + *   - 0: Success. Callback was removed.
> > > > > > + *   - -EINVAL:  The dev_id or the qp_id is out of range, or the
> callback
> > > > > > + *               is NULL or not found for the crypto device queue pair.
> > > > > > + */
> > > > > > +
> > > > > > +__rte_experimental
> > > > > > +int rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> > > > > > +				  uint16_t qp_id,
> > > > > > +				  struct rte_cryptodev_enq_callback
> *cb);
> > > > > > +
> > > > > > +
> > > > > > +/**
> > > > > > + * @warning
> > > > > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > > > > + *
> > > > > > + * Associate RCU QSBR variable with a cryptodev.
> > > > > > + *
> > > > > > + * This function is used to add RCU QSBR to a crypto device.
> > > > > > + * The purpose of RCU is to help multiple threads to
> > > > > > +synchronize
> > > > > > + * with each other before initiating adding/removing callback
> > > > > > + * while dataplane threads are running enqueue callbacks.
> > > > > > + *
> > > > > > + * @param	dev_id		The identifier of the device.
> > > > > > + * @param	qsr		RCU QSBR configuration
> > > > > > + * @return
> > > > > > + *   On success - 0
> > > > > > + *   On error - EINVAL.
> > > > > > + */
> > > > > > +
> > > > > > +__rte_experimental
> > > > > > +int rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct
> > > > > > +rte_rcu_qsbr *qsbr); #endif
> > > > > > +
> > > > > >  #ifdef __cplusplus
> > > > > >  }
> > > > > >  #endif
> > > > > > diff --git a/lib/librte_cryptodev/rte_cryptodev_version.map
> > > > > > b/lib/librte_cryptodev/rte_cryptodev_version.map
> > > > > > index 02f6dcf..46de3ca 100644
> > > > > > --- a/lib/librte_cryptodev/rte_cryptodev_version.map
> > > > > > +++ b/lib/librte_cryptodev/rte_cryptodev_version.map
> > > > > > @@ -64,6 +64,7 @@ DPDK_20.0 {
> > > > > >  	rte_cryptodev_sym_capability_get;  };
> > > > > >
> > > > > > +
> > > > > >  EXPERIMENTAL {
> > > > > >  	global:
> > > > > >
> > > > > > @@ -105,4 +106,9 @@ EXPERIMENTAL {
> > > > > >
> > > > > >  	# added in 20.08
> > > > > >  	rte_cryptodev_get_qp_status;
> > > > > > +
> > > > > > +	# added in 20.11
> > > > > > +	rte_cryptodev_add_enq_callback;
> > > > > > +	rte_cryptodev_remove_enq_callback;
> > > > > > +	rte_cryptodev_rcu_qsbr_add;
> > > > > >  };
> > > > > > --
> > > > > > 1.9.1
Ananyev, Konstantin Sept. 23, 2020, 11:58 a.m. UTC | #10
> > > > > > >
> > > > > > > In an eventdev world, multiple workers (with ordered queue)
> > > > > > > will be working on IPsec ESP processing. The ESP header's
> > > > > > > sequence number is unique and has to be sequentially
> > > > > > > incremented in an orderly
> > > > manner.
> > > > > > > This rises a need for incrementing sequence number in crypto
> > > > > > > stage especially in event crypto adapter. By adding a user
> > > > > > > callback to cryptodev at enqueue burst, the user callback will
> > > > > > > get executed in the context of event crypto adapter. This
> > > > > > > helps the application to increment the ESP sequence number
> > atomically and orderly manner.
> > > > > > >
> > > > > > > This patch adds APIs to add/remove callback functions. The
> > > > > > > callback function will be called for each burst of crypto ops
> > > > > > > received on a given crypto device queue pair.
> > > > > > >
> > > > > > > v1->v2:
> > > > > > > Moved callback related members to the end of cryptodev struct
> > > > > > > Added support for RCU
> > > > > > >
> > > > > > > Signed-off-by: Abhinandan Gujjar <abhinandan.gujjar@intel.com>
> > > > > > > ---
> > > > > > >  config/common_base                             |   1 +
> > > > > > >  lib/librte_cryptodev/Makefile                  |   2 +-
> > > > > > >  lib/librte_cryptodev/rte_cryptodev.c           | 157
> > > > > > +++++++++++++++++++++++++
> > > > > > >  lib/librte_cryptodev/rte_cryptodev.h           | 154
> > > > > > +++++++++++++++++++++++-
> > > > > > >  lib/librte_cryptodev/rte_cryptodev_version.map |   6 +
> > > > > > >  5 files changed, 318 insertions(+), 2 deletions(-)
> > > > > > >
> > > > > > > diff --git a/config/common_base b/config/common_base index
> > > > > > > fbf0ee7..f5ebde4 100644
> > > > > > > --- a/config/common_base
> > > > > > > +++ b/config/common_base
> > > > > > > @@ -599,6 +599,7 @@
> > > > > > CONFIG_RTE_LIBRTE_PMD_BBDEV_FPGA_5GNR_FEC=y
> > > > > > >  #
> > > > > > >  CONFIG_RTE_LIBRTE_CRYPTODEV=y
> > > > > > >  CONFIG_RTE_CRYPTO_MAX_DEVS=64
> > > > > > > +CONFIG_RTE_CRYPTODEV_CALLBACKS=y
> > > > > > >
> > > > > > >  #
> > > > > > >  # Compile PMD for ARMv8 Crypto device diff --git
> > > > > > > a/lib/librte_cryptodev/Makefile
> > > > > > > b/lib/librte_cryptodev/Makefile index
> > > > > > > 73e77a2..514d552 100644
> > > > > > > --- a/lib/librte_cryptodev/Makefile
> > > > > > > +++ b/lib/librte_cryptodev/Makefile
> > > > > > > @@ -10,7 +10,7 @@ LIB = librte_cryptodev.a  CFLAGS += -O3
> > > > > > > CFLAGS
> > > > > > > +=
> > > > > > > $(WERROR_FLAGS)  LDLIBS += -lrte_eal -lrte_mempool -lrte_ring
> > > > > > > -lrte_mbuf -LDLIBS += -lrte_kvargs
> > > > > > > +LDLIBS += -lrte_kvargs -lrte_rcu
> > > > > > >
> > > > > > >  # library source files
> > > > > > >  SRCS-y += rte_cryptodev.c rte_cryptodev_pmd.c
> > > > > > > cryptodev_trace_points.c diff --git
> > > > > > > a/lib/librte_cryptodev/rte_cryptodev.c
> > > > > > > b/lib/librte_cryptodev/rte_cryptodev.c
> > > > > > > index 1dd795b..2fb3e35 100644
> > > > > > > --- a/lib/librte_cryptodev/rte_cryptodev.c
> > > > > > > +++ b/lib/librte_cryptodev/rte_cryptodev.c
> > > > > > > @@ -38,6 +38,7 @@
> > > > > > >  #include <rte_string_fns.h>
> > > > > > >  #include <rte_compat.h>
> > > > > > >  #include <rte_function_versioning.h>
> > > > > > > +#include <rte_rcu_qsbr.h>
> > > > > > >
> > > > > > >  #include "rte_crypto.h"
> > > > > > >  #include "rte_cryptodev.h"
> > > > > > > @@ -499,6 +500,10 @@ struct
> > > > > > rte_cryptodev_sym_session_pool_private_data {
> > > > > > >  	return 0;
> > > > > > >  }
> > > > > > >
> > > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > > > > > +/* spinlock for crypto device enq callbacks */ static
> > > > > > > +rte_spinlock_t rte_cryptodev_enq_cb_lock =
> > > > > > > +RTE_SPINLOCK_INITIALIZER; #endif
> > > > > > >
> > > > > > >  const char *
> > > > > > >  rte_cryptodev_get_feature_name(uint64_t flag) @@ -1449,6
> > > > > > > +1454,158
> > > > > > @@
> > > > > > > struct rte_cryptodev *
> > > > > > >  	rte_spinlock_unlock(&rte_cryptodev_cb_lock);
> > > > > > >  }
> > > > > > >
> > > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS int
> > > > > > > +rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct
> > > > > > > +rte_rcu_qsbr
> > > > > > > +*qsbr) {
> > > > > > > +
> > > > > > > +	struct rte_cryptodev *dev;
> > > > > > > +
> > > > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > > > > > +		return -EINVAL;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > > > +	dev->qsbr = qsbr;
> > > > > > > +	return 0;
> > > > > > > +}
> > > > > >
> > > > > > So if I understand your patch correctly you propose a new
> > > > > > working model for
> > > > > > crypto-devs:
> > > > > > 1. Control-plane has to allocate/setup rcu_qsbr and do
> > > > > > rte_cryptodev_rcu_qsbr_add().
> > > > > > 2. Data-plane has somehow to obtain pointer to that rcu_qsbr and
> > > > > > wrap
> > > > > > cryptodev_enqueue()
> > > > > >    with rcu_qsbr_quiescent()  or rcu_qsbr_online()/rcu_qsbr_offline().
> > > > > Yes. I think, it is not a new model. It is same as RCU integration with
> > LPM.
> > > > > Please refer: https://patches.dpdk.org/cover/73673/
> > > >
> > > > I am talking about new working model for crypto-dev enqueue/dequeue.
> > > > As I said above now it becomes data-plane thread responsibility to:
> > > >  -somehow to obtain pointer to that rcu_qsbr for each cryptodev it is
> > using.
> > > >  -call rcu sync functions (quiescent/online/offline) on a regular basis.
> > > It is not on regular basis. When data plane comes up, they report online.
> > > They report quiescent when they are done with critical section or shared
> > structure.
> >
> > I understand that, but it means all existing apps have to be changed that way.
> >
> > > All though, there is some dataplane changes involved here, I don't think, it
> > is major.
> >
> > I still think our goal here should be to make no visible changes to the
> > dataplane.
> > I.E. all necessary data-plane changes need to be hidden inside CB invocation
> > part.
> Please note that this is being implemented using the memory reclamation framework documented at
> https://doc.dpdk.org/guides/prog_guide/rcu_lib.html#resource-reclamation-framework-for-dpdk
> 
> While using RCU there are couple of trade-offs that applications have to consider:
> 1) Performance - reporting the quiescent state too often results in performance impact on data plane
> 2) Amount of outstanding memory to reclaim - reporting less often results in more outstanding memory to reclaim
> 
> Hence, the quiescent state reporting is left to the application. The application decides how often it reports the quiescent state and has
> control over the data plane performance and the outstanding memory to reclaim.
> 
> When you say "new working model for crypto-dev enqueue/dequeue",
> 
> 1) are you comparing these with existing crypto-dev enqueue/dequeue APIs? If yes, these are new APIs, it is not breaking anything.
> 2) are you comparing these with existing call back functions in ethdev enqueue/dequeue APIs? If yes, agree that this is a new model. But, it
> is possible to support what ethdev supports along with the RCU method used in this patch.

What I am talking about:

Existing cryptodev enqueue/dequeue model doesn't require for the user to manage any RCU QSBR state manually.
I believe that addition of ability to add/remove enqueue/dequeue callbacks shouldn't change existing working model.
I think that adding/removing such callbacks has to be opaque to the user DP code and shouldn't require user
to change it. Same as we have now for  ethdev callback implementation.
I think that forcing DP code to be aware that callbacks are present or not and
to modify its behaviour depending on that nearly voids the purpose of having callbacks at all.
In that case DP can just invoke callback function directly from it's codepath .

> >
> > >
> > > > Note that now data-plane thread would have to do that always - even
> > > > if there are now callbacks installed for that cryptodev queue right now.
> > > > All that changes behaviour of existing apps and I presume would
> > > > reduce adoption of  that fature.
> If I understand this correct, you are talking about a case where in the application might be registering/unregistering multiple times during its
> lifetime. In this case, yes, the application might be reporting the quiescent state even when it has not registered the call backs. But, it has
> the flexibility to not report it if it implements additional logic.
> Note that we are assuming that the application has to report quiescent state only for using callback functions. Most probably the
> application has other requirements to use RCU.
> Why not support what is done for ethdev call back functions along with providing RCU method?
> 
> > > There is always trade off involved!
> > > In the previous patch, you suggested that some lazy app may not free
> > > up the memory allocated by add cb. For such apps, this patch has sync
> > > mechanism with some additional cost of CP & DP changes.
> >
> > Sigh, it is not about laziness of the app.
> > The problem with current ethedev cb mechanism and yours V1 (which was
> > just a clone of it) - CP doesn't know when it is safe after CB removal to free
> > related memory.
> >
> > > > I still think all this callback mechanism should be totally opaque
> > > > to data-plane threads - user shouldn't change his app code depending
> > > > on would some enqueue/dequeue callbacks be installed or not.
> > > I am not sure, how that can be implemented with existing RCU design.
> >
> > As I said below the simplest way - with calling rcu onine/offline inside CB
> > invocation block.
> > That's why I asked you - did you try that approach and what is the perf
> > numbers?
> > I presume with no callbacks installed the perf change should be nearly zero.
> >
> > > @Honnappa Nagarahalli, Do you have any suggestions?
> Reporting quiescent state in the call back functions has several disadvantages:
> 1) it will have performance impacts and the impacts will increase as the number of data plane threads increase.
> 2) It will require additional configuration parameters to control how often the quiescent state is reported to control the performance
> impact.
> 3) Does not take advantage of the fact that most probably the application is using RCU already
> 4) There are few difficulties as well, please see below.

I suggested Abhinandan to use RCU library because it is already there,
and I thought it would be good not to re-implement the wheel.
Though if you feel librte_rcu doesn't match that task - fine, let's do it without librte_rcu.
After all, what we need here - just an atomic ref count per queue that we are going to increment at
entering and leaving list of callbacks inside enqueue/dequeue. 

> 
> > >
> > > >
> > > > >
> > > > > >
> > > > > > That seems quite a big change and I don't think it is acceptable
> > > > > > for most users.
> > > > > > From my perspective adding/installing call-backs to the dev has
> > > > > > to be opaque to the data-plane code.
> > > > > > Also note that different callbacks can be installed by different
> > > > > > entities (libs) and might have no idea about each other.
> > > > > > That's why I thought it would be better to make all this RCU
> > > > > > stuff internal inside cryptodev:
> > > > > >     hide all this rcu_qsbr allocation/setup inside cryptod
> > > > > > somehow to
> > > > obtain pointer to that rcu_qsbr ev init/queue setup
> > > > > >     invoke rcu_qsbr_online()/rcu_qsbr_offline() inside
> > > > cryptodev_enqueue().
> This will bring in the application related information such as the thread ID into the library.

I don't think it would.
Cryptodev enqueue/dequeue functions are not supposed to be thread safe (same as rx/tx burst).
So we can always use RCU with just one thread(thread_id = 0).
But as I said above - if you feel RCU lib is an overhead here, that's fine -
I think it would be easy enough to do without librte_rcu. 

> If the same API calls are being made from multiple
> data plane threads, you need a way to configure that information to the library. So, it is better to leave those details for the application to
> handle.
> 
> > > > > I have already tried exploring above stuffs. There are too many
> > constraints.
> > > > > The changes don't fit in, as per RCU design.
> > > >
> > > > Hmm could you be more specific here - what constraints are you
> > > > referring to?
> > > >
> > > > > Moreover, having rcu api under enqueue_burst() will affect the
> > > > performance.
> > > >
> > > > It most likely will. Though my expectation it will affect
> > > > performance only when some callbacks are installed. My thought here:
> > > > callback function by itself will affect cryptdev_enqueue performance
> > > > anyway,
> > > With existing callback design, I have measured the performance(with
> > crypto perf test) on xeon.
> > > It was almost negligible and same was shared with Declan.
> >
> > I am asking about different thing: did you try alternate approach I described,
> > that wouldn't require changes in the user data-plane code.
> >
> > > That is one of the reasons, I didn't want to add to many stuffs in to the
> > callback.
> > > The best part of existing design is crypto lib is not much modified.
> > > The changes are either pushed to CP or DP.
> > >
> > > so adding extra overhead for sync is probably ok here.
> >
> > I think that extra overhead when callbacks are present is expected and
> > probably acceptable.
> > Changes in the upper-layer data-plane code - probably not.
> >
> > > > Though for situation when no callbacks are installed - perfomance
> > > > should be left unaffected (or impact should be as small as possible).
> > > >
> > > > > The changes are more on control plane side, which is one time.
> > > > > The data plane changes are minimal.
> > > >
> > > > I still think upper layer data-plane code should stay unaffected
> > > > (zero changes).
> > > >
> > > > > >
> > > > > > > +
> > > > > > > +struct rte_cryptodev_enq_callback *
> > > > > > > +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> > > > > > > +			       uint16_t qp_id,
> > > > > > > +			       rte_cryptodev_enq_cb_fn cb_fn,
> > > > > > > +			       void *cb_arg)
> > > > > > > +{
> > > > > > > +	struct rte_cryptodev *dev;
> > > > > > > +	struct rte_cryptodev_enq_callback *cb, *tail;
> > > > > > > +
> > > > > > > +	if (!cb_fn)
> > > > > > > +		return NULL;
> > > > > > > +
> > > > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > > > > > +		return NULL;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > > > +	if (qp_id >= dev->data->nb_queue_pairs) {
> > > > > > > +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> > > > > > > +		return NULL;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	cb = rte_zmalloc(NULL, sizeof(*cb), 0);
> > > > > > > +	if (cb == NULL) {
> > > > > > > +		CDEV_LOG_ERR("Failed to allocate memory for
> > callback on "
> > > > > > > +			     "dev=%d, queue_pair_id=%d", dev_id,
> > qp_id);
> > > > > > > +		rte_errno = ENOMEM;
> > > > > > > +		return NULL;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	cb->fn = cb_fn;
> > > > > > > +	cb->arg = cb_arg;
> > > > > > > +
> > > > > > > +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> > > > > > > +	if (dev->enq_cbs == NULL) {
> > > > > > > +		dev->enq_cbs = rte_zmalloc(NULL, sizeof(cb) *
> > > > > > > +					   dev->data-
> > >nb_queue_pairs, 0);
> > > > > > > +		if (dev->enq_cbs == NULL) {
> > > > > > > +			CDEV_LOG_ERR("Failed to allocate memory
> > for
> > > > > > callbacks");
> > > > > > > +			rte_errno = ENOMEM;
> > > > > > > +			rte_free(cb);
> > > > > > > +			return NULL;
> > > > > > > +		}
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	/* Add the callbacks in fifo order. */
> > > > > > > +	tail = dev->enq_cbs[qp_id];
> > > > > > > +	if (tail) {
> > > > > > > +		while (tail->next)
> > > > > > > +			tail = tail->next;
> > > > > > > +		tail->next = cb;
> > > > > > > +	} else
> > > > > > > +		dev->enq_cbs[qp_id] = cb;
> > > > > > > +
> > > > > > > +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > > > > > > +
> > > > > > > +	return cb;
> > > > > > > +}
> > > > > > > +
> > > > > > > +int
> > > > > > > +rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> > > > > > > +				  uint16_t qp_id,
> > > > > > > +				  struct rte_cryptodev_enq_callback
> > *cb) {
> > > > > > > +	struct rte_cryptodev *dev;
> > > > > > > +	struct rte_cryptodev_enq_callback **prev_cb, *curr_cb;
> > > > > > > +	uint16_t qp;
> > > > > > > +	int free_mem;
> > > > > > > +	int ret;
> > > > > > > +
> > > > > > > +	free_mem = 1;
> > > > > > > +
> > > > > > > +	if (!cb) {
> > > > > > > +		CDEV_LOG_ERR("cb is NULL");
> > > > > > > +		return -EINVAL;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > > > > > +		return -EINVAL;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > > > +	if (qp_id >= dev->data->nb_queue_pairs) {
> > > > > > > +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> > > > > > > +		return -EINVAL;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	if (!dev->qsbr) {
> > > > > > > +		CDEV_LOG_ERR("Rcu qsbr is NULL");
> > > > > > > +		return -EINVAL;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> > > > > > > +	if (dev->enq_cbs == NULL) {
> > > > > > > +		rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > > > > > > +		return -EINVAL;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	prev_cb = &dev->enq_cbs[qp_id];
> > > > > > > +	for (; *prev_cb != NULL; prev_cb = &curr_cb->next) {
> > > > > > > +		curr_cb = *prev_cb;
> > > > > > > +		if (curr_cb == cb) {
> > > > > > > +			/* Remove the user cb from the callback list.
> > */
> > > > > > > +			*prev_cb = curr_cb->next;
> > > > > > > +			ret = 0;
> > > > > > > +			break;
> > > > > > > +		}
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	if (!ret) {
> > > > > > > +		/* Call sync with invalid thread id as this is part of
> > > > > > > +		 * control plane API */
> > > > > > > +		rte_rcu_qsbr_synchronize(dev->qsbr,
> > > > > > > +					 RTE_QSBR_THRID_INVALID);
> > > > > > > +		rte_free(cb);
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	for (qp = 0; qp < dev->data->nb_queue_pairs; qp++)
> > > > > > > +		if (dev->enq_cbs[qp] != NULL) {
> > > > > >
> > > > > > Some reference count (number of callbacks) seems like a better
> > > > > > approach here.
> > > > > Ok.
> > > > > >
> > > > > > > +			free_mem = 0;
> > > > > > > +			break;
> > > > > > > +		}
> > > > > > > +
> > > > > > > +	if (free_mem) {
> > > > > > > +		rte_free(dev->enq_cbs);
> > > > > > > +		dev->enq_cbs = NULL;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> > > > > > > +
> > > > > > > +	return ret;
> > > > > > > +}
> > > > > > > +#endif
> > > > > > >
> > > > > > >  int
> > > > > > >  rte_cryptodev_sym_session_init(uint8_t dev_id, diff --git
> > > > > > > a/lib/librte_cryptodev/rte_cryptodev.h
> > > > > > > b/lib/librte_cryptodev/rte_cryptodev.h
> > > > > > > index 7b3ebc2..2c7a47b 100644
> > > > > > > --- a/lib/librte_cryptodev/rte_cryptodev.h
> > > > > > > +++ b/lib/librte_cryptodev/rte_cryptodev.h
> > > > > > > @@ -530,6 +530,32 @@ struct rte_cryptodev_qp_conf {  };
> > > > > > >
> > > > > > >  /**
> > > > > > > + * Function type used for pre processing crypto ops when
> > > > > > > +enqueue burst is
> > > > > > > + * called.
> > > > > > > + *
> > > > > > > + * The callback function is called on enqueue burst
> > > > > > > +immediately
> > > > > > > + * before the crypto ops are put onto the hardware queue for
> > > > processing.
> > > > > > > + *
> > > > > > > + * @param	dev_id		The identifier of the device.
> > > > > > > + * @param	qp_id		The index of the queue pair in which
> > > > ops are
> > > > > > > + *				to be enqueued for processing. The
> > > > value
> > > > > > > + *				must be in the range [0,
> > > > nb_queue_pairs - 1]
> > > > > > > + *				previously supplied to
> > > > > > > + *				*rte_cryptodev_configure*.
> > > > > > > + * @param	ops		The address of an array of *nb_ops*
> > > > pointers
> > > > > > > + *				to *rte_crypto_op* structures which
> > > > contain
> > > > > > > + *				the crypto operations to be
> > > > processed.
> > > > > > > + * @param	nb_ops		The number of operations to process.
> > > > > > > + * @param	user_param	The arbitrary user parameter passed
> > > > in by the
> > > > > > > + *				application when the callback was
> > > > originally
> > > > > > > + *				registered.
> > > > > > > + * @return			The number of ops to be enqueued
> > > > to the
> > > > > > > + *				crypto device.
> > > > > > > + */
> > > > > > > +typedef uint16_t (*rte_cryptodev_enq_cb_fn)(uint16_t dev_id,
> > > > > > > +uint16_t
> > > > > > qp_id,
> > > > > > > +		struct rte_crypto_op **ops, uint16_t nb_ops, void
> > > > > > *user_param);
> > > > > > > +
> > > > > > > +/**
> > > > > > >   * Typedef for application callback function to be registered
> > > > > > > by
> > > > application
> > > > > > >   * software for notification of device events
> > > > > > >   *
> > > > > > > @@ -853,7 +879,6 @@ struct rte_cryptodev_config {
> > > > > > >  		enum rte_cryptodev_event_type event,
> > > > > > >  		rte_cryptodev_cb_fn cb_fn, void *cb_arg);
> > > > > > >
> > > > > > > -
> > > > > > >  typedef uint16_t (*dequeue_pkt_burst_t)(void *qp,
> > > > > > >  		struct rte_crypto_op **ops,	uint16_t nb_ops);
> > > > > > >  /**< Dequeue processed packets from queue pair of a device.
> > > > > > > */ @@
> > > > > > > -870,6 +895,17 @@ typedef uint16_t (*enqueue_pkt_burst_t)(void
> > > > > > > *qp,
> > > > > > >  /** Structure to keep track of registered callbacks */
> > > > > > > TAILQ_HEAD(rte_cryptodev_cb_list, rte_cryptodev_callback);
> > > > > > >
> > > > > > > +/**
> > > > > > > + * @internal
> > > > > > > + * Structure used to hold information about the callbacks to
> > > > > > > +be called for a
> > > > > > > + * queue pair on enqueue.
> > > > > > > + */
> > > > > > > +struct rte_cryptodev_enq_callback {
> > > > > > > +	struct rte_cryptodev_enq_callback *next;
> > > > > > > +	rte_cryptodev_enq_cb_fn fn;
> > > > > > > +	void *arg;
> > > > > > > +};
> > > > > > > +
> > > > > > >  /** The data structure associated with each crypto device. */
> > > > > > > struct rte_cryptodev {
> > > > > > >  	dequeue_pkt_burst_t dequeue_burst; @@ -898,6 +934,14
> > @@
> > > > struct
> > > > > > > rte_cryptodev {
> > > > > > >  	__extension__
> > > > > > >  	uint8_t attached : 1;
> > > > > > >  	/**< Flag indicating the device is attached */
> > > > > > > +
> > > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > > > > > +	struct rte_cryptodev_enq_callback **enq_cbs;
> > > > > > > +	/**< User application callback for pre enqueue processing */
> > > > > > > +
> > > > > > > +	struct rte_rcu_qsbr *qsbr;
> > > > > > > +	/** < RCU QSBR variable for rte_cryptodev_enq_callback */
> > > > > >
> > > > > > Probably better to have both these fields per queue.
> > > > > > Space for them can be allocated at dev_configure() or so.
> > > > > enq_cbs is allocated during callback add.
> > > > > Unlike ethdev, each cryptodev have their own max queue pair. There
> > > > > is no
> > > > macro for that.
> > > > > I think, single RCU should be good enough, as it has mechanism to
> > > > > track all
> > > > its reporting threads.
> > > > >
> > > > > > BTW, wouldn't it make sense to have ability to add callback for
> > > > > > dequeue
> > > > too?
> > > > > As mentioned in the commit message, this patch was driven by a
> > > > requirement.
> > > > > If required, callback for the dequeue can be added in a separate patch.
> > > > > >
> > > > > > > +#endif
> > > > > > >  } __rte_cache_aligned;
> > > > > > >
> > > > > > >  void *
> > > > > > > @@ -1019,6 +1063,18 @@ struct rte_cryptodev_data {
> > > > > > >  		struct rte_crypto_op **ops, uint16_t nb_ops)  {
> > > > > > >  	struct rte_cryptodev *dev = &rte_cryptodevs[dev_id];
> > > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > > > > > +	if (unlikely(dev->enq_cbs != NULL && dev-
> > >enq_cbs[qp_id] !=
> > > > > > NULL)) {
> > > > > > > +		struct rte_cryptodev_enq_callback *cb =
> > > > > > > +			dev->enq_cbs[qp_id];
> > > > > > > +
> > > > > > > +		do {
> > > > > > > +			nb_ops = cb->fn(dev_id, qp_id, ops, nb_ops,
> > > > > > > +					cb->arg);
> > > > > > > +			cb = cb->next;
> > > > > > > +		} while (cb != NULL);
> > > > > > > +	}
> > > > > > > +#endif
> > > > > > >
> > > > > > >  	rte_cryptodev_trace_enqueue_burst(dev_id, qp_id, (void
> > > > > > > **)ops,
> > > > > > nb_ops);
> > > > > > >  	return (*dev->enqueue_burst)( @@ -1351,6 +1407,102 @@
> > struct
> > > > > > > rte_cryptodev_asym_session *
> > > > > > >  	struct rte_cryptodev_sym_session *sess, union
> > > > > > > rte_crypto_sym_ofs
> > > > > > ofs,
> > > > > > >  	struct rte_crypto_sym_vec *vec);
> > > > > > >
> > > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS
> > > > > > > +/**
> > > > > > > + * @warning
> > > > > > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > > > > > + *
> > > > > > > + * Add a user callback for a given crypto device and queue
> > > > > > > +pair which will be
> > > > > > > + * called on crypto ops enqueue.
> > > > > > > + *
> > > > > > > + * This API configures a function to be called for each burst
> > > > > > > +of crypto ops
> > > > > > > + * received on a given crypto device queue pair. The return
> > > > > > > +value is a pointer
> > > > > > > + * that can be used later to remove the callback using
> > > > > > > + * rte_cryptodev_remove_enq_callback().
> > > > > > > + *
> > > > > > > + * Multiple functions are called in the order that they are added.
> > > > > > > + *
> > > > > > > + * @param	dev_id		The identifier of the device.
> > > > > > > + * @param	qp_id		The index of the queue pair in which
> > > > ops are
> > > > > > > + *				to be enqueued for processing. The
> > > > value
> > > > > > > + *				must be in the range [0,
> > > > nb_queue_pairs - 1]
> > > > > > > + *				previously supplied to
> > > > > > > + *				*rte_cryptodev_configure*.
> > > > > > > + * @param	cb_fn		The callback function
> > > > > > > + * @param	cb_arg		A generic pointer parameter which
> > > > will be
> > > > > > passed
> > > > > > > + *				to each invocation of the callback
> > > > function on
> > > > > > > + *				this crypto device and queue pair.
> > > > > > > + *
> > > > > > > + * @return
> > > > > > > + *   NULL on error.
> > > > > > > + *   On success, a pointer value which can later be used to remove
> > the
> > > > > > callback.
> > > > > > > + */
> > > > > > > +
> > > > > > > +__rte_experimental
> > > > > > > +struct rte_cryptodev_enq_callback *
> > > > > > > +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> > > > > > > +			       uint16_t qp_id,
> > > > > > > +			       rte_cryptodev_enq_cb_fn cb_fn,
> > > > > > > +			       void *cb_arg);
> > > > > > > +
> > > > > > > +
> > > > > > > +/**
> > > > > > > + * @warning
> > > > > > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > > > > > + *
> > > > > > > + * Remove a user callback function for given crypto device
> > > > > > > +and queue
> > > > pair.
> > > > > > > + *
> > > > > > > + * This function is used to removed callbacks that were added
> > > > > > > +to a crypto
> > > > > > > + * device queue pair using rte_cryptodev_add_enq_callback().
> > > > > > > + *
> > > > > > > + * Note: The callback expects a RCU QSBR to be configured to
> > > > > > > +synchronize
> > > > > > > + * to free the memory. Application is expected to configure
> > > > > > > +RCU QSBR after
> > > > > > > + * adding an enqueue callback.
> > > > > > > + *
> > > > > > > + *
> > > > > > > + * @param	dev_id		The identifier of the device.
> > > > > > > + * @param	qp_id		The index of the queue pair in which
> > > > ops are
> > > > > > > + *				to be enqueued for processing. The
> > > > value
> > > > > > > + *				must be in the range [0,
> > > > nb_queue_pairs - 1]
> > > > > > > + *				previously supplied to
> > > > > > > + *				*rte_cryptodev_configure*.
> > > > > > > + * @param	cb		Pointer to user supplied callback
> > > > created via
> > > > > > > + *				rte_cryptodev_add_enq_callback().
> > > > > > > + *
> > > > > > > + * @return
> > > > > > > + *   - 0: Success. Callback was removed.
> > > > > > > + *   - -EINVAL:  The dev_id or the qp_id is out of range, or the
> > callback
> > > > > > > + *               is NULL or not found for the crypto device queue pair.
> > > > > > > + */
> > > > > > > +
> > > > > > > +__rte_experimental
> > > > > > > +int rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> > > > > > > +				  uint16_t qp_id,
> > > > > > > +				  struct rte_cryptodev_enq_callback
> > *cb);
> > > > > > > +
> > > > > > > +
> > > > > > > +/**
> > > > > > > + * @warning
> > > > > > > + * @b EXPERIMENTAL: this API may change without prior notice
> > > > > > > + *
> > > > > > > + * Associate RCU QSBR variable with a cryptodev.
> > > > > > > + *
> > > > > > > + * This function is used to add RCU QSBR to a crypto device.
> > > > > > > + * The purpose of RCU is to help multiple threads to
> > > > > > > +synchronize
> > > > > > > + * with each other before initiating adding/removing callback
> > > > > > > + * while dataplane threads are running enqueue callbacks.
> > > > > > > + *
> > > > > > > + * @param	dev_id		The identifier of the device.
> > > > > > > + * @param	qsr		RCU QSBR configuration
> > > > > > > + * @return
> > > > > > > + *   On success - 0
> > > > > > > + *   On error - EINVAL.
> > > > > > > + */
> > > > > > > +
> > > > > > > +__rte_experimental
> > > > > > > +int rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct
> > > > > > > +rte_rcu_qsbr *qsbr); #endif
> > > > > > > +
> > > > > > >  #ifdef __cplusplus
> > > > > > >  }
> > > > > > >  #endif
> > > > > > > diff --git a/lib/librte_cryptodev/rte_cryptodev_version.map
> > > > > > > b/lib/librte_cryptodev/rte_cryptodev_version.map
> > > > > > > index 02f6dcf..46de3ca 100644
> > > > > > > --- a/lib/librte_cryptodev/rte_cryptodev_version.map
> > > > > > > +++ b/lib/librte_cryptodev/rte_cryptodev_version.map
> > > > > > > @@ -64,6 +64,7 @@ DPDK_20.0 {
> > > > > > >  	rte_cryptodev_sym_capability_get;  };
> > > > > > >
> > > > > > > +
> > > > > > >  EXPERIMENTAL {
> > > > > > >  	global:
> > > > > > >
> > > > > > > @@ -105,4 +106,9 @@ EXPERIMENTAL {
> > > > > > >
> > > > > > >  	# added in 20.08
> > > > > > >  	rte_cryptodev_get_qp_status;
> > > > > > > +
> > > > > > > +	# added in 20.11
> > > > > > > +	rte_cryptodev_add_enq_callback;
> > > > > > > +	rte_cryptodev_remove_enq_callback;
> > > > > > > +	rte_cryptodev_rcu_qsbr_add;
> > > > > > >  };
> > > > > > > --
> > > > > > > 1.9.1
Honnappa Nagarahalli Sept. 23, 2020, 10:41 p.m. UTC | #11
<snip>

> > > > > > > >
> > > > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS int
> > > > > > > > +rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct
> > > > > > > > +rte_rcu_qsbr
> > > > > > > > +*qsbr) {
> > > > > > > > +
> > > > > > > > +	struct rte_cryptodev *dev;
> > > > > > > > +
> > > > > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > > > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > > > > > > +		return -EINVAL;
> > > > > > > > +	}
> > > > > > > > +
> > > > > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > > > > +	dev->qsbr = qsbr;
> > > > > > > > +	return 0;
> > > > > > > > +}
> > > > > > >
> > > > > > > So if I understand your patch correctly you propose a new
> > > > > > > working model for
> > > > > > > crypto-devs:
> > > > > > > 1. Control-plane has to allocate/setup rcu_qsbr and do
> > > > > > > rte_cryptodev_rcu_qsbr_add().
> > > > > > > 2. Data-plane has somehow to obtain pointer to that rcu_qsbr
> > > > > > > and wrap
> > > > > > > cryptodev_enqueue()
> > > > > > >    with rcu_qsbr_quiescent()  or
> rcu_qsbr_online()/rcu_qsbr_offline().
> > > > > > Yes. I think, it is not a new model. It is same as RCU
> > > > > > integration with
> > > LPM.
> > > > > > Please refer: https://patches.dpdk.org/cover/73673/
> > > > >
> > > > > I am talking about new working model for crypto-dev
> enqueue/dequeue.
> > > > > As I said above now it becomes data-plane thread responsibility to:
> > > > >  -somehow to obtain pointer to that rcu_qsbr for each cryptodev
> > > > > it is
> > > using.
> > > > >  -call rcu sync functions (quiescent/online/offline) on a regular basis.
> > > > It is not on regular basis. When data plane comes up, they report online.
> > > > They report quiescent when they are done with critical section or
> > > > shared
> > > structure.
> > >
> > > I understand that, but it means all existing apps have to be changed that
> way.
> > >
> > > > All though, there is some dataplane changes involved here, I don't
> > > > think, it
> > > is major.
> > >
> > > I still think our goal here should be to make no visible changes to
> > > the dataplane.
> > > I.E. all necessary data-plane changes need to be hidden inside CB
> > > invocation part.
> > Please note that this is being implemented using the memory
> > reclamation framework documented at
> > https://doc.dpdk.org/guides/prog_guide/rcu_lib.html#resource-reclamati
> > on-framework-for-dpdk
> >
> > While using RCU there are couple of trade-offs that applications have to
> consider:
> > 1) Performance - reporting the quiescent state too often results in
> > performance impact on data plane
> > 2) Amount of outstanding memory to reclaim - reporting less often
> > results in more outstanding memory to reclaim
> >
> > Hence, the quiescent state reporting is left to the application. The
> > application decides how often it reports the quiescent state and has control
> over the data plane performance and the outstanding memory to reclaim.
> >
> > When you say "new working model for crypto-dev enqueue/dequeue",
> >
> > 1) are you comparing these with existing crypto-dev enqueue/dequeue
> APIs? If yes, these are new APIs, it is not breaking anything.
> > 2) are you comparing these with existing call back functions in ethdev
> > enqueue/dequeue APIs? If yes, agree that this is a new model. But, it is
> possible to support what ethdev supports along with the RCU method used
> in this patch.
> 
> What I am talking about:
> 
> Existing cryptodev enqueue/dequeue model doesn't require for the user to
> manage any RCU QSBR state manually.
> I believe that addition of ability to add/remove enqueue/dequeue callbacks
> shouldn't change existing working model.
> I think that adding/removing such callbacks has to be opaque to the user DP
> code and shouldn't require user to change it. Same as we have now for
> ethdev callback implementation.
The ethdev callback implementation conveniently leaves the problem of freeing memory to the user to resolve, it does not handle the issue. Hence, it "looks" to be opaque to the DP code. However, if the application has to implement a safe way to free the call back memory, its DP is affected based on call backs are being used or not.

> I think that forcing DP code to be aware that callbacks are present or not and
> to modify its behaviour depending on that nearly voids the purpose of having
> callbacks at all.
> In that case DP can just invoke callback function directly from it's codepath .
> 
> > >
> > > >
> > > > > Note that now data-plane thread would have to do that always -
> > > > > even if there are now callbacks installed for that cryptodev queue
> right now.
> > > > > All that changes behaviour of existing apps and I presume would
> > > > > reduce adoption of  that fature.
> > If I understand this correct, you are talking about a case where in
> > the application might be registering/unregistering multiple times
> > during its lifetime. In this case, yes, the application might be reporting the
> quiescent state even when it has not registered the call backs. But, it has the
> flexibility to not report it if it implements additional logic.
> > Note that we are assuming that the application has to report quiescent
> > state only for using callback functions. Most probably the application has
> other requirements to use RCU.
> > Why not support what is done for ethdev call back functions along with
> providing RCU method?
> >
> > > > There is always trade off involved!
> > > > In the previous patch, you suggested that some lazy app may not
> > > > free up the memory allocated by add cb. For such apps, this patch
> > > > has sync mechanism with some additional cost of CP & DP changes.
> > >
> > > Sigh, it is not about laziness of the app.
> > > The problem with current ethedev cb mechanism and yours V1 (which
> > > was just a clone of it) - CP doesn't know when it is safe after CB
> > > removal to free related memory.
> > >
> > > > > I still think all this callback mechanism should be totally
> > > > > opaque to data-plane threads - user shouldn't change his app
> > > > > code depending on would some enqueue/dequeue callbacks be
> installed or not.
> > > > I am not sure, how that can be implemented with existing RCU design.
> > >
> > > As I said below the simplest way - with calling rcu onine/offline
> > > inside CB invocation block.
> > > That's why I asked you - did you try that approach and what is the
> > > perf numbers?
> > > I presume with no callbacks installed the perf change should be nearly
> zero.
> > >
> > > > @Honnappa Nagarahalli, Do you have any suggestions?
> > Reporting quiescent state in the call back functions has several
> disadvantages:
> > 1) it will have performance impacts and the impacts will increase as the
> number of data plane threads increase.
> > 2) It will require additional configuration parameters to control how
> > often the quiescent state is reported to control the performance impact.
> > 3) Does not take advantage of the fact that most probably the
> > application is using RCU already
> > 4) There are few difficulties as well, please see below.
> 
> I suggested Abhinandan to use RCU library because it is already there, and I
> thought it would be good not to re-implement the wheel.
> Though if you feel librte_rcu doesn't match that task - fine, let's do it without
> librte_rcu.
> After all, what we need here - just an atomic ref count per queue that we are
> going to increment at entering and leaving list of callbacks inside
> enqueue/dequeue.
Ok, looks like I missed the point that a queue is used by a single data plane thread.
Along with ref count increment you need the memory orderings to avoid race conditions. These will be the same ones used in RCU.
On the control plane, you need to read this counter and poll for the counter updates. All this is same cost as in RCU. To control the cost, you will have to control the rate of quiescent state reporting and might have to expose this as a configuration parameter.

The other important information you have to consider is if the thread is making any blocking calls, which may be in some other library. The thread is supposed to call rcu_qsbr_thread_offline API before calling a blocking call. This allows the RCU to know that this particular thread will not report quiescent state. The cryptodev library might not have that information.

If you want to go ahead with this design, you can still use RCU with single thread configuration (like you have mentioned below) and hide the details from the application.

> 
> >
> > > >
> > > > >
> > > > > >
> > > > > > >
> > > > > > > That seems quite a big change and I don't think it is
> > > > > > > acceptable for most users.
> > > > > > > From my perspective adding/installing call-backs to the dev
> > > > > > > has to be opaque to the data-plane code.
> > > > > > > Also note that different callbacks can be installed by
> > > > > > > different entities (libs) and might have no idea about each other.
> > > > > > > That's why I thought it would be better to make all this RCU
> > > > > > > stuff internal inside cryptodev:
> > > > > > >     hide all this rcu_qsbr allocation/setup inside cryptod
> > > > > > > somehow to
> > > > > obtain pointer to that rcu_qsbr ev init/queue setup
> > > > > > >     invoke rcu_qsbr_online()/rcu_qsbr_offline() inside
> > > > > cryptodev_enqueue().
> > This will bring in the application related information such as the thread ID
> into the library.
> 
> I don't think it would.
> Cryptodev enqueue/dequeue functions are not supposed to be thread safe
> (same as rx/tx burst).
> So we can always use RCU with just one thread(thread_id = 0).
Agree, the memory that needs to be freed is accessed by a single thread on the data plane. RCU with one thread would suffice.

> But as I said above - if you feel RCU lib is an overhead here, that's fine - I
> think it would be easy enough to do without librte_rcu.
> 
> > If the same API calls are being made from multiple data plane threads,
> > you need a way to configure that information to the library. So, it is
> > better to leave those details for the application to handle.
> >
> > > > > > I have already tried exploring above stuffs. There are too
> > > > > > many
> > > constraints.
> > > > > > The changes don't fit in, as per RCU design.
> > > > >
> > > > > Hmm could you be more specific here - what constraints are you
> > > > > referring to?
> > > > >
> > > > > > Moreover, having rcu api under enqueue_burst() will affect the
> > > > > performance.
> > > > >
> > > > > It most likely will. Though my expectation it will affect
> > > > > performance only when some callbacks are installed. My thought
> here:
> > > > > callback function by itself will affect cryptdev_enqueue
> > > > > performance anyway,
> > > > With existing callback design, I have measured the
> > > > performance(with
> > > crypto perf test) on xeon.
> > > > It was almost negligible and same was shared with Declan.
> > >
> > > I am asking about different thing: did you try alternate approach I
> > > described, that wouldn't require changes in the user data-plane code.
> > >
> > > > That is one of the reasons, I didn't want to add to many stuffs in
> > > > to the
> > > callback.
> > > > The best part of existing design is crypto lib is not much modified.
> > > > The changes are either pushed to CP or DP.
> > > >
> > > > so adding extra overhead for sync is probably ok here.
> > >
> > > I think that extra overhead when callbacks are present is expected
> > > and probably acceptable.
> > > Changes in the upper-layer data-plane code - probably not.
> > >
> > > > > Though for situation when no callbacks are installed -
> > > > > perfomance should be left unaffected (or impact should be as small
> as possible).
> > > > >
> > > > > > The changes are more on control plane side, which is one time.
> > > > > > The data plane changes are minimal.
> > > > >
> > > > > I still think upper layer data-plane code should stay unaffected
> > > > > (zero changes).
> > > > >
> > > > > > >

<snip>
Honnappa Nagarahalli Sept. 24, 2020, 5:17 a.m. UTC | #12
<snip>

> 
> In an eventdev world, multiple workers (with ordered queue) will be working
> on IPsec ESP processing. The ESP header's sequence number is unique and
> has to be sequentially incremented in an orderly manner.
> This rises a need for incrementing sequence number in crypto stage
> especially in event crypto adapter. By adding a user callback to cryptodev at
> enqueue burst, the user callback will get executed in the context of event
> crypto adapter. This helps the application to increment the ESP sequence
> number atomically and orderly manner.
> 
> This patch adds APIs to add/remove callback functions. The callback function
> will be called for each burst of crypto ops received on a given crypto device
> queue pair.
> 
> v1->v2:
> Moved callback related members to the end of cryptodev struct Added
> support for RCU
> 
> Signed-off-by: Abhinandan Gujjar <abhinandan.gujjar@intel.com>
> ---
>  config/common_base                             |   1 +
>  lib/librte_cryptodev/Makefile                  |   2 +-
>  lib/librte_cryptodev/rte_cryptodev.c           | 157
> +++++++++++++++++++++++++
>  lib/librte_cryptodev/rte_cryptodev.h           | 154
> +++++++++++++++++++++++-
>  lib/librte_cryptodev/rte_cryptodev_version.map |   6 +
>  5 files changed, 318 insertions(+), 2 deletions(-)
> 
> diff --git a/config/common_base b/config/common_base index
> fbf0ee7..f5ebde4 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -599,6 +599,7 @@
> CONFIG_RTE_LIBRTE_PMD_BBDEV_FPGA_5GNR_FEC=y
>  #
>  CONFIG_RTE_LIBRTE_CRYPTODEV=y
>  CONFIG_RTE_CRYPTO_MAX_DEVS=64
> +CONFIG_RTE_CRYPTODEV_CALLBACKS=y
> 
>  #
>  # Compile PMD for ARMv8 Crypto device
> diff --git a/lib/librte_cryptodev/Makefile b/lib/librte_cryptodev/Makefile
> index 73e77a2..514d552 100644
> --- a/lib/librte_cryptodev/Makefile
> +++ b/lib/librte_cryptodev/Makefile
> @@ -10,7 +10,7 @@ LIB = librte_cryptodev.a  CFLAGS += -O3  CFLAGS +=
> $(WERROR_FLAGS)  LDLIBS += -lrte_eal -lrte_mempool -lrte_ring -lrte_mbuf
> -LDLIBS += -lrte_kvargs
> +LDLIBS += -lrte_kvargs -lrte_rcu
> 
>  # library source files
>  SRCS-y += rte_cryptodev.c rte_cryptodev_pmd.c cryptodev_trace_points.c
> diff --git a/lib/librte_cryptodev/rte_cryptodev.c
> b/lib/librte_cryptodev/rte_cryptodev.c
> index 1dd795b..2fb3e35 100644
> --- a/lib/librte_cryptodev/rte_cryptodev.c
> +++ b/lib/librte_cryptodev/rte_cryptodev.c
> @@ -38,6 +38,7 @@
>  #include <rte_string_fns.h>
>  #include <rte_compat.h>
>  #include <rte_function_versioning.h>
> +#include <rte_rcu_qsbr.h>
> 
>  #include "rte_crypto.h"
>  #include "rte_cryptodev.h"
> @@ -499,6 +500,10 @@ struct
> rte_cryptodev_sym_session_pool_private_data {
>  	return 0;
>  }
> 
> +#ifdef RTE_CRYPTODEV_CALLBACKS
> +/* spinlock for crypto device enq callbacks */ static rte_spinlock_t
> +rte_cryptodev_enq_cb_lock = RTE_SPINLOCK_INITIALIZER; #endif
> 
>  const char *
>  rte_cryptodev_get_feature_name(uint64_t flag) @@ -1449,6 +1454,158
> @@ struct rte_cryptodev *
>  	rte_spinlock_unlock(&rte_cryptodev_cb_lock);
>  }
> 
> +#ifdef RTE_CRYPTODEV_CALLBACKS
> +int
> +rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct rte_rcu_qsbr *qsbr) {
> +
> +	struct rte_cryptodev *dev;
> +
> +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> +		return -EINVAL;
> +	}
> +
> +	dev = &rte_crypto_devices[dev_id];
> +	dev->qsbr = qsbr;
> +	return 0;
> +}
> +
> +struct rte_cryptodev_enq_callback *
> +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> +			       uint16_t qp_id,
> +			       rte_cryptodev_enq_cb_fn cb_fn,
> +			       void *cb_arg)
> +{
> +	struct rte_cryptodev *dev;
> +	struct rte_cryptodev_enq_callback *cb, *tail;
> +
> +	if (!cb_fn)
> +		return NULL;
> +
> +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> +		return NULL;
> +	}
> +
> +	dev = &rte_crypto_devices[dev_id];
> +	if (qp_id >= dev->data->nb_queue_pairs) {
> +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> +		return NULL;
> +	}
> +
> +	cb = rte_zmalloc(NULL, sizeof(*cb), 0);
> +	if (cb == NULL) {
> +		CDEV_LOG_ERR("Failed to allocate memory for callback on "
> +			     "dev=%d, queue_pair_id=%d", dev_id, qp_id);
> +		rte_errno = ENOMEM;
> +		return NULL;
> +	}
> +
> +	cb->fn = cb_fn;
> +	cb->arg = cb_arg;
> +
> +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> +	if (dev->enq_cbs == NULL) {
> +		dev->enq_cbs = rte_zmalloc(NULL, sizeof(cb) *
> +					   dev->data->nb_queue_pairs, 0);
> +		if (dev->enq_cbs == NULL) {
> +			CDEV_LOG_ERR("Failed to allocate memory for
> callbacks");
> +			rte_errno = ENOMEM;
> +			rte_free(cb);
> +			return NULL;
> +		}
> +	}
> +
> +	/* Add the callbacks in fifo order. */
> +	tail = dev->enq_cbs[qp_id];
> +	if (tail) {
> +		while (tail->next)
> +			tail = tail->next;
> +		tail->next = cb;
The store here needs to complete only after the cb->fn and cb->arg are written. You need to use __atomic_store_n(&tail->next, cb, __ATOMIC_RELEASE)

> +	} else
> +		dev->enq_cbs[qp_id] = cb;
Same here, this should be __atomic_store_n(&dev->enq_cbs[qp_id], cb, __ATOMIC_RELEASE)

(same fixes are required in ethdev as well, will send out a patch)

> +
> +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> +
> +	return cb;
> +}
> +
> +int
> +rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> +				  uint16_t qp_id,
> +				  struct rte_cryptodev_enq_callback *cb) {
> +	struct rte_cryptodev *dev;
> +	struct rte_cryptodev_enq_callback **prev_cb, *curr_cb;
> +	uint16_t qp;
> +	int free_mem;
> +	int ret;
> +
> +	free_mem = 1;
> +
> +	if (!cb) {
> +		CDEV_LOG_ERR("cb is NULL");
> +		return -EINVAL;
> +	}
> +
> +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> +		return -EINVAL;
> +	}
> +
> +	dev = &rte_crypto_devices[dev_id];
> +	if (qp_id >= dev->data->nb_queue_pairs) {
> +		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
> +		return -EINVAL;
> +	}
> +
> +	if (!dev->qsbr) {
> +		CDEV_LOG_ERR("Rcu qsbr is NULL");
> +		return -EINVAL;
> +	}
> +
> +	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
> +	if (dev->enq_cbs == NULL) {
> +		rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> +		return -EINVAL;
> +	}
> +
> +	prev_cb = &dev->enq_cbs[qp_id];
> +	for (; *prev_cb != NULL; prev_cb = &curr_cb->next) {
> +		curr_cb = *prev_cb;
> +		if (curr_cb == cb) {
> +			/* Remove the user cb from the callback list. */
> +			*prev_cb = curr_cb->next;
> +			ret = 0;
> +			break;
> +		}
> +	}
> +
> +	if (!ret) {
> +		/* Call sync with invalid thread id as this is part of
> +		 * control plane API */
> +		rte_rcu_qsbr_synchronize(dev->qsbr,
> +					 RTE_QSBR_THRID_INVALID);
> +		rte_free(cb);
> +	}
> +
> +	for (qp = 0; qp < dev->data->nb_queue_pairs; qp++)
> +		if (dev->enq_cbs[qp] != NULL) {
> +			free_mem = 0;
> +			break;
> +		}
> +
> +	if (free_mem) {
> +		rte_free(dev->enq_cbs);
> +		dev->enq_cbs = NULL;
> +	}
> +
> +	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
> +
> +	return ret;
> +}
> +#endif
> 
>  int
>  rte_cryptodev_sym_session_init(uint8_t dev_id, diff --git
> a/lib/librte_cryptodev/rte_cryptodev.h
> b/lib/librte_cryptodev/rte_cryptodev.h
> index 7b3ebc2..2c7a47b 100644
> --- a/lib/librte_cryptodev/rte_cryptodev.h
> +++ b/lib/librte_cryptodev/rte_cryptodev.h
> @@ -530,6 +530,32 @@ struct rte_cryptodev_qp_conf {  };
> 
>  /**
> + * Function type used for pre processing crypto ops when enqueue burst
> +is
> + * called.
> + *
> + * The callback function is called on enqueue burst immediately
> + * before the crypto ops are put onto the hardware queue for processing.
> + *
> + * @param	dev_id		The identifier of the device.
> + * @param	qp_id		The index of the queue pair in which ops are
> + *				to be enqueued for processing. The value
> + *				must be in the range [0, nb_queue_pairs - 1]
> + *				previously supplied to
> + *				*rte_cryptodev_configure*.
> + * @param	ops		The address of an array of *nb_ops* pointers
> + *				to *rte_crypto_op* structures which contain
> + *				the crypto operations to be processed.
> + * @param	nb_ops		The number of operations to process.
> + * @param	user_param	The arbitrary user parameter passed in by the
> + *				application when the callback was originally
> + *				registered.
> + * @return			The number of ops to be enqueued to the
> + *				crypto device.
> + */
> +typedef uint16_t (*rte_cryptodev_enq_cb_fn)(uint16_t dev_id, uint16_t
> qp_id,
> +		struct rte_crypto_op **ops, uint16_t nb_ops, void
> *user_param);
> +
> +/**
>   * Typedef for application callback function to be registered by application
>   * software for notification of device events
>   *
> @@ -853,7 +879,6 @@ struct rte_cryptodev_config {
>  		enum rte_cryptodev_event_type event,
>  		rte_cryptodev_cb_fn cb_fn, void *cb_arg);
> 
> -
>  typedef uint16_t (*dequeue_pkt_burst_t)(void *qp,
>  		struct rte_crypto_op **ops,	uint16_t nb_ops);
>  /**< Dequeue processed packets from queue pair of a device. */ @@ -870,6
> +895,17 @@ typedef uint16_t (*enqueue_pkt_burst_t)(void *qp,
>  /** Structure to keep track of registered callbacks */
> TAILQ_HEAD(rte_cryptodev_cb_list, rte_cryptodev_callback);
> 
> +/**
> + * @internal
> + * Structure used to hold information about the callbacks to be called
> +for a
> + * queue pair on enqueue.
> + */
> +struct rte_cryptodev_enq_callback {
> +	struct rte_cryptodev_enq_callback *next;
> +	rte_cryptodev_enq_cb_fn fn;
> +	void *arg;
> +};
> +
>  /** The data structure associated with each crypto device. */  struct
> rte_cryptodev {
>  	dequeue_pkt_burst_t dequeue_burst;
> @@ -898,6 +934,14 @@ struct rte_cryptodev {
>  	__extension__
>  	uint8_t attached : 1;
>  	/**< Flag indicating the device is attached */
> +
> +#ifdef RTE_CRYPTODEV_CALLBACKS
> +	struct rte_cryptodev_enq_callback **enq_cbs;
> +	/**< User application callback for pre enqueue processing */
> +
> +	struct rte_rcu_qsbr *qsbr;
> +	/** < RCU QSBR variable for rte_cryptodev_enq_callback */ #endif
>  } __rte_cache_aligned;
> 
>  void *
> @@ -1019,6 +1063,18 @@ struct rte_cryptodev_data {
>  		struct rte_crypto_op **ops, uint16_t nb_ops)  {
>  	struct rte_cryptodev *dev = &rte_cryptodevs[dev_id];
> +#ifdef RTE_CRYPTODEV_CALLBACKS
> +	if (unlikely(dev->enq_cbs != NULL && dev->enq_cbs[qp_id] != NULL))
Suggest using __atomic_load_n(&dev->enq_cbs[qp_id], __ATOMIC_RELAXED) as this acts as the guard variable for cb->fn and cb->arg.

> {
> +		struct rte_cryptodev_enq_callback *cb =
> +			dev->enq_cbs[qp_id];
> +
> +		do {
> +			nb_ops = cb->fn(dev_id, qp_id, ops, nb_ops,
> +					cb->arg);
> +			cb = cb->next;
> +		} while (cb != NULL);
> +	}
> +#endif
> 
>  	rte_cryptodev_trace_enqueue_burst(dev_id, qp_id, (void **)ops,
> nb_ops);
>  	return (*dev->enqueue_burst)(
> @@ -1351,6 +1407,102 @@ struct rte_cryptodev_asym_session *
>  	struct rte_cryptodev_sym_session *sess, union rte_crypto_sym_ofs
> ofs,
>  	struct rte_crypto_sym_vec *vec);
> 
> +#ifdef RTE_CRYPTODEV_CALLBACKS
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Add a user callback for a given crypto device and queue pair which
> +will be
> + * called on crypto ops enqueue.
> + *
> + * This API configures a function to be called for each burst of crypto
> +ops
> + * received on a given crypto device queue pair. The return value is a
> +pointer
> + * that can be used later to remove the callback using
> + * rte_cryptodev_remove_enq_callback().
> + *
> + * Multiple functions are called in the order that they are added.
> + *
> + * @param	dev_id		The identifier of the device.
> + * @param	qp_id		The index of the queue pair in which ops are
> + *				to be enqueued for processing. The value
> + *				must be in the range [0, nb_queue_pairs - 1]
> + *				previously supplied to
> + *				*rte_cryptodev_configure*.
> + * @param	cb_fn		The callback function
> + * @param	cb_arg		A generic pointer parameter which will be
> passed
> + *				to each invocation of the callback function on
> + *				this crypto device and queue pair.
> + *
> + * @return
> + *   NULL on error.
> + *   On success, a pointer value which can later be used to remove the
> callback.
> + */
> +
> +__rte_experimental
> +struct rte_cryptodev_enq_callback *
> +rte_cryptodev_add_enq_callback(uint8_t dev_id,
> +			       uint16_t qp_id,
> +			       rte_cryptodev_enq_cb_fn cb_fn,
> +			       void *cb_arg);
> +
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Remove a user callback function for given crypto device and queue pair.
> + *
> + * This function is used to removed callbacks that were added to a
> +crypto
> + * device queue pair using rte_cryptodev_add_enq_callback().
> + *
> + * Note: The callback expects a RCU QSBR to be configured to
> +synchronize
> + * to free the memory. Application is expected to configure RCU QSBR
> +after
> + * adding an enqueue callback.
> + *
> + *
> + * @param	dev_id		The identifier of the device.
> + * @param	qp_id		The index of the queue pair in which ops are
> + *				to be enqueued for processing. The value
> + *				must be in the range [0, nb_queue_pairs - 1]
> + *				previously supplied to
> + *				*rte_cryptodev_configure*.
> + * @param	cb		Pointer to user supplied callback created via
> + *				rte_cryptodev_add_enq_callback().
> + *
> + * @return
> + *   - 0: Success. Callback was removed.
> + *   - -EINVAL:  The dev_id or the qp_id is out of range, or the callback
> + *               is NULL or not found for the crypto device queue pair.
> + */
> +
> +__rte_experimental
> +int rte_cryptodev_remove_enq_callback(uint8_t dev_id,
> +				  uint16_t qp_id,
> +				  struct rte_cryptodev_enq_callback *cb);
> +
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Associate RCU QSBR variable with a cryptodev.
> + *
> + * This function is used to add RCU QSBR to a crypto device.
> + * The purpose of RCU is to help multiple threads to synchronize
> + * with each other before initiating adding/removing callback
> + * while dataplane threads are running enqueue callbacks.
> + *
> + * @param	dev_id		The identifier of the device.
> + * @param	qsr		RCU QSBR configuration
> + * @return
> + *   On success - 0
> + *   On error - EINVAL.
> + */
> +
> +__rte_experimental
> +int rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct rte_rcu_qsbr
> +*qsbr); #endif
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_cryptodev/rte_cryptodev_version.map
> b/lib/librte_cryptodev/rte_cryptodev_version.map
> index 02f6dcf..46de3ca 100644
> --- a/lib/librte_cryptodev/rte_cryptodev_version.map
> +++ b/lib/librte_cryptodev/rte_cryptodev_version.map
> @@ -64,6 +64,7 @@ DPDK_20.0 {
>  	rte_cryptodev_sym_capability_get;
>  };
> 
> +
>  EXPERIMENTAL {
>  	global:
> 
> @@ -105,4 +106,9 @@ EXPERIMENTAL {
> 
>  	# added in 20.08
>  	rte_cryptodev_get_qp_status;
> +
> +	# added in 20.11
> +	rte_cryptodev_add_enq_callback;
> +	rte_cryptodev_remove_enq_callback;
> +	rte_cryptodev_rcu_qsbr_add;
>  };
> --
> 1.9.1
Ananyev, Konstantin Sept. 24, 2020, 5:06 p.m. UTC | #13
> 
> > > > > > > > >
> > > > > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS int
> > > > > > > > > +rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct
> > > > > > > > > +rte_rcu_qsbr
> > > > > > > > > +*qsbr) {
> > > > > > > > > +
> > > > > > > > > +	struct rte_cryptodev *dev;
> > > > > > > > > +
> > > > > > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > > > > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
> > > > > > > > > +		return -EINVAL;
> > > > > > > > > +	}
> > > > > > > > > +
> > > > > > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > > > > > +	dev->qsbr = qsbr;
> > > > > > > > > +	return 0;
> > > > > > > > > +}
> > > > > > > >
> > > > > > > > So if I understand your patch correctly you propose a new
> > > > > > > > working model for
> > > > > > > > crypto-devs:
> > > > > > > > 1. Control-plane has to allocate/setup rcu_qsbr and do
> > > > > > > > rte_cryptodev_rcu_qsbr_add().
> > > > > > > > 2. Data-plane has somehow to obtain pointer to that rcu_qsbr
> > > > > > > > and wrap
> > > > > > > > cryptodev_enqueue()
> > > > > > > >    with rcu_qsbr_quiescent()  or
> > rcu_qsbr_online()/rcu_qsbr_offline().
> > > > > > > Yes. I think, it is not a new model. It is same as RCU
> > > > > > > integration with
> > > > LPM.
> > > > > > > Please refer: https://patches.dpdk.org/cover/73673/
> > > > > >
> > > > > > I am talking about new working model for crypto-dev
> > enqueue/dequeue.
> > > > > > As I said above now it becomes data-plane thread responsibility to:
> > > > > >  -somehow to obtain pointer to that rcu_qsbr for each cryptodev
> > > > > > it is
> > > > using.
> > > > > >  -call rcu sync functions (quiescent/online/offline) on a regular basis.
> > > > > It is not on regular basis. When data plane comes up, they report online.
> > > > > They report quiescent when they are done with critical section or
> > > > > shared
> > > > structure.
> > > >
> > > > I understand that, but it means all existing apps have to be changed that
> > way.
> > > >
> > > > > All though, there is some dataplane changes involved here, I don't
> > > > > think, it
> > > > is major.
> > > >
> > > > I still think our goal here should be to make no visible changes to
> > > > the dataplane.
> > > > I.E. all necessary data-plane changes need to be hidden inside CB
> > > > invocation part.
> > > Please note that this is being implemented using the memory
> > > reclamation framework documented at
> > > https://doc.dpdk.org/guides/prog_guide/rcu_lib.html#resource-reclamati
> > > on-framework-for-dpdk
> > >
> > > While using RCU there are couple of trade-offs that applications have to
> > consider:
> > > 1) Performance - reporting the quiescent state too often results in
> > > performance impact on data plane
> > > 2) Amount of outstanding memory to reclaim - reporting less often
> > > results in more outstanding memory to reclaim
> > >
> > > Hence, the quiescent state reporting is left to the application. The
> > > application decides how often it reports the quiescent state and has control
> > over the data plane performance and the outstanding memory to reclaim.
> > >
> > > When you say "new working model for crypto-dev enqueue/dequeue",
> > >
> > > 1) are you comparing these with existing crypto-dev enqueue/dequeue
> > APIs? If yes, these are new APIs, it is not breaking anything.
> > > 2) are you comparing these with existing call back functions in ethdev
> > > enqueue/dequeue APIs? If yes, agree that this is a new model. But, it is
> > possible to support what ethdev supports along with the RCU method used
> > in this patch.
> >
> > What I am talking about:
> >
> > Existing cryptodev enqueue/dequeue model doesn't require for the user to
> > manage any RCU QSBR state manually.
> > I believe that addition of ability to add/remove enqueue/dequeue callbacks
> > shouldn't change existing working model.
> > I think that adding/removing such callbacks has to be opaque to the user DP
> > code and shouldn't require user to change it. Same as we have now for
> > ethdev callback implementation.
> The ethdev callback implementation conveniently leaves the problem of freeing memory to the user to resolve, it does not handle the issue.
> Hence, it "looks" to be opaque to the DP code. However, if the application has to implement a safe way to free the call back memory, its DP
> is affected based on call backs are being used or not.

Yes, I think that's big drawback in initial ethdev callback implementation -
it simply ignores DP/CP sync problem completely.
Though I think it is possible to have both here:
 keep callback "opaque" to DP code and provide some sync mechanism between DP/CP.
Hopefully one day we can fix ethdev callbacks too. 

> > I think that forcing DP code to be aware that callbacks are present or not and
> > to modify its behaviour depending on that nearly voids the purpose of having
> > callbacks at all.
> > In that case DP can just invoke callback function directly from it's codepath .
> >
> > > >
> > > > >
> > > > > > Note that now data-plane thread would have to do that always -
> > > > > > even if there are now callbacks installed for that cryptodev queue
> > right now.
> > > > > > All that changes behaviour of existing apps and I presume would
> > > > > > reduce adoption of  that fature.
> > > If I understand this correct, you are talking about a case where in
> > > the application might be registering/unregistering multiple times
> > > during its lifetime. In this case, yes, the application might be reporting the
> > quiescent state even when it has not registered the call backs. But, it has the
> > flexibility to not report it if it implements additional logic.
> > > Note that we are assuming that the application has to report quiescent
> > > state only for using callback functions. Most probably the application has
> > other requirements to use RCU.
> > > Why not support what is done for ethdev call back functions along with
> > providing RCU method?
> > >
> > > > > There is always trade off involved!
> > > > > In the previous patch, you suggested that some lazy app may not
> > > > > free up the memory allocated by add cb. For such apps, this patch
> > > > > has sync mechanism with some additional cost of CP & DP changes.
> > > >
> > > > Sigh, it is not about laziness of the app.
> > > > The problem with current ethedev cb mechanism and yours V1 (which
> > > > was just a clone of it) - CP doesn't know when it is safe after CB
> > > > removal to free related memory.
> > > >
> > > > > > I still think all this callback mechanism should be totally
> > > > > > opaque to data-plane threads - user shouldn't change his app
> > > > > > code depending on would some enqueue/dequeue callbacks be
> > installed or not.
> > > > > I am not sure, how that can be implemented with existing RCU design.
> > > >
> > > > As I said below the simplest way - with calling rcu onine/offline
> > > > inside CB invocation block.
> > > > That's why I asked you - did you try that approach and what is the
> > > > perf numbers?
> > > > I presume with no callbacks installed the perf change should be nearly
> > zero.
> > > >
> > > > > @Honnappa Nagarahalli, Do you have any suggestions?
> > > Reporting quiescent state in the call back functions has several
> > disadvantages:
> > > 1) it will have performance impacts and the impacts will increase as the
> > number of data plane threads increase.
> > > 2) It will require additional configuration parameters to control how
> > > often the quiescent state is reported to control the performance impact.
> > > 3) Does not take advantage of the fact that most probably the
> > > application is using RCU already
> > > 4) There are few difficulties as well, please see below.
> >
> > I suggested Abhinandan to use RCU library because it is already there, and I
> > thought it would be good not to re-implement the wheel.
> > Though if you feel librte_rcu doesn't match that task - fine, let's do it without
> > librte_rcu.
> > After all, what we need here - just an atomic ref count per queue that we are
> > going to increment at entering and leaving list of callbacks inside
> > enqueue/dequeue.
> Ok, looks like I missed the point that a queue is used by a single data plane thread.
> Along with ref count increment you need the memory orderings to avoid race conditions. These will be the same ones used in RCU.
> On the control plane, you need to read this counter and poll for the counter updates. All this is same cost as in RCU.

Agree.

> To control the cost, you
> will have to control the rate of quiescent state reporting and might have to expose this as a configuration parameter.
> 
> The other important information you have to consider is if the thread is making any blocking calls, which may be in some other library. The
> thread is supposed to call rcu_qsbr_thread_offline API before calling a blocking call. This allows the RCU to know that this particular thread
> will not report quiescent state. The cryptodev library might not have that information.
> 
> If you want to go ahead with this design, you can still use RCU with single thread configuration (like you have mentioned below) and hide
> the details from the application.

Yes,  same thought here -  use rcu_qsbr online/offline for DP part and hide actual sync details inside callback code.

> 
> >
> > >
> > > > >
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > That seems quite a big change and I don't think it is
> > > > > > > > acceptable for most users.
> > > > > > > > From my perspective adding/installing call-backs to the dev
> > > > > > > > has to be opaque to the data-plane code.
> > > > > > > > Also note that different callbacks can be installed by
> > > > > > > > different entities (libs) and might have no idea about each other.
> > > > > > > > That's why I thought it would be better to make all this RCU
> > > > > > > > stuff internal inside cryptodev:
> > > > > > > >     hide all this rcu_qsbr allocation/setup inside cryptod
> > > > > > > > somehow to
> > > > > > obtain pointer to that rcu_qsbr ev init/queue setup
> > > > > > > >     invoke rcu_qsbr_online()/rcu_qsbr_offline() inside
> > > > > > cryptodev_enqueue().
> > > This will bring in the application related information such as the thread ID
> > into the library.
> >
> > I don't think it would.
> > Cryptodev enqueue/dequeue functions are not supposed to be thread safe
> > (same as rx/tx burst).
> > So we can always use RCU with just one thread(thread_id = 0).
> Agree, the memory that needs to be freed is accessed by a single thread on the data plane. RCU with one thread would suffice.
> 
> > But as I said above - if you feel RCU lib is an overhead here, that's fine - I
> > think it would be easy enough to do without librte_rcu.
> >
> > > If the same API calls are being made from multiple data plane threads,
> > > you need a way to configure that information to the library. So, it is
> > > better to leave those details for the application to handle.
> > >
> > > > > > > I have already tried exploring above stuffs. There are too
> > > > > > > many
> > > > constraints.
> > > > > > > The changes don't fit in, as per RCU design.
> > > > > >
> > > > > > Hmm could you be more specific here - what constraints are you
> > > > > > referring to?
> > > > > >
> > > > > > > Moreover, having rcu api under enqueue_burst() will affect the
> > > > > > performance.
> > > > > >
> > > > > > It most likely will. Though my expectation it will affect
> > > > > > performance only when some callbacks are installed. My thought
> > here:
> > > > > > callback function by itself will affect cryptdev_enqueue
> > > > > > performance anyway,
> > > > > With existing callback design, I have measured the
> > > > > performance(with
> > > > crypto perf test) on xeon.
> > > > > It was almost negligible and same was shared with Declan.
> > > >
> > > > I am asking about different thing: did you try alternate approach I
> > > > described, that wouldn't require changes in the user data-plane code.
> > > >
> > > > > That is one of the reasons, I didn't want to add to many stuffs in
> > > > > to the
> > > > callback.
> > > > > The best part of existing design is crypto lib is not much modified.
> > > > > The changes are either pushed to CP or DP.
> > > > >
> > > > > so adding extra overhead for sync is probably ok here.
> > > >
> > > > I think that extra overhead when callbacks are present is expected
> > > > and probably acceptable.
> > > > Changes in the upper-layer data-plane code - probably not.
> > > >
> > > > > > Though for situation when no callbacks are installed -
> > > > > > perfomance should be left unaffected (or impact should be as small
> > as possible).
> > > > > >
> > > > > > > The changes are more on control plane side, which is one time.
> > > > > > > The data plane changes are minimal.
> > > > > >
> > > > > > I still think upper layer data-plane code should stay unaffected
> > > > > > (zero changes).
> > > > > >
> > > > > > > >
> 
> <snip>
Honnappa Nagarahalli Sept. 29, 2020, 5:14 a.m. UTC | #14
<snip>

> >
> > > > > > > > > >
> > > > > > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS int
> > > > > > > > > > +rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct
> > > > > > > > > > +rte_rcu_qsbr
> > > > > > > > > > +*qsbr) {
> > > > > > > > > > +
> > > > > > > > > > +	struct rte_cryptodev *dev;
> > > > > > > > > > +
> > > > > > > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > > > > > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8,
> dev_id);
> > > > > > > > > > +		return -EINVAL;
> > > > > > > > > > +	}
> > > > > > > > > > +
> > > > > > > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > > > > > > +	dev->qsbr = qsbr;
> > > > > > > > > > +	return 0;
> > > > > > > > > > +}
> > > > > > > > >
> > > > > > > > > So if I understand your patch correctly you propose a
> > > > > > > > > new working model for
> > > > > > > > > crypto-devs:
> > > > > > > > > 1. Control-plane has to allocate/setup rcu_qsbr and do
> > > > > > > > > rte_cryptodev_rcu_qsbr_add().
> > > > > > > > > 2. Data-plane has somehow to obtain pointer to that
> > > > > > > > > rcu_qsbr and wrap
> > > > > > > > > cryptodev_enqueue()
> > > > > > > > >    with rcu_qsbr_quiescent()  or
> > > rcu_qsbr_online()/rcu_qsbr_offline().
> > > > > > > > Yes. I think, it is not a new model. It is same as RCU
> > > > > > > > integration with
> > > > > LPM.
> > > > > > > > Please refer: https://patches.dpdk.org/cover/73673/
> > > > > > >
> > > > > > > I am talking about new working model for crypto-dev
> > > enqueue/dequeue.
> > > > > > > As I said above now it becomes data-plane thread responsibility to:
> > > > > > >  -somehow to obtain pointer to that rcu_qsbr for each
> > > > > > > cryptodev it is
> > > > > using.
> > > > > > >  -call rcu sync functions (quiescent/online/offline) on a regular
> basis.
> > > > > > It is not on regular basis. When data plane comes up, they report
> online.
> > > > > > They report quiescent when they are done with critical section
> > > > > > or shared
> > > > > structure.
> > > > >
> > > > > I understand that, but it means all existing apps have to be
> > > > > changed that
> > > way.
> > > > >
> > > > > > All though, there is some dataplane changes involved here, I
> > > > > > don't think, it
> > > > > is major.
> > > > >
> > > > > I still think our goal here should be to make no visible changes
> > > > > to the dataplane.
> > > > > I.E. all necessary data-plane changes need to be hidden inside
> > > > > CB invocation part.
> > > > Please note that this is being implemented using the memory
> > > > reclamation framework documented at
> > > > https://doc.dpdk.org/guides/prog_guide/rcu_lib.html#resource-recla
> > > > mati
> > > > on-framework-for-dpdk
> > > >
> > > > While using RCU there are couple of trade-offs that applications
> > > > have to
> > > consider:
> > > > 1) Performance - reporting the quiescent state too often results
> > > > in performance impact on data plane
> > > > 2) Amount of outstanding memory to reclaim - reporting less often
> > > > results in more outstanding memory to reclaim
> > > >
> > > > Hence, the quiescent state reporting is left to the application.
> > > > The application decides how often it reports the quiescent state
> > > > and has control
> > > over the data plane performance and the outstanding memory to reclaim.
> > > >
> > > > When you say "new working model for crypto-dev enqueue/dequeue",
> > > >
> > > > 1) are you comparing these with existing crypto-dev
> > > > enqueue/dequeue
> > > APIs? If yes, these are new APIs, it is not breaking anything.
> > > > 2) are you comparing these with existing call back functions in
> > > > ethdev enqueue/dequeue APIs? If yes, agree that this is a new
> > > > model. But, it is
> > > possible to support what ethdev supports along with the RCU method
> > > used in this patch.
> > >
> > > What I am talking about:
> > >
> > > Existing cryptodev enqueue/dequeue model doesn't require for the
> > > user to manage any RCU QSBR state manually.
> > > I believe that addition of ability to add/remove enqueue/dequeue
> > > callbacks shouldn't change existing working model.
> > > I think that adding/removing such callbacks has to be opaque to the
> > > user DP code and shouldn't require user to change it. Same as we
> > > have now for ethdev callback implementation.
> > The ethdev callback implementation conveniently leaves the problem of
> freeing memory to the user to resolve, it does not handle the issue.
> > Hence, it "looks" to be opaque to the DP code. However, if the
> > application has to implement a safe way to free the call back memory, its
> DP is affected based on call backs are being used or not.
> 
> Yes, I think that's big drawback in initial ethdev callback implementation - it
> simply ignores DP/CP sync problem completely.
> Though I think it is possible to have both here:
>  keep callback "opaque" to DP code and provide some sync mechanism
> between DP/CP.
> Hopefully one day we can fix ethdev callbacks too.
The solution we develop can be used in ethdev too.

> 
> > > I think that forcing DP code to be aware that callbacks are present
> > > or not and to modify its behaviour depending on that nearly voids
> > > the purpose of having callbacks at all.
> > > In that case DP can just invoke callback function directly from it's
> codepath .
> > >
> > > > >
> > > > > >
> > > > > > > Note that now data-plane thread would have to do that always
> > > > > > > - even if there are now callbacks installed for that
> > > > > > > cryptodev queue
> > > right now.
> > > > > > > All that changes behaviour of existing apps and I presume
> > > > > > > would reduce adoption of  that fature.
> > > > If I understand this correct, you are talking about a case where
> > > > in the application might be registering/unregistering multiple
> > > > times during its lifetime. In this case, yes, the application
> > > > might be reporting the
> > > quiescent state even when it has not registered the call backs. But,
> > > it has the flexibility to not report it if it implements additional logic.
> > > > Note that we are assuming that the application has to report
> > > > quiescent state only for using callback functions. Most probably
> > > > the application has
> > > other requirements to use RCU.
> > > > Why not support what is done for ethdev call back functions along
> > > > with
> > > providing RCU method?
> > > >
> > > > > > There is always trade off involved!
> > > > > > In the previous patch, you suggested that some lazy app may
> > > > > > not free up the memory allocated by add cb. For such apps,
> > > > > > this patch has sync mechanism with some additional cost of CP & DP
> changes.
> > > > >
> > > > > Sigh, it is not about laziness of the app.
> > > > > The problem with current ethedev cb mechanism and yours V1
> > > > > (which was just a clone of it) - CP doesn't know when it is safe
> > > > > after CB removal to free related memory.
> > > > >
> > > > > > > I still think all this callback mechanism should be totally
> > > > > > > opaque to data-plane threads - user shouldn't change his app
> > > > > > > code depending on would some enqueue/dequeue callbacks be
> > > installed or not.
> > > > > > I am not sure, how that can be implemented with existing RCU
> design.
> > > > >
> > > > > As I said below the simplest way - with calling rcu
> > > > > onine/offline inside CB invocation block.
> > > > > That's why I asked you - did you try that approach and what is
> > > > > the perf numbers?
> > > > > I presume with no callbacks installed the perf change should be
> > > > > nearly
> > > zero.
> > > > >
> > > > > > @Honnappa Nagarahalli, Do you have any suggestions?
> > > > Reporting quiescent state in the call back functions has several
> > > disadvantages:
> > > > 1) it will have performance impacts and the impacts will increase
> > > > as the
> > > number of data plane threads increase.
> > > > 2) It will require additional configuration parameters to control
> > > > how often the quiescent state is reported to control the performance
> impact.
> > > > 3) Does not take advantage of the fact that most probably the
> > > > application is using RCU already
> > > > 4) There are few difficulties as well, please see below.
> > >
> > > I suggested Abhinandan to use RCU library because it is already
> > > there, and I thought it would be good not to re-implement the wheel.
> > > Though if you feel librte_rcu doesn't match that task - fine, let's
> > > do it without librte_rcu.
> > > After all, what we need here - just an atomic ref count per queue
> > > that we are going to increment at entering and leaving list of
> > > callbacks inside enqueue/dequeue.
> > Ok, looks like I missed the point that a queue is used by a single data plane
> thread.
> > Along with ref count increment you need the memory orderings to avoid
> race conditions. These will be the same ones used in RCU.
> > On the control plane, you need to read this counter and poll for the
> counter updates. All this is same cost as in RCU.
> 
> Agree.
> 
> > To control the cost, you
> > will have to control the rate of quiescent state reporting and might have to
> expose this as a configuration parameter.
> >
> > The other important information you have to consider is if the thread
> > is making any blocking calls, which may be in some other library. The
> > thread is supposed to call rcu_qsbr_thread_offline API before calling a
> blocking call. This allows the RCU to know that this particular thread will not
> report quiescent state. The cryptodev library might not have that information.
> >
> > If you want to go ahead with this design, you can still use RCU with
> > single thread configuration (like you have mentioned below) and hide the
> details from the application.
> 
> Yes,  same thought here -  use rcu_qsbr online/offline for DP part and hide
> actual sync details inside callback code.
We can give it a try. If we can have the performance numbers, we can decide about how to control how often these APIs are called on the data plane.

> 
> >
> > >
> > > >
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > That seems quite a big change and I don't think it is
> > > > > > > > > acceptable for most users.
> > > > > > > > > From my perspective adding/installing call-backs to the
> > > > > > > > > dev has to be opaque to the data-plane code.
> > > > > > > > > Also note that different callbacks can be installed by
> > > > > > > > > different entities (libs) and might have no idea about each
> other.
> > > > > > > > > That's why I thought it would be better to make all this
> > > > > > > > > RCU stuff internal inside cryptodev:
> > > > > > > > >     hide all this rcu_qsbr allocation/setup inside
> > > > > > > > > cryptod somehow to
> > > > > > > obtain pointer to that rcu_qsbr ev init/queue setup
> > > > > > > > >     invoke rcu_qsbr_online()/rcu_qsbr_offline() inside
> > > > > > > cryptodev_enqueue().
> > > > This will bring in the application related information such as the
> > > > thread ID
> > > into the library.
> > >
> > > I don't think it would.
> > > Cryptodev enqueue/dequeue functions are not supposed to be thread
> > > safe (same as rx/tx burst).
> > > So we can always use RCU with just one thread(thread_id = 0).
> > Agree, the memory that needs to be freed is accessed by a single thread
> on the data plane. RCU with one thread would suffice.
> >
> > > But as I said above - if you feel RCU lib is an overhead here,
> > > that's fine - I think it would be easy enough to do without librte_rcu.
> > >
> > > > If the same API calls are being made from multiple data plane
> > > > threads, you need a way to configure that information to the
> > > > library. So, it is better to leave those details for the application to handle.
> > > >
> > > > > > > > I have already tried exploring above stuffs. There are too
> > > > > > > > many
> > > > > constraints.
> > > > > > > > The changes don't fit in, as per RCU design.
> > > > > > >
> > > > > > > Hmm could you be more specific here - what constraints are
> > > > > > > you referring to?
> > > > > > >
> > > > > > > > Moreover, having rcu api under enqueue_burst() will affect
> > > > > > > > the
> > > > > > > performance.
> > > > > > >
> > > > > > > It most likely will. Though my expectation it will affect
> > > > > > > performance only when some callbacks are installed. My
> > > > > > > thought
> > > here:
> > > > > > > callback function by itself will affect cryptdev_enqueue
> > > > > > > performance anyway,
> > > > > > With existing callback design, I have measured the
> > > > > > performance(with
> > > > > crypto perf test) on xeon.
> > > > > > It was almost negligible and same was shared with Declan.
> > > > >
> > > > > I am asking about different thing: did you try alternate
> > > > > approach I described, that wouldn't require changes in the user data-
> plane code.
> > > > >
> > > > > > That is one of the reasons, I didn't want to add to many
> > > > > > stuffs in to the
> > > > > callback.
> > > > > > The best part of existing design is crypto lib is not much modified.
> > > > > > The changes are either pushed to CP or DP.
> > > > > >
> > > > > > so adding extra overhead for sync is probably ok here.
> > > > >
> > > > > I think that extra overhead when callbacks are present is
> > > > > expected and probably acceptable.
> > > > > Changes in the upper-layer data-plane code - probably not.
> > > > >
> > > > > > > Though for situation when no callbacks are installed -
> > > > > > > perfomance should be left unaffected (or impact should be as
> > > > > > > small
> > > as possible).
> > > > > > >
> > > > > > > > The changes are more on control plane side, which is one time.
> > > > > > > > The data plane changes are minimal.
> > > > > > >
> > > > > > > I still think upper layer data-plane code should stay
> > > > > > > unaffected (zero changes).
> > > > > > >
> > > > > > > > >
> >
> > <snip>
Ananyev, Konstantin Sept. 29, 2020, 9:03 a.m. UTC | #15
> 
> > >
> > > > > > > > > > >
> > > > > > > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS int
> > > > > > > > > > > +rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct
> > > > > > > > > > > +rte_rcu_qsbr
> > > > > > > > > > > +*qsbr) {
> > > > > > > > > > > +
> > > > > > > > > > > +	struct rte_cryptodev *dev;
> > > > > > > > > > > +
> > > > > > > > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > > > > > > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8,
> > dev_id);
> > > > > > > > > > > +		return -EINVAL;
> > > > > > > > > > > +	}
> > > > > > > > > > > +
> > > > > > > > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > > > > > > > +	dev->qsbr = qsbr;
> > > > > > > > > > > +	return 0;
> > > > > > > > > > > +}
> > > > > > > > > >
> > > > > > > > > > So if I understand your patch correctly you propose a
> > > > > > > > > > new working model for
> > > > > > > > > > crypto-devs:
> > > > > > > > > > 1. Control-plane has to allocate/setup rcu_qsbr and do
> > > > > > > > > > rte_cryptodev_rcu_qsbr_add().
> > > > > > > > > > 2. Data-plane has somehow to obtain pointer to that
> > > > > > > > > > rcu_qsbr and wrap
> > > > > > > > > > cryptodev_enqueue()
> > > > > > > > > >    with rcu_qsbr_quiescent()  or
> > > > rcu_qsbr_online()/rcu_qsbr_offline().
> > > > > > > > > Yes. I think, it is not a new model. It is same as RCU
> > > > > > > > > integration with
> > > > > > LPM.
> > > > > > > > > Please refer: https://patches.dpdk.org/cover/73673/
> > > > > > > >
> > > > > > > > I am talking about new working model for crypto-dev
> > > > enqueue/dequeue.
> > > > > > > > As I said above now it becomes data-plane thread responsibility to:
> > > > > > > >  -somehow to obtain pointer to that rcu_qsbr for each
> > > > > > > > cryptodev it is
> > > > > > using.
> > > > > > > >  -call rcu sync functions (quiescent/online/offline) on a regular
> > basis.
> > > > > > > It is not on regular basis. When data plane comes up, they report
> > online.
> > > > > > > They report quiescent when they are done with critical section
> > > > > > > or shared
> > > > > > structure.
> > > > > >
> > > > > > I understand that, but it means all existing apps have to be
> > > > > > changed that
> > > > way.
> > > > > >
> > > > > > > All though, there is some dataplane changes involved here, I
> > > > > > > don't think, it
> > > > > > is major.
> > > > > >
> > > > > > I still think our goal here should be to make no visible changes
> > > > > > to the dataplane.
> > > > > > I.E. all necessary data-plane changes need to be hidden inside
> > > > > > CB invocation part.
> > > > > Please note that this is being implemented using the memory
> > > > > reclamation framework documented at
> > > > > https://doc.dpdk.org/guides/prog_guide/rcu_lib.html#resource-recla
> > > > > mati
> > > > > on-framework-for-dpdk
> > > > >
> > > > > While using RCU there are couple of trade-offs that applications
> > > > > have to
> > > > consider:
> > > > > 1) Performance - reporting the quiescent state too often results
> > > > > in performance impact on data plane
> > > > > 2) Amount of outstanding memory to reclaim - reporting less often
> > > > > results in more outstanding memory to reclaim
> > > > >
> > > > > Hence, the quiescent state reporting is left to the application.
> > > > > The application decides how often it reports the quiescent state
> > > > > and has control
> > > > over the data plane performance and the outstanding memory to reclaim.
> > > > >
> > > > > When you say "new working model for crypto-dev enqueue/dequeue",
> > > > >
> > > > > 1) are you comparing these with existing crypto-dev
> > > > > enqueue/dequeue
> > > > APIs? If yes, these are new APIs, it is not breaking anything.
> > > > > 2) are you comparing these with existing call back functions in
> > > > > ethdev enqueue/dequeue APIs? If yes, agree that this is a new
> > > > > model. But, it is
> > > > possible to support what ethdev supports along with the RCU method
> > > > used in this patch.
> > > >
> > > > What I am talking about:
> > > >
> > > > Existing cryptodev enqueue/dequeue model doesn't require for the
> > > > user to manage any RCU QSBR state manually.
> > > > I believe that addition of ability to add/remove enqueue/dequeue
> > > > callbacks shouldn't change existing working model.
> > > > I think that adding/removing such callbacks has to be opaque to the
> > > > user DP code and shouldn't require user to change it. Same as we
> > > > have now for ethdev callback implementation.
> > > The ethdev callback implementation conveniently leaves the problem of
> > freeing memory to the user to resolve, it does not handle the issue.
> > > Hence, it "looks" to be opaque to the DP code. However, if the
> > > application has to implement a safe way to free the call back memory, its
> > DP is affected based on call backs are being used or not.
> >
> > Yes, I think that's big drawback in initial ethdev callback implementation - it
> > simply ignores DP/CP sync problem completely.
> > Though I think it is possible to have both here:
> >  keep callback "opaque" to DP code and provide some sync mechanism
> > between DP/CP.
> > Hopefully one day we can fix ethdev callbacks too.
> The solution we develop can be used in ethdev too.
> 
> >
> > > > I think that forcing DP code to be aware that callbacks are present
> > > > or not and to modify its behaviour depending on that nearly voids
> > > > the purpose of having callbacks at all.
> > > > In that case DP can just invoke callback function directly from it's
> > codepath .
> > > >
> > > > > >
> > > > > > >
> > > > > > > > Note that now data-plane thread would have to do that always
> > > > > > > > - even if there are now callbacks installed for that
> > > > > > > > cryptodev queue
> > > > right now.
> > > > > > > > All that changes behaviour of existing apps and I presume
> > > > > > > > would reduce adoption of  that fature.
> > > > > If I understand this correct, you are talking about a case where
> > > > > in the application might be registering/unregistering multiple
> > > > > times during its lifetime. In this case, yes, the application
> > > > > might be reporting the
> > > > quiescent state even when it has not registered the call backs. But,
> > > > it has the flexibility to not report it if it implements additional logic.
> > > > > Note that we are assuming that the application has to report
> > > > > quiescent state only for using callback functions. Most probably
> > > > > the application has
> > > > other requirements to use RCU.
> > > > > Why not support what is done for ethdev call back functions along
> > > > > with
> > > > providing RCU method?
> > > > >
> > > > > > > There is always trade off involved!
> > > > > > > In the previous patch, you suggested that some lazy app may
> > > > > > > not free up the memory allocated by add cb. For such apps,
> > > > > > > this patch has sync mechanism with some additional cost of CP & DP
> > changes.
> > > > > >
> > > > > > Sigh, it is not about laziness of the app.
> > > > > > The problem with current ethedev cb mechanism and yours V1
> > > > > > (which was just a clone of it) - CP doesn't know when it is safe
> > > > > > after CB removal to free related memory.
> > > > > >
> > > > > > > > I still think all this callback mechanism should be totally
> > > > > > > > opaque to data-plane threads - user shouldn't change his app
> > > > > > > > code depending on would some enqueue/dequeue callbacks be
> > > > installed or not.
> > > > > > > I am not sure, how that can be implemented with existing RCU
> > design.
> > > > > >
> > > > > > As I said below the simplest way - with calling rcu
> > > > > > onine/offline inside CB invocation block.
> > > > > > That's why I asked you - did you try that approach and what is
> > > > > > the perf numbers?
> > > > > > I presume with no callbacks installed the perf change should be
> > > > > > nearly
> > > > zero.
> > > > > >
> > > > > > > @Honnappa Nagarahalli, Do you have any suggestions?
> > > > > Reporting quiescent state in the call back functions has several
> > > > disadvantages:
> > > > > 1) it will have performance impacts and the impacts will increase
> > > > > as the
> > > > number of data plane threads increase.
> > > > > 2) It will require additional configuration parameters to control
> > > > > how often the quiescent state is reported to control the performance
> > impact.
> > > > > 3) Does not take advantage of the fact that most probably the
> > > > > application is using RCU already
> > > > > 4) There are few difficulties as well, please see below.
> > > >
> > > > I suggested Abhinandan to use RCU library because it is already
> > > > there, and I thought it would be good not to re-implement the wheel.
> > > > Though if you feel librte_rcu doesn't match that task - fine, let's
> > > > do it without librte_rcu.
> > > > After all, what we need here - just an atomic ref count per queue
> > > > that we are going to increment at entering and leaving list of
> > > > callbacks inside enqueue/dequeue.
> > > Ok, looks like I missed the point that a queue is used by a single data plane
> > thread.
> > > Along with ref count increment you need the memory orderings to avoid
> > race conditions. These will be the same ones used in RCU.
> > > On the control plane, you need to read this counter and poll for the
> > counter updates. All this is same cost as in RCU.
> >
> > Agree.
> >
> > > To control the cost, you
> > > will have to control the rate of quiescent state reporting and might have to
> > expose this as a configuration parameter.
> > >
> > > The other important information you have to consider is if the thread
> > > is making any blocking calls, which may be in some other library. The
> > > thread is supposed to call rcu_qsbr_thread_offline API before calling a
> > blocking call. This allows the RCU to know that this particular thread will not
> > report quiescent state. The cryptodev library might not have that information.
> > >
> > > If you want to go ahead with this design, you can still use RCU with
> > > single thread configuration (like you have mentioned below) and hide the
> > details from the application.
> >
> > Yes,  same thought here -  use rcu_qsbr online/offline for DP part and hide
> > actual sync details inside callback code.
> We can give it a try. If we can have the performance numbers, we can decide about how to control how often these APIs are called on the
> data plane.

To avoid misunderstanding: I am talking about calling online/offline
with every cryptodev_enqueue() traversal over CB list.

> 
> >
> > >
> > > >
> > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > That seems quite a big change and I don't think it is
> > > > > > > > > > acceptable for most users.
> > > > > > > > > > From my perspective adding/installing call-backs to the
> > > > > > > > > > dev has to be opaque to the data-plane code.
> > > > > > > > > > Also note that different callbacks can be installed by
> > > > > > > > > > different entities (libs) and might have no idea about each
> > other.
> > > > > > > > > > That's why I thought it would be better to make all this
> > > > > > > > > > RCU stuff internal inside cryptodev:
> > > > > > > > > >     hide all this rcu_qsbr allocation/setup inside
> > > > > > > > > > cryptod somehow to
> > > > > > > > obtain pointer to that rcu_qsbr ev init/queue setup
> > > > > > > > > >     invoke rcu_qsbr_online()/rcu_qsbr_offline() inside
> > > > > > > > cryptodev_enqueue().
> > > > > This will bring in the application related information such as the
> > > > > thread ID
> > > > into the library.
> > > >
> > > > I don't think it would.
> > > > Cryptodev enqueue/dequeue functions are not supposed to be thread
> > > > safe (same as rx/tx burst).
> > > > So we can always use RCU with just one thread(thread_id = 0).
> > > Agree, the memory that needs to be freed is accessed by a single thread
> > on the data plane. RCU with one thread would suffice.
> > >
> > > > But as I said above - if you feel RCU lib is an overhead here,
> > > > that's fine - I think it would be easy enough to do without librte_rcu.
> > > >
> > > > > If the same API calls are being made from multiple data plane
> > > > > threads, you need a way to configure that information to the
> > > > > library. So, it is better to leave those details for the application to handle.
> > > > >
> > > > > > > > > I have already tried exploring above stuffs. There are too
> > > > > > > > > many
> > > > > > constraints.
> > > > > > > > > The changes don't fit in, as per RCU design.
> > > > > > > >
> > > > > > > > Hmm could you be more specific here - what constraints are
> > > > > > > > you referring to?
> > > > > > > >
> > > > > > > > > Moreover, having rcu api under enqueue_burst() will affect
> > > > > > > > > the
> > > > > > > > performance.
> > > > > > > >
> > > > > > > > It most likely will. Though my expectation it will affect
> > > > > > > > performance only when some callbacks are installed. My
> > > > > > > > thought
> > > > here:
> > > > > > > > callback function by itself will affect cryptdev_enqueue
> > > > > > > > performance anyway,
> > > > > > > With existing callback design, I have measured the
> > > > > > > performance(with
> > > > > > crypto perf test) on xeon.
> > > > > > > It was almost negligible and same was shared with Declan.
> > > > > >
> > > > > > I am asking about different thing: did you try alternate
> > > > > > approach I described, that wouldn't require changes in the user data-
> > plane code.
> > > > > >
> > > > > > > That is one of the reasons, I didn't want to add to many
> > > > > > > stuffs in to the
> > > > > > callback.
> > > > > > > The best part of existing design is crypto lib is not much modified.
> > > > > > > The changes are either pushed to CP or DP.
> > > > > > >
> > > > > > > so adding extra overhead for sync is probably ok here.
> > > > > >
> > > > > > I think that extra overhead when callbacks are present is
> > > > > > expected and probably acceptable.
> > > > > > Changes in the upper-layer data-plane code - probably not.
> > > > > >
> > > > > > > > Though for situation when no callbacks are installed -
> > > > > > > > perfomance should be left unaffected (or impact should be as
> > > > > > > > small
> > > > as possible).
> > > > > > > >
> > > > > > > > > The changes are more on control plane side, which is one time.
> > > > > > > > > The data plane changes are minimal.
> > > > > > > >
> > > > > > > > I still think upper layer data-plane code should stay
> > > > > > > > unaffected (zero changes).
> > > > > > > >
> > > > > > > > > >
> > >
> > > <snip>
Abhinandan Gujjar Oct. 8, 2020, 5:36 p.m. UTC | #16
Hi Konstantin & Honnappa,

Thanks for all the inputs and feedback.

@Ananyev, Konstantin,
I have measured the perf with and without callback on xeon. Here are the numbers:

./app/dpdk-test-crypto-perf -l 6-7 --vdev="crypto_openssl0,socket_id=0,max_nb_sessions=128" -- --ptest throughput --devtype crypto_openssl --optype cipher-then-auth --cipher-algo aes-cbc --cipher-op encrypt --cipher-key-sz 16 --auth-algo sha1-hmac --auth-op generate --auth-key-sz 64 --digest-sz 12 --total-ops 10000000 --burst-sz 32 --buffer-sz 64

With callback(+ RCU - totally opaque to data-plane threads)
    lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq  Failed Deq        MOps        Gbps  Cycles/Buf
           7          64          32                10000000    10000000           0           0                       0.8129      0.4162     2694.09
           7          64          32                10000000    10000000           0           0                       0.8152      0.4174     2686.31
           7          64          32                10000000    10000000           0           0                       0.8198      0.4197     2671.48

Without callback:
    lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq  Failed Deq        MOps        Gbps  Cycles/Buf

           7          64          32               10000000    10000000           0           0                        0.8234      0.4216     2659.81
           7          64          32               10000000    10000000           0           0                        0.8247      0.4222     2655.63
           7          64          32               10000000    10000000           0           0                        0.8123      0.4159     2695.90


Regards
Abhinandan

> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Tuesday, September 29, 2020 2:33 PM
> To: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gujjar,
> Abhinandan S <abhinandan.gujjar@intel.com>; dev@dpdk.org; Doherty, Declan
> <declan.doherty@intel.com>
> Cc: jerinj@marvell.com; Akhil.goyal@nxp.com; Vangati, Narender
> <narender.vangati@intel.com>; nd <nd@arm.com>; nd <nd@arm.com>
> Subject: RE: [dpdk-dev] [v2 1/2] cryptodev: support enqueue callback functions
> 
> 
> >
> > > >
> > > > > > > > > > > >
> > > > > > > > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS int
> > > > > > > > > > > > +rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct
> > > > > > > > > > > > +rte_rcu_qsbr
> > > > > > > > > > > > +*qsbr) {
> > > > > > > > > > > > +
> > > > > > > > > > > > +	struct rte_cryptodev *dev;
> > > > > > > > > > > > +
> > > > > > > > > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > > > > > > > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8,
> > > dev_id);
> > > > > > > > > > > > +		return -EINVAL;
> > > > > > > > > > > > +	}
> > > > > > > > > > > > +
> > > > > > > > > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > > > > > > > > +	dev->qsbr = qsbr;
> > > > > > > > > > > > +	return 0;
> > > > > > > > > > > > +}
> > > > > > > > > > >
> > > > > > > > > > > So if I understand your patch correctly you propose
> > > > > > > > > > > a new working model for
> > > > > > > > > > > crypto-devs:
> > > > > > > > > > > 1. Control-plane has to allocate/setup rcu_qsbr and
> > > > > > > > > > > do rte_cryptodev_rcu_qsbr_add().
> > > > > > > > > > > 2. Data-plane has somehow to obtain pointer to that
> > > > > > > > > > > rcu_qsbr and wrap
> > > > > > > > > > > cryptodev_enqueue()
> > > > > > > > > > >    with rcu_qsbr_quiescent()  or
> > > > > rcu_qsbr_online()/rcu_qsbr_offline().
> > > > > > > > > > Yes. I think, it is not a new model. It is same as RCU
> > > > > > > > > > integration with
> > > > > > > LPM.
> > > > > > > > > > Please refer: https://patches.dpdk.org/cover/73673/
> > > > > > > > >
> > > > > > > > > I am talking about new working model for crypto-dev
> > > > > enqueue/dequeue.
> > > > > > > > > As I said above now it becomes data-plane thread responsibility
> to:
> > > > > > > > >  -somehow to obtain pointer to that rcu_qsbr for each
> > > > > > > > > cryptodev it is
> > > > > > > using.
> > > > > > > > >  -call rcu sync functions (quiescent/online/offline) on
> > > > > > > > > a regular
> > > basis.
> > > > > > > > It is not on regular basis. When data plane comes up, they
> > > > > > > > report
> > > online.
> > > > > > > > They report quiescent when they are done with critical
> > > > > > > > section or shared
> > > > > > > structure.
> > > > > > >
> > > > > > > I understand that, but it means all existing apps have to be
> > > > > > > changed that
> > > > > way.
> > > > > > >
> > > > > > > > All though, there is some dataplane changes involved here,
> > > > > > > > I don't think, it
> > > > > > > is major.
> > > > > > >
> > > > > > > I still think our goal here should be to make no visible
> > > > > > > changes to the dataplane.
> > > > > > > I.E. all necessary data-plane changes need to be hidden
> > > > > > > inside CB invocation part.
> > > > > > Please note that this is being implemented using the memory
> > > > > > reclamation framework documented at
> > > > > > https://doc.dpdk.org/guides/prog_guide/rcu_lib.html#resource-r
> > > > > > ecla
> > > > > > mati
> > > > > > on-framework-for-dpdk
> > > > > >
> > > > > > While using RCU there are couple of trade-offs that
> > > > > > applications have to
> > > > > consider:
> > > > > > 1) Performance - reporting the quiescent state too often
> > > > > > results in performance impact on data plane
> > > > > > 2) Amount of outstanding memory to reclaim - reporting less
> > > > > > often results in more outstanding memory to reclaim
> > > > > >
> > > > > > Hence, the quiescent state reporting is left to the application.
> > > > > > The application decides how often it reports the quiescent
> > > > > > state and has control
> > > > > over the data plane performance and the outstanding memory to
> reclaim.
> > > > > >
> > > > > > When you say "new working model for crypto-dev
> > > > > > enqueue/dequeue",
> > > > > >
> > > > > > 1) are you comparing these with existing crypto-dev
> > > > > > enqueue/dequeue
> > > > > APIs? If yes, these are new APIs, it is not breaking anything.
> > > > > > 2) are you comparing these with existing call back functions
> > > > > > in ethdev enqueue/dequeue APIs? If yes, agree that this is a
> > > > > > new model. But, it is
> > > > > possible to support what ethdev supports along with the RCU
> > > > > method used in this patch.
> > > > >
> > > > > What I am talking about:
> > > > >
> > > > > Existing cryptodev enqueue/dequeue model doesn't require for the
> > > > > user to manage any RCU QSBR state manually.
> > > > > I believe that addition of ability to add/remove enqueue/dequeue
> > > > > callbacks shouldn't change existing working model.
> > > > > I think that adding/removing such callbacks has to be opaque to
> > > > > the user DP code and shouldn't require user to change it. Same
> > > > > as we have now for ethdev callback implementation.
> > > > The ethdev callback implementation conveniently leaves the problem
> > > > of
> > > freeing memory to the user to resolve, it does not handle the issue.
> > > > Hence, it "looks" to be opaque to the DP code. However, if the
> > > > application has to implement a safe way to free the call back
> > > > memory, its
> > > DP is affected based on call backs are being used or not.
> > >
> > > Yes, I think that's big drawback in initial ethdev callback
> > > implementation - it simply ignores DP/CP sync problem completely.
> > > Though I think it is possible to have both here:
> > >  keep callback "opaque" to DP code and provide some sync mechanism
> > > between DP/CP.
> > > Hopefully one day we can fix ethdev callbacks too.
> > The solution we develop can be used in ethdev too.
> >
> > >
> > > > > I think that forcing DP code to be aware that callbacks are
> > > > > present or not and to modify its behaviour depending on that
> > > > > nearly voids the purpose of having callbacks at all.
> > > > > In that case DP can just invoke callback function directly from
> > > > > it's
> > > codepath .
> > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > > Note that now data-plane thread would have to do that
> > > > > > > > > always
> > > > > > > > > - even if there are now callbacks installed for that
> > > > > > > > > cryptodev queue
> > > > > right now.
> > > > > > > > > All that changes behaviour of existing apps and I
> > > > > > > > > presume would reduce adoption of  that fature.
> > > > > > If I understand this correct, you are talking about a case
> > > > > > where in the application might be registering/unregistering
> > > > > > multiple times during its lifetime. In this case, yes, the
> > > > > > application might be reporting the
> > > > > quiescent state even when it has not registered the call backs.
> > > > > But, it has the flexibility to not report it if it implements additional logic.
> > > > > > Note that we are assuming that the application has to report
> > > > > > quiescent state only for using callback functions. Most
> > > > > > probably the application has
> > > > > other requirements to use RCU.
> > > > > > Why not support what is done for ethdev call back functions
> > > > > > along with
> > > > > providing RCU method?
> > > > > >
> > > > > > > > There is always trade off involved!
> > > > > > > > In the previous patch, you suggested that some lazy app
> > > > > > > > may not free up the memory allocated by add cb. For such
> > > > > > > > apps, this patch has sync mechanism with some additional
> > > > > > > > cost of CP & DP
> > > changes.
> > > > > > >
> > > > > > > Sigh, it is not about laziness of the app.
> > > > > > > The problem with current ethedev cb mechanism and yours V1
> > > > > > > (which was just a clone of it) - CP doesn't know when it is
> > > > > > > safe after CB removal to free related memory.
> > > > > > >
> > > > > > > > > I still think all this callback mechanism should be
> > > > > > > > > totally opaque to data-plane threads - user shouldn't
> > > > > > > > > change his app code depending on would some
> > > > > > > > > enqueue/dequeue callbacks be
> > > > > installed or not.
> > > > > > > > I am not sure, how that can be implemented with existing
> > > > > > > > RCU
> > > design.
> > > > > > >
> > > > > > > As I said below the simplest way - with calling rcu
> > > > > > > onine/offline inside CB invocation block.
> > > > > > > That's why I asked you - did you try that approach and what
> > > > > > > is the perf numbers?
> > > > > > > I presume with no callbacks installed the perf change should
> > > > > > > be nearly
> > > > > zero.
> > > > > > >
> > > > > > > > @Honnappa Nagarahalli, Do you have any suggestions?
> > > > > > Reporting quiescent state in the call back functions has
> > > > > > several
> > > > > disadvantages:
> > > > > > 1) it will have performance impacts and the impacts will
> > > > > > increase as the
> > > > > number of data plane threads increase.
> > > > > > 2) It will require additional configuration parameters to
> > > > > > control how often the quiescent state is reported to control
> > > > > > the performance
> > > impact.
> > > > > > 3) Does not take advantage of the fact that most probably the
> > > > > > application is using RCU already
> > > > > > 4) There are few difficulties as well, please see below.
> > > > >
> > > > > I suggested Abhinandan to use RCU library because it is already
> > > > > there, and I thought it would be good not to re-implement the wheel.
> > > > > Though if you feel librte_rcu doesn't match that task - fine,
> > > > > let's do it without librte_rcu.
> > > > > After all, what we need here - just an atomic ref count per
> > > > > queue that we are going to increment at entering and leaving
> > > > > list of callbacks inside enqueue/dequeue.
> > > > Ok, looks like I missed the point that a queue is used by a single
> > > > data plane
> > > thread.
> > > > Along with ref count increment you need the memory orderings to
> > > > avoid
> > > race conditions. These will be the same ones used in RCU.
> > > > On the control plane, you need to read this counter and poll for
> > > > the
> > > counter updates. All this is same cost as in RCU.
> > >
> > > Agree.
> > >
> > > > To control the cost, you
> > > > will have to control the rate of quiescent state reporting and
> > > > might have to
> > > expose this as a configuration parameter.
> > > >
> > > > The other important information you have to consider is if the
> > > > thread is making any blocking calls, which may be in some other
> > > > library. The thread is supposed to call rcu_qsbr_thread_offline
> > > > API before calling a
> > > blocking call. This allows the RCU to know that this particular
> > > thread will not report quiescent state. The cryptodev library might not have
> that information.
> > > >
> > > > If you want to go ahead with this design, you can still use RCU
> > > > with single thread configuration (like you have mentioned below)
> > > > and hide the
> > > details from the application.
> > >
> > > Yes,  same thought here -  use rcu_qsbr online/offline for DP part
> > > and hide actual sync details inside callback code.
> > We can give it a try. If we can have the performance numbers, we can
> > decide about how to control how often these APIs are called on the data
> plane.
> 
> To avoid misunderstanding: I am talking about calling online/offline with every
> cryptodev_enqueue() traversal over CB list.
> 
> >
> > >
> > > >
> > > > >
> > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > That seems quite a big change and I don't think it
> > > > > > > > > > > is acceptable for most users.
> > > > > > > > > > > From my perspective adding/installing call-backs to
> > > > > > > > > > > the dev has to be opaque to the data-plane code.
> > > > > > > > > > > Also note that different callbacks can be installed
> > > > > > > > > > > by different entities (libs) and might have no idea
> > > > > > > > > > > about each
> > > other.
> > > > > > > > > > > That's why I thought it would be better to make all
> > > > > > > > > > > this RCU stuff internal inside cryptodev:
> > > > > > > > > > >     hide all this rcu_qsbr allocation/setup inside
> > > > > > > > > > > cryptod somehow to
> > > > > > > > > obtain pointer to that rcu_qsbr ev init/queue setup
> > > > > > > > > > >     invoke rcu_qsbr_online()/rcu_qsbr_offline()
> > > > > > > > > > > inside
> > > > > > > > > cryptodev_enqueue().
> > > > > > This will bring in the application related information such as
> > > > > > the thread ID
> > > > > into the library.
> > > > >
> > > > > I don't think it would.
> > > > > Cryptodev enqueue/dequeue functions are not supposed to be
> > > > > thread safe (same as rx/tx burst).
> > > > > So we can always use RCU with just one thread(thread_id = 0).
> > > > Agree, the memory that needs to be freed is accessed by a single
> > > > thread
> > > on the data plane. RCU with one thread would suffice.
> > > >
> > > > > But as I said above - if you feel RCU lib is an overhead here,
> > > > > that's fine - I think it would be easy enough to do without librte_rcu.
> > > > >
> > > > > > If the same API calls are being made from multiple data plane
> > > > > > threads, you need a way to configure that information to the
> > > > > > library. So, it is better to leave those details for the application to
> handle.
> > > > > >
> > > > > > > > > > I have already tried exploring above stuffs. There are
> > > > > > > > > > too many
> > > > > > > constraints.
> > > > > > > > > > The changes don't fit in, as per RCU design.
> > > > > > > > >
> > > > > > > > > Hmm could you be more specific here - what constraints
> > > > > > > > > are you referring to?
> > > > > > > > >
> > > > > > > > > > Moreover, having rcu api under enqueue_burst() will
> > > > > > > > > > affect the
> > > > > > > > > performance.
> > > > > > > > >
> > > > > > > > > It most likely will. Though my expectation it will
> > > > > > > > > affect performance only when some callbacks are
> > > > > > > > > installed. My thought
> > > > > here:
> > > > > > > > > callback function by itself will affect cryptdev_enqueue
> > > > > > > > > performance anyway,
> > > > > > > > With existing callback design, I have measured the
> > > > > > > > performance(with
> > > > > > > crypto perf test) on xeon.
> > > > > > > > It was almost negligible and same was shared with Declan.
> > > > > > >
> > > > > > > I am asking about different thing: did you try alternate
> > > > > > > approach I described, that wouldn't require changes in the
> > > > > > > user data-
> > > plane code.
> > > > > > >
> > > > > > > > That is one of the reasons, I didn't want to add to many
> > > > > > > > stuffs in to the
> > > > > > > callback.
> > > > > > > > The best part of existing design is crypto lib is not much modified.
> > > > > > > > The changes are either pushed to CP or DP.
> > > > > > > >
> > > > > > > > so adding extra overhead for sync is probably ok here.
> > > > > > >
> > > > > > > I think that extra overhead when callbacks are present is
> > > > > > > expected and probably acceptable.
> > > > > > > Changes in the upper-layer data-plane code - probably not.
> > > > > > >
> > > > > > > > > Though for situation when no callbacks are installed -
> > > > > > > > > perfomance should be left unaffected (or impact should
> > > > > > > > > be as small
> > > > > as possible).
> > > > > > > > >
> > > > > > > > > > The changes are more on control plane side, which is one
> time.
> > > > > > > > > > The data plane changes are minimal.
> > > > > > > > >
> > > > > > > > > I still think upper layer data-plane code should stay
> > > > > > > > > unaffected (zero changes).
> > > > > > > > >
> > > > > > > > > > >
> > > >
> > > > <snip>
Ananyev, Konstantin Oct. 9, 2020, 2:40 p.m. UTC | #17
Hi Abhinandan,

> 
> Hi Konstantin & Honnappa,
> 
> Thanks for all the inputs and feedback.
> 
> @Ananyev, Konstantin,
> I have measured the perf with and without callback on xeon. Here are the numbers:
> 
> ./app/dpdk-test-crypto-perf -l 6-7 --vdev="crypto_openssl0,socket_id=0,max_nb_sessions=128" -- --ptest throughput --devtype
> crypto_openssl --optype cipher-then-auth --cipher-algo aes-cbc --cipher-op encrypt --cipher-key-sz 16 --auth-algo sha1-hmac --auth-op
> generate --auth-key-sz 64 --digest-sz 12 --total-ops 10000000 --burst-sz 32 --buffer-sz 64
> 
> With callback(+ RCU - totally opaque to data-plane threads)
>     lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq  Failed Deq        MOps        Gbps  Cycles/Buf
>            7          64          32                10000000    10000000           0           0                       0.8129      0.4162     2694.09
>            7          64          32                10000000    10000000           0           0                       0.8152      0.4174     2686.31
>            7          64          32                10000000    10000000           0           0                       0.8198      0.4197     2671.48
> 
> Without callback:
>     lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq  Failed Deq        MOps        Gbps  Cycles/Buf
> 
>            7          64          32               10000000    10000000           0           0                        0.8234      0.4216     2659.81
>            7          64          32               10000000    10000000           0           0                        0.8247      0.4222     2655.63
>            7          64          32               10000000    10000000           0           0                        0.8123      0.4159     2695.90


Just to cofirm:
You implemented crypto enqueue callbacks using RCU QSBR online /offline
(as suggested below) and numbers above are for:
1) callback code in place and some dummy callback installed
2) callback code in place but no callbacks installed 
  
Is my understanding correct here?
Thanks
Konstantin

 
> 
> Regards
> Abhinandan
> 
> > -----Original Message-----
> > From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > Sent: Tuesday, September 29, 2020 2:33 PM
> > To: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gujjar,
> > Abhinandan S <abhinandan.gujjar@intel.com>; dev@dpdk.org; Doherty, Declan
> > <declan.doherty@intel.com>
> > Cc: jerinj@marvell.com; Akhil.goyal@nxp.com; Vangati, Narender
> > <narender.vangati@intel.com>; nd <nd@arm.com>; nd <nd@arm.com>
> > Subject: RE: [dpdk-dev] [v2 1/2] cryptodev: support enqueue callback functions
> >
> >
> > >
> > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS int
> > > > > > > > > > > > > +rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct
> > > > > > > > > > > > > +rte_rcu_qsbr
> > > > > > > > > > > > > +*qsbr) {
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +	struct rte_cryptodev *dev;
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > > > > > > > > > +		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8,
> > > > dev_id);
> > > > > > > > > > > > > +		return -EINVAL;
> > > > > > > > > > > > > +	}
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > > > > > > > > > +	dev->qsbr = qsbr;
> > > > > > > > > > > > > +	return 0;
> > > > > > > > > > > > > +}
> > > > > > > > > > > >
> > > > > > > > > > > > So if I understand your patch correctly you propose
> > > > > > > > > > > > a new working model for
> > > > > > > > > > > > crypto-devs:
> > > > > > > > > > > > 1. Control-plane has to allocate/setup rcu_qsbr and
> > > > > > > > > > > > do rte_cryptodev_rcu_qsbr_add().
> > > > > > > > > > > > 2. Data-plane has somehow to obtain pointer to that
> > > > > > > > > > > > rcu_qsbr and wrap
> > > > > > > > > > > > cryptodev_enqueue()
> > > > > > > > > > > >    with rcu_qsbr_quiescent()  or
> > > > > > rcu_qsbr_online()/rcu_qsbr_offline().
> > > > > > > > > > > Yes. I think, it is not a new model. It is same as RCU
> > > > > > > > > > > integration with
> > > > > > > > LPM.
> > > > > > > > > > > Please refer: https://patches.dpdk.org/cover/73673/
> > > > > > > > > >
> > > > > > > > > > I am talking about new working model for crypto-dev
> > > > > > enqueue/dequeue.
> > > > > > > > > > As I said above now it becomes data-plane thread responsibility
> > to:
> > > > > > > > > >  -somehow to obtain pointer to that rcu_qsbr for each
> > > > > > > > > > cryptodev it is
> > > > > > > > using.
> > > > > > > > > >  -call rcu sync functions (quiescent/online/offline) on
> > > > > > > > > > a regular
> > > > basis.
> > > > > > > > > It is not on regular basis. When data plane comes up, they
> > > > > > > > > report
> > > > online.
> > > > > > > > > They report quiescent when they are done with critical
> > > > > > > > > section or shared
> > > > > > > > structure.
> > > > > > > >
> > > > > > > > I understand that, but it means all existing apps have to be
> > > > > > > > changed that
> > > > > > way.
> > > > > > > >
> > > > > > > > > All though, there is some dataplane changes involved here,
> > > > > > > > > I don't think, it
> > > > > > > > is major.
> > > > > > > >
> > > > > > > > I still think our goal here should be to make no visible
> > > > > > > > changes to the dataplane.
> > > > > > > > I.E. all necessary data-plane changes need to be hidden
> > > > > > > > inside CB invocation part.
> > > > > > > Please note that this is being implemented using the memory
> > > > > > > reclamation framework documented at
> > > > > > > https://doc.dpdk.org/guides/prog_guide/rcu_lib.html#resource-r
> > > > > > > ecla
> > > > > > > mati
> > > > > > > on-framework-for-dpdk
> > > > > > >
> > > > > > > While using RCU there are couple of trade-offs that
> > > > > > > applications have to
> > > > > > consider:
> > > > > > > 1) Performance - reporting the quiescent state too often
> > > > > > > results in performance impact on data plane
> > > > > > > 2) Amount of outstanding memory to reclaim - reporting less
> > > > > > > often results in more outstanding memory to reclaim
> > > > > > >
> > > > > > > Hence, the quiescent state reporting is left to the application.
> > > > > > > The application decides how often it reports the quiescent
> > > > > > > state and has control
> > > > > > over the data plane performance and the outstanding memory to
> > reclaim.
> > > > > > >
> > > > > > > When you say "new working model for crypto-dev
> > > > > > > enqueue/dequeue",
> > > > > > >
> > > > > > > 1) are you comparing these with existing crypto-dev
> > > > > > > enqueue/dequeue
> > > > > > APIs? If yes, these are new APIs, it is not breaking anything.
> > > > > > > 2) are you comparing these with existing call back functions
> > > > > > > in ethdev enqueue/dequeue APIs? If yes, agree that this is a
> > > > > > > new model. But, it is
> > > > > > possible to support what ethdev supports along with the RCU
> > > > > > method used in this patch.
> > > > > >
> > > > > > What I am talking about:
> > > > > >
> > > > > > Existing cryptodev enqueue/dequeue model doesn't require for the
> > > > > > user to manage any RCU QSBR state manually.
> > > > > > I believe that addition of ability to add/remove enqueue/dequeue
> > > > > > callbacks shouldn't change existing working model.
> > > > > > I think that adding/removing such callbacks has to be opaque to
> > > > > > the user DP code and shouldn't require user to change it. Same
> > > > > > as we have now for ethdev callback implementation.
> > > > > The ethdev callback implementation conveniently leaves the problem
> > > > > of
> > > > freeing memory to the user to resolve, it does not handle the issue.
> > > > > Hence, it "looks" to be opaque to the DP code. However, if the
> > > > > application has to implement a safe way to free the call back
> > > > > memory, its
> > > > DP is affected based on call backs are being used or not.
> > > >
> > > > Yes, I think that's big drawback in initial ethdev callback
> > > > implementation - it simply ignores DP/CP sync problem completely.
> > > > Though I think it is possible to have both here:
> > > >  keep callback "opaque" to DP code and provide some sync mechanism
> > > > between DP/CP.
> > > > Hopefully one day we can fix ethdev callbacks too.
> > > The solution we develop can be used in ethdev too.
> > >
> > > >
> > > > > > I think that forcing DP code to be aware that callbacks are
> > > > > > present or not and to modify its behaviour depending on that
> > > > > > nearly voids the purpose of having callbacks at all.
> > > > > > In that case DP can just invoke callback function directly from
> > > > > > it's
> > > > codepath .
> > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > Note that now data-plane thread would have to do that
> > > > > > > > > > always
> > > > > > > > > > - even if there are now callbacks installed for that
> > > > > > > > > > cryptodev queue
> > > > > > right now.
> > > > > > > > > > All that changes behaviour of existing apps and I
> > > > > > > > > > presume would reduce adoption of  that fature.
> > > > > > > If I understand this correct, you are talking about a case
> > > > > > > where in the application might be registering/unregistering
> > > > > > > multiple times during its lifetime. In this case, yes, the
> > > > > > > application might be reporting the
> > > > > > quiescent state even when it has not registered the call backs.
> > > > > > But, it has the flexibility to not report it if it implements additional logic.
> > > > > > > Note that we are assuming that the application has to report
> > > > > > > quiescent state only for using callback functions. Most
> > > > > > > probably the application has
> > > > > > other requirements to use RCU.
> > > > > > > Why not support what is done for ethdev call back functions
> > > > > > > along with
> > > > > > providing RCU method?
> > > > > > >
> > > > > > > > > There is always trade off involved!
> > > > > > > > > In the previous patch, you suggested that some lazy app
> > > > > > > > > may not free up the memory allocated by add cb. For such
> > > > > > > > > apps, this patch has sync mechanism with some additional
> > > > > > > > > cost of CP & DP
> > > > changes.
> > > > > > > >
> > > > > > > > Sigh, it is not about laziness of the app.
> > > > > > > > The problem with current ethedev cb mechanism and yours V1
> > > > > > > > (which was just a clone of it) - CP doesn't know when it is
> > > > > > > > safe after CB removal to free related memory.
> > > > > > > >
> > > > > > > > > > I still think all this callback mechanism should be
> > > > > > > > > > totally opaque to data-plane threads - user shouldn't
> > > > > > > > > > change his app code depending on would some
> > > > > > > > > > enqueue/dequeue callbacks be
> > > > > > installed or not.
> > > > > > > > > I am not sure, how that can be implemented with existing
> > > > > > > > > RCU
> > > > design.
> > > > > > > >
> > > > > > > > As I said below the simplest way - with calling rcu
> > > > > > > > onine/offline inside CB invocation block.
> > > > > > > > That's why I asked you - did you try that approach and what
> > > > > > > > is the perf numbers?
> > > > > > > > I presume with no callbacks installed the perf change should
> > > > > > > > be nearly
> > > > > > zero.
> > > > > > > >
> > > > > > > > > @Honnappa Nagarahalli, Do you have any suggestions?
> > > > > > > Reporting quiescent state in the call back functions has
> > > > > > > several
> > > > > > disadvantages:
> > > > > > > 1) it will have performance impacts and the impacts will
> > > > > > > increase as the
> > > > > > number of data plane threads increase.
> > > > > > > 2) It will require additional configuration parameters to
> > > > > > > control how often the quiescent state is reported to control
> > > > > > > the performance
> > > > impact.
> > > > > > > 3) Does not take advantage of the fact that most probably the
> > > > > > > application is using RCU already
> > > > > > > 4) There are few difficulties as well, please see below.
> > > > > >
> > > > > > I suggested Abhinandan to use RCU library because it is already
> > > > > > there, and I thought it would be good not to re-implement the wheel.
> > > > > > Though if you feel librte_rcu doesn't match that task - fine,
> > > > > > let's do it without librte_rcu.
> > > > > > After all, what we need here - just an atomic ref count per
> > > > > > queue that we are going to increment at entering and leaving
> > > > > > list of callbacks inside enqueue/dequeue.
> > > > > Ok, looks like I missed the point that a queue is used by a single
> > > > > data plane
> > > > thread.
> > > > > Along with ref count increment you need the memory orderings to
> > > > > avoid
> > > > race conditions. These will be the same ones used in RCU.
> > > > > On the control plane, you need to read this counter and poll for
> > > > > the
> > > > counter updates. All this is same cost as in RCU.
> > > >
> > > > Agree.
> > > >
> > > > > To control the cost, you
> > > > > will have to control the rate of quiescent state reporting and
> > > > > might have to
> > > > expose this as a configuration parameter.
> > > > >
> > > > > The other important information you have to consider is if the
> > > > > thread is making any blocking calls, which may be in some other
> > > > > library. The thread is supposed to call rcu_qsbr_thread_offline
> > > > > API before calling a
> > > > blocking call. This allows the RCU to know that this particular
> > > > thread will not report quiescent state. The cryptodev library might not have
> > that information.
> > > > >
> > > > > If you want to go ahead with this design, you can still use RCU
> > > > > with single thread configuration (like you have mentioned below)
> > > > > and hide the
> > > > details from the application.
> > > >
> > > > Yes,  same thought here -  use rcu_qsbr online/offline for DP part
> > > > and hide actual sync details inside callback code.
> > > We can give it a try. If we can have the performance numbers, we can
> > > decide about how to control how often these APIs are called on the data
> > plane.
> >
> > To avoid misunderstanding: I am talking about calling online/offline with every
> > cryptodev_enqueue() traversal over CB list.
> >
> > >
> > > >
> > > > >
> > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > That seems quite a big change and I don't think it
> > > > > > > > > > > > is acceptable for most users.
> > > > > > > > > > > > From my perspective adding/installing call-backs to
> > > > > > > > > > > > the dev has to be opaque to the data-plane code.
> > > > > > > > > > > > Also note that different callbacks can be installed
> > > > > > > > > > > > by different entities (libs) and might have no idea
> > > > > > > > > > > > about each
> > > > other.
> > > > > > > > > > > > That's why I thought it would be better to make all
> > > > > > > > > > > > this RCU stuff internal inside cryptodev:
> > > > > > > > > > > >     hide all this rcu_qsbr allocation/setup inside
> > > > > > > > > > > > cryptod somehow to
> > > > > > > > > > obtain pointer to that rcu_qsbr ev init/queue setup
> > > > > > > > > > > >     invoke rcu_qsbr_online()/rcu_qsbr_offline()
> > > > > > > > > > > > inside
> > > > > > > > > > cryptodev_enqueue().
> > > > > > > This will bring in the application related information such as
> > > > > > > the thread ID
> > > > > > into the library.
> > > > > >
> > > > > > I don't think it would.
> > > > > > Cryptodev enqueue/dequeue functions are not supposed to be
> > > > > > thread safe (same as rx/tx burst).
> > > > > > So we can always use RCU with just one thread(thread_id = 0).
> > > > > Agree, the memory that needs to be freed is accessed by a single
> > > > > thread
> > > > on the data plane. RCU with one thread would suffice.
> > > > >
> > > > > > But as I said above - if you feel RCU lib is an overhead here,
> > > > > > that's fine - I think it would be easy enough to do without librte_rcu.
> > > > > >
> > > > > > > If the same API calls are being made from multiple data plane
> > > > > > > threads, you need a way to configure that information to the
> > > > > > > library. So, it is better to leave those details for the application to
> > handle.
> > > > > > >
> > > > > > > > > > > I have already tried exploring above stuffs. There are
> > > > > > > > > > > too many
> > > > > > > > constraints.
> > > > > > > > > > > The changes don't fit in, as per RCU design.
> > > > > > > > > >
> > > > > > > > > > Hmm could you be more specific here - what constraints
> > > > > > > > > > are you referring to?
> > > > > > > > > >
> > > > > > > > > > > Moreover, having rcu api under enqueue_burst() will
> > > > > > > > > > > affect the
> > > > > > > > > > performance.
> > > > > > > > > >
> > > > > > > > > > It most likely will. Though my expectation it will
> > > > > > > > > > affect performance only when some callbacks are
> > > > > > > > > > installed. My thought
> > > > > > here:
> > > > > > > > > > callback function by itself will affect cryptdev_enqueue
> > > > > > > > > > performance anyway,
> > > > > > > > > With existing callback design, I have measured the
> > > > > > > > > performance(with
> > > > > > > > crypto perf test) on xeon.
> > > > > > > > > It was almost negligible and same was shared with Declan.
> > > > > > > >
> > > > > > > > I am asking about different thing: did you try alternate
> > > > > > > > approach I described, that wouldn't require changes in the
> > > > > > > > user data-
> > > > plane code.
> > > > > > > >
> > > > > > > > > That is one of the reasons, I didn't want to add to many
> > > > > > > > > stuffs in to the
> > > > > > > > callback.
> > > > > > > > > The best part of existing design is crypto lib is not much modified.
> > > > > > > > > The changes are either pushed to CP or DP.
> > > > > > > > >
> > > > > > > > > so adding extra overhead for sync is probably ok here.
> > > > > > > >
> > > > > > > > I think that extra overhead when callbacks are present is
> > > > > > > > expected and probably acceptable.
> > > > > > > > Changes in the upper-layer data-plane code - probably not.
> > > > > > > >
> > > > > > > > > > Though for situation when no callbacks are installed -
> > > > > > > > > > perfomance should be left unaffected (or impact should
> > > > > > > > > > be as small
> > > > > > as possible).
> > > > > > > > > >
> > > > > > > > > > > The changes are more on control plane side, which is one
> > time.
> > > > > > > > > > > The data plane changes are minimal.
> > > > > > > > > >
> > > > > > > > > > I still think upper layer data-plane code should stay
> > > > > > > > > > unaffected (zero changes).
> > > > > > > > > >
> > > > > > > > > > > >
> > > > >
> > > > > <snip>
Abhinandan Gujjar Oct. 9, 2020, 4:51 p.m. UTC | #18
Hi Konstantin,


> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Friday, October 9, 2020 8:10 PM
> To: Gujjar, Abhinandan S <abhinandan.gujjar@intel.com>; Honnappa
> Nagarahalli <Honnappa.Nagarahalli@arm.com>; dev@dpdk.org; Doherty,
> Declan <declan.doherty@intel.com>
> Cc: jerinj@marvell.com; Akhil.goyal@nxp.com; Vangati, Narender
> <narender.vangati@intel.com>; nd <nd@arm.com>; nd <nd@arm.com>
> Subject: RE: [dpdk-dev] [v2 1/2] cryptodev: support enqueue callback functions
> 
> Hi Abhinandan,
> 
> >
> > Hi Konstantin & Honnappa,
> >
> > Thanks for all the inputs and feedback.
> >
> > @Ananyev, Konstantin,
> > I have measured the perf with and without callback on xeon. Here are the
> numbers:
> >
> > ./app/dpdk-test-crypto-perf -l 6-7
> > --vdev="crypto_openssl0,socket_id=0,max_nb_sessions=128" -- --ptest
> > throughput --devtype crypto_openssl --optype cipher-then-auth
> > --cipher-algo aes-cbc --cipher-op encrypt --cipher-key-sz 16
> > --auth-algo sha1-hmac --auth-op generate --auth-key-sz 64 --digest-sz
> > 12 --total-ops 10000000 --burst-sz 32 --buffer-sz 64
> >
> > With callback(+ RCU - totally opaque to data-plane threads)
> >     lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq  Failed
> Deq        MOps        Gbps  Cycles/Buf
> >            7          64          32                10000000    10000000           0           0
> 0.8129      0.4162     2694.09
> >            7          64          32                10000000    10000000           0           0
> 0.8152      0.4174     2686.31
> >            7          64          32                10000000    10000000           0           0
> 0.8198      0.4197     2671.48
> >
> > Without callback:
> >     lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq  Failed
> Deq        MOps        Gbps  Cycles/Buf
> >
> >            7          64          32               10000000    10000000           0           0
> 0.8234      0.4216     2659.81
> >            7          64          32               10000000    10000000           0           0
> 0.8247      0.4222     2655.63
> >            7          64          32               10000000    10000000           0           0
> 0.8123      0.4159     2695.90
> 
> 
> Just to cofirm:
> You implemented crypto enqueue callbacks using RCU QSBR online /offline (as
> suggested below) and numbers above are for:
> 1) callback code in place and some dummy callback installed
That's right. (+ RCU calling online + offline APIs inside rte_cryptodev_enqueue_burst())
> 2) callback code in place but no callbacks installed
No callback code. i.e. Original code.
> 
> Is my understanding correct here?
> Thanks
> Konstantin
> 
> 
> >
> > Regards
> > Abhinandan
> >
> > > -----Original Message-----
> > > From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > > Sent: Tuesday, September 29, 2020 2:33 PM
> > > To: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gujjar,
> > > Abhinandan S <abhinandan.gujjar@intel.com>; dev@dpdk.org; Doherty,
> > > Declan <declan.doherty@intel.com>
> > > Cc: jerinj@marvell.com; Akhil.goyal@nxp.com; Vangati, Narender
> > > <narender.vangati@intel.com>; nd <nd@arm.com>; nd <nd@arm.com>
> > > Subject: RE: [dpdk-dev] [v2 1/2] cryptodev: support enqueue callback
> > > functions
> > >
> > >
> > > >
> > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS int
> > > > > > > > > > > > > > +rte_cryptodev_rcu_qsbr_add(uint8_t dev_id,
> > > > > > > > > > > > > > +struct rte_rcu_qsbr
> > > > > > > > > > > > > > +*qsbr) {
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +	struct rte_cryptodev *dev;
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > > > > > > > > > > +		CDEV_LOG_ERR("Invalid dev_id=%"
> PRIu8,
> > > > > dev_id);
> > > > > > > > > > > > > > +		return -EINVAL;
> > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > > > > > > > > > > +	dev->qsbr = qsbr;
> > > > > > > > > > > > > > +	return 0;
> > > > > > > > > > > > > > +}
> > > > > > > > > > > > >
> > > > > > > > > > > > > So if I understand your patch correctly you
> > > > > > > > > > > > > propose a new working model for
> > > > > > > > > > > > > crypto-devs:
> > > > > > > > > > > > > 1. Control-plane has to allocate/setup rcu_qsbr
> > > > > > > > > > > > > and do rte_cryptodev_rcu_qsbr_add().
> > > > > > > > > > > > > 2. Data-plane has somehow to obtain pointer to
> > > > > > > > > > > > > that rcu_qsbr and wrap
> > > > > > > > > > > > > cryptodev_enqueue()
> > > > > > > > > > > > >    with rcu_qsbr_quiescent()  or
> > > > > > > rcu_qsbr_online()/rcu_qsbr_offline().
> > > > > > > > > > > > Yes. I think, it is not a new model. It is same as
> > > > > > > > > > > > RCU integration with
> > > > > > > > > LPM.
> > > > > > > > > > > > Please refer:
> > > > > > > > > > > > https://patches.dpdk.org/cover/73673/
> > > > > > > > > > >
> > > > > > > > > > > I am talking about new working model for crypto-dev
> > > > > > > enqueue/dequeue.
> > > > > > > > > > > As I said above now it becomes data-plane thread
> > > > > > > > > > > responsibility
> > > to:
> > > > > > > > > > >  -somehow to obtain pointer to that rcu_qsbr for
> > > > > > > > > > > each cryptodev it is
> > > > > > > > > using.
> > > > > > > > > > >  -call rcu sync functions (quiescent/online/offline)
> > > > > > > > > > > on a regular
> > > > > basis.
> > > > > > > > > > It is not on regular basis. When data plane comes up,
> > > > > > > > > > they report
> > > > > online.
> > > > > > > > > > They report quiescent when they are done with critical
> > > > > > > > > > section or shared
> > > > > > > > > structure.
> > > > > > > > >
> > > > > > > > > I understand that, but it means all existing apps have
> > > > > > > > > to be changed that
> > > > > > > way.
> > > > > > > > >
> > > > > > > > > > All though, there is some dataplane changes involved
> > > > > > > > > > here, I don't think, it
> > > > > > > > > is major.
> > > > > > > > >
> > > > > > > > > I still think our goal here should be to make no visible
> > > > > > > > > changes to the dataplane.
> > > > > > > > > I.E. all necessary data-plane changes need to be hidden
> > > > > > > > > inside CB invocation part.
> > > > > > > > Please note that this is being implemented using the
> > > > > > > > memory reclamation framework documented at
> > > > > > > > https://doc.dpdk.org/guides/prog_guide/rcu_lib.html#resour
> > > > > > > > ce-r
> > > > > > > > ecla
> > > > > > > > mati
> > > > > > > > on-framework-for-dpdk
> > > > > > > >
> > > > > > > > While using RCU there are couple of trade-offs that
> > > > > > > > applications have to
> > > > > > > consider:
> > > > > > > > 1) Performance - reporting the quiescent state too often
> > > > > > > > results in performance impact on data plane
> > > > > > > > 2) Amount of outstanding memory to reclaim - reporting
> > > > > > > > less often results in more outstanding memory to reclaim
> > > > > > > >
> > > > > > > > Hence, the quiescent state reporting is left to the application.
> > > > > > > > The application decides how often it reports the quiescent
> > > > > > > > state and has control
> > > > > > > over the data plane performance and the outstanding memory
> > > > > > > to
> > > reclaim.
> > > > > > > >
> > > > > > > > When you say "new working model for crypto-dev
> > > > > > > > enqueue/dequeue",
> > > > > > > >
> > > > > > > > 1) are you comparing these with existing crypto-dev
> > > > > > > > enqueue/dequeue
> > > > > > > APIs? If yes, these are new APIs, it is not breaking anything.
> > > > > > > > 2) are you comparing these with existing call back
> > > > > > > > functions in ethdev enqueue/dequeue APIs? If yes, agree
> > > > > > > > that this is a new model. But, it is
> > > > > > > possible to support what ethdev supports along with the RCU
> > > > > > > method used in this patch.
> > > > > > >
> > > > > > > What I am talking about:
> > > > > > >
> > > > > > > Existing cryptodev enqueue/dequeue model doesn't require for
> > > > > > > the user to manage any RCU QSBR state manually.
> > > > > > > I believe that addition of ability to add/remove
> > > > > > > enqueue/dequeue callbacks shouldn't change existing working
> model.
> > > > > > > I think that adding/removing such callbacks has to be opaque
> > > > > > > to the user DP code and shouldn't require user to change it.
> > > > > > > Same as we have now for ethdev callback implementation.
> > > > > > The ethdev callback implementation conveniently leaves the
> > > > > > problem of
> > > > > freeing memory to the user to resolve, it does not handle the issue.
> > > > > > Hence, it "looks" to be opaque to the DP code. However, if the
> > > > > > application has to implement a safe way to free the call back
> > > > > > memory, its
> > > > > DP is affected based on call backs are being used or not.
> > > > >
> > > > > Yes, I think that's big drawback in initial ethdev callback
> > > > > implementation - it simply ignores DP/CP sync problem completely.
> > > > > Though I think it is possible to have both here:
> > > > >  keep callback "opaque" to DP code and provide some sync
> > > > > mechanism between DP/CP.
> > > > > Hopefully one day we can fix ethdev callbacks too.
> > > > The solution we develop can be used in ethdev too.
> > > >
> > > > >
> > > > > > > I think that forcing DP code to be aware that callbacks are
> > > > > > > present or not and to modify its behaviour depending on that
> > > > > > > nearly voids the purpose of having callbacks at all.
> > > > > > > In that case DP can just invoke callback function directly
> > > > > > > from it's
> > > > > codepath .
> > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > Note that now data-plane thread would have to do
> > > > > > > > > > > that always
> > > > > > > > > > > - even if there are now callbacks installed for that
> > > > > > > > > > > cryptodev queue
> > > > > > > right now.
> > > > > > > > > > > All that changes behaviour of existing apps and I
> > > > > > > > > > > presume would reduce adoption of  that fature.
> > > > > > > > If I understand this correct, you are talking about a case
> > > > > > > > where in the application might be
> > > > > > > > registering/unregistering multiple times during its
> > > > > > > > lifetime. In this case, yes, the application might be
> > > > > > > > reporting the
> > > > > > > quiescent state even when it has not registered the call backs.
> > > > > > > But, it has the flexibility to not report it if it implements additional
> logic.
> > > > > > > > Note that we are assuming that the application has to
> > > > > > > > report quiescent state only for using callback functions.
> > > > > > > > Most probably the application has
> > > > > > > other requirements to use RCU.
> > > > > > > > Why not support what is done for ethdev call back
> > > > > > > > functions along with
> > > > > > > providing RCU method?
> > > > > > > >
> > > > > > > > > > There is always trade off involved!
> > > > > > > > > > In the previous patch, you suggested that some lazy
> > > > > > > > > > app may not free up the memory allocated by add cb.
> > > > > > > > > > For such apps, this patch has sync mechanism with some
> > > > > > > > > > additional cost of CP & DP
> > > > > changes.
> > > > > > > > >
> > > > > > > > > Sigh, it is not about laziness of the app.
> > > > > > > > > The problem with current ethedev cb mechanism and yours
> > > > > > > > > V1 (which was just a clone of it) - CP doesn't know when
> > > > > > > > > it is safe after CB removal to free related memory.
> > > > > > > > >
> > > > > > > > > > > I still think all this callback mechanism should be
> > > > > > > > > > > totally opaque to data-plane threads - user
> > > > > > > > > > > shouldn't change his app code depending on would
> > > > > > > > > > > some enqueue/dequeue callbacks be
> > > > > > > installed or not.
> > > > > > > > > > I am not sure, how that can be implemented with
> > > > > > > > > > existing RCU
> > > > > design.
> > > > > > > > >
> > > > > > > > > As I said below the simplest way - with calling rcu
> > > > > > > > > onine/offline inside CB invocation block.
> > > > > > > > > That's why I asked you - did you try that approach and
> > > > > > > > > what is the perf numbers?
> > > > > > > > > I presume with no callbacks installed the perf change
> > > > > > > > > should be nearly
> > > > > > > zero.
> > > > > > > > >
> > > > > > > > > > @Honnappa Nagarahalli, Do you have any suggestions?
> > > > > > > > Reporting quiescent state in the call back functions has
> > > > > > > > several
> > > > > > > disadvantages:
> > > > > > > > 1) it will have performance impacts and the impacts will
> > > > > > > > increase as the
> > > > > > > number of data plane threads increase.
> > > > > > > > 2) It will require additional configuration parameters to
> > > > > > > > control how often the quiescent state is reported to
> > > > > > > > control the performance
> > > > > impact.
> > > > > > > > 3) Does not take advantage of the fact that most probably
> > > > > > > > the application is using RCU already
> > > > > > > > 4) There are few difficulties as well, please see below.
> > > > > > >
> > > > > > > I suggested Abhinandan to use RCU library because it is
> > > > > > > already there, and I thought it would be good not to re-implement
> the wheel.
> > > > > > > Though if you feel librte_rcu doesn't match that task -
> > > > > > > fine, let's do it without librte_rcu.
> > > > > > > After all, what we need here - just an atomic ref count per
> > > > > > > queue that we are going to increment at entering and leaving
> > > > > > > list of callbacks inside enqueue/dequeue.
> > > > > > Ok, looks like I missed the point that a queue is used by a
> > > > > > single data plane
> > > > > thread.
> > > > > > Along with ref count increment you need the memory orderings
> > > > > > to avoid
> > > > > race conditions. These will be the same ones used in RCU.
> > > > > > On the control plane, you need to read this counter and poll
> > > > > > for the
> > > > > counter updates. All this is same cost as in RCU.
> > > > >
> > > > > Agree.
> > > > >
> > > > > > To control the cost, you
> > > > > > will have to control the rate of quiescent state reporting and
> > > > > > might have to
> > > > > expose this as a configuration parameter.
> > > > > >
> > > > > > The other important information you have to consider is if the
> > > > > > thread is making any blocking calls, which may be in some
> > > > > > other library. The thread is supposed to call
> > > > > > rcu_qsbr_thread_offline API before calling a
> > > > > blocking call. This allows the RCU to know that this particular
> > > > > thread will not report quiescent state. The cryptodev library
> > > > > might not have
> > > that information.
> > > > > >
> > > > > > If you want to go ahead with this design, you can still use
> > > > > > RCU with single thread configuration (like you have mentioned
> > > > > > below) and hide the
> > > > > details from the application.
> > > > >
> > > > > Yes,  same thought here -  use rcu_qsbr online/offline for DP
> > > > > part and hide actual sync details inside callback code.
> > > > We can give it a try. If we can have the performance numbers, we
> > > > can decide about how to control how often these APIs are called on
> > > > the data
> > > plane.
> > >
> > > To avoid misunderstanding: I am talking about calling online/offline
> > > with every
> > > cryptodev_enqueue() traversal over CB list.
> > >
> > > >
> > > > >
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > That seems quite a big change and I don't think
> > > > > > > > > > > > > it is acceptable for most users.
> > > > > > > > > > > > > From my perspective adding/installing call-backs
> > > > > > > > > > > > > to the dev has to be opaque to the data-plane code.
> > > > > > > > > > > > > Also note that different callbacks can be
> > > > > > > > > > > > > installed by different entities (libs) and might
> > > > > > > > > > > > > have no idea about each
> > > > > other.
> > > > > > > > > > > > > That's why I thought it would be better to make
> > > > > > > > > > > > > all this RCU stuff internal inside cryptodev:
> > > > > > > > > > > > >     hide all this rcu_qsbr allocation/setup
> > > > > > > > > > > > > inside cryptod somehow to
> > > > > > > > > > > obtain pointer to that rcu_qsbr ev init/queue setup
> > > > > > > > > > > > >     invoke rcu_qsbr_online()/rcu_qsbr_offline()
> > > > > > > > > > > > > inside
> > > > > > > > > > > cryptodev_enqueue().
> > > > > > > > This will bring in the application related information
> > > > > > > > such as the thread ID
> > > > > > > into the library.
> > > > > > >
> > > > > > > I don't think it would.
> > > > > > > Cryptodev enqueue/dequeue functions are not supposed to be
> > > > > > > thread safe (same as rx/tx burst).
> > > > > > > So we can always use RCU with just one thread(thread_id = 0).
> > > > > > Agree, the memory that needs to be freed is accessed by a
> > > > > > single thread
> > > > > on the data plane. RCU with one thread would suffice.
> > > > > >
> > > > > > > But as I said above - if you feel RCU lib is an overhead
> > > > > > > here, that's fine - I think it would be easy enough to do without
> librte_rcu.
> > > > > > >
> > > > > > > > If the same API calls are being made from multiple data
> > > > > > > > plane threads, you need a way to configure that
> > > > > > > > information to the library. So, it is better to leave
> > > > > > > > those details for the application to
> > > handle.
> > > > > > > >
> > > > > > > > > > > > I have already tried exploring above stuffs. There
> > > > > > > > > > > > are too many
> > > > > > > > > constraints.
> > > > > > > > > > > > The changes don't fit in, as per RCU design.
> > > > > > > > > > >
> > > > > > > > > > > Hmm could you be more specific here - what
> > > > > > > > > > > constraints are you referring to?
> > > > > > > > > > >
> > > > > > > > > > > > Moreover, having rcu api under enqueue_burst()
> > > > > > > > > > > > will affect the
> > > > > > > > > > > performance.
> > > > > > > > > > >
> > > > > > > > > > > It most likely will. Though my expectation it will
> > > > > > > > > > > affect performance only when some callbacks are
> > > > > > > > > > > installed. My thought
> > > > > > > here:
> > > > > > > > > > > callback function by itself will affect
> > > > > > > > > > > cryptdev_enqueue performance anyway,
> > > > > > > > > > With existing callback design, I have measured the
> > > > > > > > > > performance(with
> > > > > > > > > crypto perf test) on xeon.
> > > > > > > > > > It was almost negligible and same was shared with Declan.
> > > > > > > > >
> > > > > > > > > I am asking about different thing: did you try alternate
> > > > > > > > > approach I described, that wouldn't require changes in
> > > > > > > > > the user data-
> > > > > plane code.
> > > > > > > > >
> > > > > > > > > > That is one of the reasons, I didn't want to add to
> > > > > > > > > > many stuffs in to the
> > > > > > > > > callback.
> > > > > > > > > > The best part of existing design is crypto lib is not much
> modified.
> > > > > > > > > > The changes are either pushed to CP or DP.
> > > > > > > > > >
> > > > > > > > > > so adding extra overhead for sync is probably ok here.
> > > > > > > > >
> > > > > > > > > I think that extra overhead when callbacks are present
> > > > > > > > > is expected and probably acceptable.
> > > > > > > > > Changes in the upper-layer data-plane code - probably not.
> > > > > > > > >
> > > > > > > > > > > Though for situation when no callbacks are installed
> > > > > > > > > > > - perfomance should be left unaffected (or impact
> > > > > > > > > > > should be as small
> > > > > > > as possible).
> > > > > > > > > > >
> > > > > > > > > > > > The changes are more on control plane side, which
> > > > > > > > > > > > is one
> > > time.
> > > > > > > > > > > > The data plane changes are minimal.
> > > > > > > > > > >
> > > > > > > > > > > I still think upper layer data-plane code should
> > > > > > > > > > > stay unaffected (zero changes).
> > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > >
> > > > > > <snip>
Ananyev, Konstantin Oct. 11, 2020, 11:02 p.m. UTC | #19
Hi Abhinandan,

> 
> Hi Konstantin,
> 
> 
> > -----Original Message-----
> > From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > Sent: Friday, October 9, 2020 8:10 PM
> > To: Gujjar, Abhinandan S <abhinandan.gujjar@intel.com>; Honnappa
> > Nagarahalli <Honnappa.Nagarahalli@arm.com>; dev@dpdk.org; Doherty,
> > Declan <declan.doherty@intel.com>
> > Cc: jerinj@marvell.com; Akhil.goyal@nxp.com; Vangati, Narender
> > <narender.vangati@intel.com>; nd <nd@arm.com>; nd <nd@arm.com>
> > Subject: RE: [dpdk-dev] [v2 1/2] cryptodev: support enqueue callback functions
> >
> > Hi Abhinandan,
> >
> > >
> > > Hi Konstantin & Honnappa,
> > >
> > > Thanks for all the inputs and feedback.
> > >
> > > @Ananyev, Konstantin,
> > > I have measured the perf with and without callback on xeon. Here are the
> > numbers:
> > >
> > > ./app/dpdk-test-crypto-perf -l 6-7
> > > --vdev="crypto_openssl0,socket_id=0,max_nb_sessions=128" -- --ptest
> > > throughput --devtype crypto_openssl --optype cipher-then-auth
> > > --cipher-algo aes-cbc --cipher-op encrypt --cipher-key-sz 16
> > > --auth-algo sha1-hmac --auth-op generate --auth-key-sz 64 --digest-sz
> > > 12 --total-ops 10000000 --burst-sz 32 --buffer-sz 64
> > >
> > > With callback(+ RCU - totally opaque to data-plane threads)
> > >     lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq  Failed
> > Deq        MOps        Gbps  Cycles/Buf
> > >            7          64          32                10000000    10000000           0           0
> > 0.8129      0.4162     2694.09
> > >            7          64          32                10000000    10000000           0           0
> > 0.8152      0.4174     2686.31
> > >            7          64          32                10000000    10000000           0           0
> > 0.8198      0.4197     2671.48
> > >
> > > Without callback:
> > >     lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq  Failed
> > Deq        MOps        Gbps  Cycles/Buf
> > >
> > >            7          64          32               10000000    10000000           0           0
> > 0.8234      0.4216     2659.81
> > >            7          64          32               10000000    10000000           0           0
> > 0.8247      0.4222     2655.63
> > >            7          64          32               10000000    10000000           0           0
> > 0.8123      0.4159     2695.90
> >
> >
> > Just to cofirm:
> > You implemented crypto enqueue callbacks using RCU QSBR online /offline (as
> > suggested below) and numbers above are for:
> > 1) callback code in place and some dummy callback installed
> That's right. (+ RCU calling online + offline APIs inside rte_cryptodev_enqueue_burst())
> > 2) callback code in place but no callbacks installed
> No callback code. i.e. Original code.

Ok, and if I get things right - difference between mean values is ~15 cycles?
That's seems like very good result to me.
Can I suggest to run one more test for your new callback code in place,
but no actual callbacks installed?
Thanks
Konstantin

> >
> > Is my understanding correct here?
> > Thanks
> > Konstantin
> >
> >
> > >
> > > Regards
> > > Abhinandan
> > >
> > > > -----Original Message-----
> > > > From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > > > Sent: Tuesday, September 29, 2020 2:33 PM
> > > > To: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gujjar,
> > > > Abhinandan S <abhinandan.gujjar@intel.com>; dev@dpdk.org; Doherty,
> > > > Declan <declan.doherty@intel.com>
> > > > Cc: jerinj@marvell.com; Akhil.goyal@nxp.com; Vangati, Narender
> > > > <narender.vangati@intel.com>; nd <nd@arm.com>; nd <nd@arm.com>
> > > > Subject: RE: [dpdk-dev] [v2 1/2] cryptodev: support enqueue callback
> > > > functions
> > > >
> > > >
> > > > >
> > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS int
> > > > > > > > > > > > > > > +rte_cryptodev_rcu_qsbr_add(uint8_t dev_id,
> > > > > > > > > > > > > > > +struct rte_rcu_qsbr
> > > > > > > > > > > > > > > +*qsbr) {
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +	struct rte_cryptodev *dev;
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > > > > > > > > > > > +		CDEV_LOG_ERR("Invalid dev_id=%"
> > PRIu8,
> > > > > > dev_id);
> > > > > > > > > > > > > > > +		return -EINVAL;
> > > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > > > > > > > > > > > +	dev->qsbr = qsbr;
> > > > > > > > > > > > > > > +	return 0;
> > > > > > > > > > > > > > > +}
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > So if I understand your patch correctly you
> > > > > > > > > > > > > > propose a new working model for
> > > > > > > > > > > > > > crypto-devs:
> > > > > > > > > > > > > > 1. Control-plane has to allocate/setup rcu_qsbr
> > > > > > > > > > > > > > and do rte_cryptodev_rcu_qsbr_add().
> > > > > > > > > > > > > > 2. Data-plane has somehow to obtain pointer to
> > > > > > > > > > > > > > that rcu_qsbr and wrap
> > > > > > > > > > > > > > cryptodev_enqueue()
> > > > > > > > > > > > > >    with rcu_qsbr_quiescent()  or
> > > > > > > > rcu_qsbr_online()/rcu_qsbr_offline().
> > > > > > > > > > > > > Yes. I think, it is not a new model. It is same as
> > > > > > > > > > > > > RCU integration with
> > > > > > > > > > LPM.
> > > > > > > > > > > > > Please refer:
> > > > > > > > > > > > > https://patches.dpdk.org/cover/73673/
> > > > > > > > > > > >
> > > > > > > > > > > > I am talking about new working model for crypto-dev
> > > > > > > > enqueue/dequeue.
> > > > > > > > > > > > As I said above now it becomes data-plane thread
> > > > > > > > > > > > responsibility
> > > > to:
> > > > > > > > > > > >  -somehow to obtain pointer to that rcu_qsbr for
> > > > > > > > > > > > each cryptodev it is
> > > > > > > > > > using.
> > > > > > > > > > > >  -call rcu sync functions (quiescent/online/offline)
> > > > > > > > > > > > on a regular
> > > > > > basis.
> > > > > > > > > > > It is not on regular basis. When data plane comes up,
> > > > > > > > > > > they report
> > > > > > online.
> > > > > > > > > > > They report quiescent when they are done with critical
> > > > > > > > > > > section or shared
> > > > > > > > > > structure.
> > > > > > > > > >
> > > > > > > > > > I understand that, but it means all existing apps have
> > > > > > > > > > to be changed that
> > > > > > > > way.
> > > > > > > > > >
> > > > > > > > > > > All though, there is some dataplane changes involved
> > > > > > > > > > > here, I don't think, it
> > > > > > > > > > is major.
> > > > > > > > > >
> > > > > > > > > > I still think our goal here should be to make no visible
> > > > > > > > > > changes to the dataplane.
> > > > > > > > > > I.E. all necessary data-plane changes need to be hidden
> > > > > > > > > > inside CB invocation part.
> > > > > > > > > Please note that this is being implemented using the
> > > > > > > > > memory reclamation framework documented at
> > > > > > > > > https://doc.dpdk.org/guides/prog_guide/rcu_lib.html#resour
> > > > > > > > > ce-r
> > > > > > > > > ecla
> > > > > > > > > mati
> > > > > > > > > on-framework-for-dpdk
> > > > > > > > >
> > > > > > > > > While using RCU there are couple of trade-offs that
> > > > > > > > > applications have to
> > > > > > > > consider:
> > > > > > > > > 1) Performance - reporting the quiescent state too often
> > > > > > > > > results in performance impact on data plane
> > > > > > > > > 2) Amount of outstanding memory to reclaim - reporting
> > > > > > > > > less often results in more outstanding memory to reclaim
> > > > > > > > >
> > > > > > > > > Hence, the quiescent state reporting is left to the application.
> > > > > > > > > The application decides how often it reports the quiescent
> > > > > > > > > state and has control
> > > > > > > > over the data plane performance and the outstanding memory
> > > > > > > > to
> > > > reclaim.
> > > > > > > > >
> > > > > > > > > When you say "new working model for crypto-dev
> > > > > > > > > enqueue/dequeue",
> > > > > > > > >
> > > > > > > > > 1) are you comparing these with existing crypto-dev
> > > > > > > > > enqueue/dequeue
> > > > > > > > APIs? If yes, these are new APIs, it is not breaking anything.
> > > > > > > > > 2) are you comparing these with existing call back
> > > > > > > > > functions in ethdev enqueue/dequeue APIs? If yes, agree
> > > > > > > > > that this is a new model. But, it is
> > > > > > > > possible to support what ethdev supports along with the RCU
> > > > > > > > method used in this patch.
> > > > > > > >
> > > > > > > > What I am talking about:
> > > > > > > >
> > > > > > > > Existing cryptodev enqueue/dequeue model doesn't require for
> > > > > > > > the user to manage any RCU QSBR state manually.
> > > > > > > > I believe that addition of ability to add/remove
> > > > > > > > enqueue/dequeue callbacks shouldn't change existing working
> > model.
> > > > > > > > I think that adding/removing such callbacks has to be opaque
> > > > > > > > to the user DP code and shouldn't require user to change it.
> > > > > > > > Same as we have now for ethdev callback implementation.
> > > > > > > The ethdev callback implementation conveniently leaves the
> > > > > > > problem of
> > > > > > freeing memory to the user to resolve, it does not handle the issue.
> > > > > > > Hence, it "looks" to be opaque to the DP code. However, if the
> > > > > > > application has to implement a safe way to free the call back
> > > > > > > memory, its
> > > > > > DP is affected based on call backs are being used or not.
> > > > > >
> > > > > > Yes, I think that's big drawback in initial ethdev callback
> > > > > > implementation - it simply ignores DP/CP sync problem completely.
> > > > > > Though I think it is possible to have both here:
> > > > > >  keep callback "opaque" to DP code and provide some sync
> > > > > > mechanism between DP/CP.
> > > > > > Hopefully one day we can fix ethdev callbacks too.
> > > > > The solution we develop can be used in ethdev too.
> > > > >
> > > > > >
> > > > > > > > I think that forcing DP code to be aware that callbacks are
> > > > > > > > present or not and to modify its behaviour depending on that
> > > > > > > > nearly voids the purpose of having callbacks at all.
> > > > > > > > In that case DP can just invoke callback function directly
> > > > > > > > from it's
> > > > > > codepath .
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > Note that now data-plane thread would have to do
> > > > > > > > > > > > that always
> > > > > > > > > > > > - even if there are now callbacks installed for that
> > > > > > > > > > > > cryptodev queue
> > > > > > > > right now.
> > > > > > > > > > > > All that changes behaviour of existing apps and I
> > > > > > > > > > > > presume would reduce adoption of  that fature.
> > > > > > > > > If I understand this correct, you are talking about a case
> > > > > > > > > where in the application might be
> > > > > > > > > registering/unregistering multiple times during its
> > > > > > > > > lifetime. In this case, yes, the application might be
> > > > > > > > > reporting the
> > > > > > > > quiescent state even when it has not registered the call backs.
> > > > > > > > But, it has the flexibility to not report it if it implements additional
> > logic.
> > > > > > > > > Note that we are assuming that the application has to
> > > > > > > > > report quiescent state only for using callback functions.
> > > > > > > > > Most probably the application has
> > > > > > > > other requirements to use RCU.
> > > > > > > > > Why not support what is done for ethdev call back
> > > > > > > > > functions along with
> > > > > > > > providing RCU method?
> > > > > > > > >
> > > > > > > > > > > There is always trade off involved!
> > > > > > > > > > > In the previous patch, you suggested that some lazy
> > > > > > > > > > > app may not free up the memory allocated by add cb.
> > > > > > > > > > > For such apps, this patch has sync mechanism with some
> > > > > > > > > > > additional cost of CP & DP
> > > > > > changes.
> > > > > > > > > >
> > > > > > > > > > Sigh, it is not about laziness of the app.
> > > > > > > > > > The problem with current ethedev cb mechanism and yours
> > > > > > > > > > V1 (which was just a clone of it) - CP doesn't know when
> > > > > > > > > > it is safe after CB removal to free related memory.
> > > > > > > > > >
> > > > > > > > > > > > I still think all this callback mechanism should be
> > > > > > > > > > > > totally opaque to data-plane threads - user
> > > > > > > > > > > > shouldn't change his app code depending on would
> > > > > > > > > > > > some enqueue/dequeue callbacks be
> > > > > > > > installed or not.
> > > > > > > > > > > I am not sure, how that can be implemented with
> > > > > > > > > > > existing RCU
> > > > > > design.
> > > > > > > > > >
> > > > > > > > > > As I said below the simplest way - with calling rcu
> > > > > > > > > > onine/offline inside CB invocation block.
> > > > > > > > > > That's why I asked you - did you try that approach and
> > > > > > > > > > what is the perf numbers?
> > > > > > > > > > I presume with no callbacks installed the perf change
> > > > > > > > > > should be nearly
> > > > > > > > zero.
> > > > > > > > > >
> > > > > > > > > > > @Honnappa Nagarahalli, Do you have any suggestions?
> > > > > > > > > Reporting quiescent state in the call back functions has
> > > > > > > > > several
> > > > > > > > disadvantages:
> > > > > > > > > 1) it will have performance impacts and the impacts will
> > > > > > > > > increase as the
> > > > > > > > number of data plane threads increase.
> > > > > > > > > 2) It will require additional configuration parameters to
> > > > > > > > > control how often the quiescent state is reported to
> > > > > > > > > control the performance
> > > > > > impact.
> > > > > > > > > 3) Does not take advantage of the fact that most probably
> > > > > > > > > the application is using RCU already
> > > > > > > > > 4) There are few difficulties as well, please see below.
> > > > > > > >
> > > > > > > > I suggested Abhinandan to use RCU library because it is
> > > > > > > > already there, and I thought it would be good not to re-implement
> > the wheel.
> > > > > > > > Though if you feel librte_rcu doesn't match that task -
> > > > > > > > fine, let's do it without librte_rcu.
> > > > > > > > After all, what we need here - just an atomic ref count per
> > > > > > > > queue that we are going to increment at entering and leaving
> > > > > > > > list of callbacks inside enqueue/dequeue.
> > > > > > > Ok, looks like I missed the point that a queue is used by a
> > > > > > > single data plane
> > > > > > thread.
> > > > > > > Along with ref count increment you need the memory orderings
> > > > > > > to avoid
> > > > > > race conditions. These will be the same ones used in RCU.
> > > > > > > On the control plane, you need to read this counter and poll
> > > > > > > for the
> > > > > > counter updates. All this is same cost as in RCU.
> > > > > >
> > > > > > Agree.
> > > > > >
> > > > > > > To control the cost, you
> > > > > > > will have to control the rate of quiescent state reporting and
> > > > > > > might have to
> > > > > > expose this as a configuration parameter.
> > > > > > >
> > > > > > > The other important information you have to consider is if the
> > > > > > > thread is making any blocking calls, which may be in some
> > > > > > > other library. The thread is supposed to call
> > > > > > > rcu_qsbr_thread_offline API before calling a
> > > > > > blocking call. This allows the RCU to know that this particular
> > > > > > thread will not report quiescent state. The cryptodev library
> > > > > > might not have
> > > > that information.
> > > > > > >
> > > > > > > If you want to go ahead with this design, you can still use
> > > > > > > RCU with single thread configuration (like you have mentioned
> > > > > > > below) and hide the
> > > > > > details from the application.
> > > > > >
> > > > > > Yes,  same thought here -  use rcu_qsbr online/offline for DP
> > > > > > part and hide actual sync details inside callback code.
> > > > > We can give it a try. If we can have the performance numbers, we
> > > > > can decide about how to control how often these APIs are called on
> > > > > the data
> > > > plane.
> > > >
> > > > To avoid misunderstanding: I am talking about calling online/offline
> > > > with every
> > > > cryptodev_enqueue() traversal over CB list.
> > > >
> > > > >
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > That seems quite a big change and I don't think
> > > > > > > > > > > > > > it is acceptable for most users.
> > > > > > > > > > > > > > From my perspective adding/installing call-backs
> > > > > > > > > > > > > > to the dev has to be opaque to the data-plane code.
> > > > > > > > > > > > > > Also note that different callbacks can be
> > > > > > > > > > > > > > installed by different entities (libs) and might
> > > > > > > > > > > > > > have no idea about each
> > > > > > other.
> > > > > > > > > > > > > > That's why I thought it would be better to make
> > > > > > > > > > > > > > all this RCU stuff internal inside cryptodev:
> > > > > > > > > > > > > >     hide all this rcu_qsbr allocation/setup
> > > > > > > > > > > > > > inside cryptod somehow to
> > > > > > > > > > > > obtain pointer to that rcu_qsbr ev init/queue setup
> > > > > > > > > > > > > >     invoke rcu_qsbr_online()/rcu_qsbr_offline()
> > > > > > > > > > > > > > inside
> > > > > > > > > > > > cryptodev_enqueue().
> > > > > > > > > This will bring in the application related information
> > > > > > > > > such as the thread ID
> > > > > > > > into the library.
> > > > > > > >
> > > > > > > > I don't think it would.
> > > > > > > > Cryptodev enqueue/dequeue functions are not supposed to be
> > > > > > > > thread safe (same as rx/tx burst).
> > > > > > > > So we can always use RCU with just one thread(thread_id = 0).
> > > > > > > Agree, the memory that needs to be freed is accessed by a
> > > > > > > single thread
> > > > > > on the data plane. RCU with one thread would suffice.
> > > > > > >
> > > > > > > > But as I said above - if you feel RCU lib is an overhead
> > > > > > > > here, that's fine - I think it would be easy enough to do without
> > librte_rcu.
> > > > > > > >
> > > > > > > > > If the same API calls are being made from multiple data
> > > > > > > > > plane threads, you need a way to configure that
> > > > > > > > > information to the library. So, it is better to leave
> > > > > > > > > those details for the application to
> > > > handle.
> > > > > > > > >
> > > > > > > > > > > > > I have already tried exploring above stuffs. There
> > > > > > > > > > > > > are too many
> > > > > > > > > > constraints.
> > > > > > > > > > > > > The changes don't fit in, as per RCU design.
> > > > > > > > > > > >
> > > > > > > > > > > > Hmm could you be more specific here - what
> > > > > > > > > > > > constraints are you referring to?
> > > > > > > > > > > >
> > > > > > > > > > > > > Moreover, having rcu api under enqueue_burst()
> > > > > > > > > > > > > will affect the
> > > > > > > > > > > > performance.
> > > > > > > > > > > >
> > > > > > > > > > > > It most likely will. Though my expectation it will
> > > > > > > > > > > > affect performance only when some callbacks are
> > > > > > > > > > > > installed. My thought
> > > > > > > > here:
> > > > > > > > > > > > callback function by itself will affect
> > > > > > > > > > > > cryptdev_enqueue performance anyway,
> > > > > > > > > > > With existing callback design, I have measured the
> > > > > > > > > > > performance(with
> > > > > > > > > > crypto perf test) on xeon.
> > > > > > > > > > > It was almost negligible and same was shared with Declan.
> > > > > > > > > >
> > > > > > > > > > I am asking about different thing: did you try alternate
> > > > > > > > > > approach I described, that wouldn't require changes in
> > > > > > > > > > the user data-
> > > > > > plane code.
> > > > > > > > > >
> > > > > > > > > > > That is one of the reasons, I didn't want to add to
> > > > > > > > > > > many stuffs in to the
> > > > > > > > > > callback.
> > > > > > > > > > > The best part of existing design is crypto lib is not much
> > modified.
> > > > > > > > > > > The changes are either pushed to CP or DP.
> > > > > > > > > > >
> > > > > > > > > > > so adding extra overhead for sync is probably ok here.
> > > > > > > > > >
> > > > > > > > > > I think that extra overhead when callbacks are present
> > > > > > > > > > is expected and probably acceptable.
> > > > > > > > > > Changes in the upper-layer data-plane code - probably not.
> > > > > > > > > >
> > > > > > > > > > > > Though for situation when no callbacks are installed
> > > > > > > > > > > > - perfomance should be left unaffected (or impact
> > > > > > > > > > > > should be as small
> > > > > > > > as possible).
> > > > > > > > > > > >
> > > > > > > > > > > > > The changes are more on control plane side, which
> > > > > > > > > > > > > is one
> > > > time.
> > > > > > > > > > > > > The data plane changes are minimal.
> > > > > > > > > > > >
> > > > > > > > > > > > I still think upper layer data-plane code should
> > > > > > > > > > > > stay unaffected (zero changes).
> > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > >
> > > > > > > <snip>
Abhinandan Gujjar Oct. 12, 2020, 6:47 a.m. UTC | #20
Hi Konstantin,

> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Monday, October 12, 2020 4:33 AM
> To: Gujjar, Abhinandan S <abhinandan.gujjar@intel.com>; Honnappa
> Nagarahalli <Honnappa.Nagarahalli@arm.com>; dev@dpdk.org; Doherty,
> Declan <declan.doherty@intel.com>
> Cc: jerinj@marvell.com; Akhil.goyal@nxp.com; Vangati, Narender
> <narender.vangati@intel.com>; nd <nd@arm.com>; nd <nd@arm.com>
> Subject: RE: [dpdk-dev] [v2 1/2] cryptodev: support enqueue callback functions
> 
> 
> 
> Hi Abhinandan,
> 
> >
> > Hi Konstantin,
> >
> >
> > > -----Original Message-----
> > > From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > > Sent: Friday, October 9, 2020 8:10 PM
> > > To: Gujjar, Abhinandan S <abhinandan.gujjar@intel.com>; Honnappa
> > > Nagarahalli <Honnappa.Nagarahalli@arm.com>; dev@dpdk.org; Doherty,
> > > Declan <declan.doherty@intel.com>
> > > Cc: jerinj@marvell.com; Akhil.goyal@nxp.com; Vangati, Narender
> > > <narender.vangati@intel.com>; nd <nd@arm.com>; nd <nd@arm.com>
> > > Subject: RE: [dpdk-dev] [v2 1/2] cryptodev: support enqueue callback
> > > functions
> > >
> > > Hi Abhinandan,
> > >
> > > >
> > > > Hi Konstantin & Honnappa,
> > > >
> > > > Thanks for all the inputs and feedback.
> > > >
> > > > @Ananyev, Konstantin,
> > > > I have measured the perf with and without callback on xeon. Here
> > > > are the
> > > numbers:
> > > >
> > > > ./app/dpdk-test-crypto-perf -l 6-7
> > > > --vdev="crypto_openssl0,socket_id=0,max_nb_sessions=128" --
> > > > --ptest throughput --devtype crypto_openssl --optype
> > > > cipher-then-auth --cipher-algo aes-cbc --cipher-op encrypt
> > > > --cipher-key-sz 16 --auth-algo sha1-hmac --auth-op generate
> > > > --auth-key-sz 64 --digest-sz
> > > > 12 --total-ops 10000000 --burst-sz 32 --buffer-sz 64
> > > >
> > > > With callback(+ RCU - totally opaque to data-plane threads)
> > > >     lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq
> Failed
> > > Deq        MOps        Gbps  Cycles/Buf
> > > >            7          64          32                10000000    10000000           0           0
> > > 0.8129      0.4162     2694.09
> > > >            7          64          32                10000000    10000000           0           0
> > > 0.8152      0.4174     2686.31
> > > >            7          64          32                10000000    10000000           0           0
> > > 0.8198      0.4197     2671.48
> > > >
> > > > Without callback:
> > > >     lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq
> Failed
> > > Deq        MOps        Gbps  Cycles/Buf
> > > >
> > > >            7          64          32               10000000    10000000           0           0
> > > 0.8234      0.4216     2659.81
> > > >            7          64          32               10000000    10000000           0           0
> > > 0.8247      0.4222     2655.63
> > > >            7          64          32               10000000    10000000           0           0
> > > 0.8123      0.4159     2695.90
> > >
> > >
> > > Just to cofirm:
> > > You implemented crypto enqueue callbacks using RCU QSBR online
> > > /offline (as suggested below) and numbers above are for:
> > > 1) callback code in place and some dummy callback installed
> > That's right. (+ RCU calling online + offline APIs inside
> > rte_cryptodev_enqueue_burst())
> > > 2) callback code in place but no callbacks installed
> > No callback code. i.e. Original code.
> 
> Ok, and if I get things right - difference between mean values is ~15 cycles?
Yes. May be, number are more stable on isolated core. Let's consider worst case too.
> That's seems like very good result to me.
> Can I suggest to run one more test for your new callback code in place, but no
> actual callbacks installed?
    lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq  Failed Deq        MOps        Gbps  Cycles/Buf

           7          64          32    10000000    10000000           0           0      0.8220      0.4209     2664.12
           7          64          32    10000000    10000000           0           0      0.8245      0.4221     2656.14
           7          64          32    10000000    10000000           0           0      0.8261      0.4229     2651.15

> Thanks
> Konstantin
> 
> > >
> > > Is my understanding correct here?
> > > Thanks
> > > Konstantin
> > >
> > >
> > > >
> > > > Regards
> > > > Abhinandan
> > > >
> > > > > -----Original Message-----
> > > > > From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > > > > Sent: Tuesday, September 29, 2020 2:33 PM
> > > > > To: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gujjar,
> > > > > Abhinandan S <abhinandan.gujjar@intel.com>; dev@dpdk.org;
> > > > > Doherty, Declan <declan.doherty@intel.com>
> > > > > Cc: jerinj@marvell.com; Akhil.goyal@nxp.com; Vangati, Narender
> > > > > <narender.vangati@intel.com>; nd <nd@arm.com>; nd <nd@arm.com>
> > > > > Subject: RE: [dpdk-dev] [v2 1/2] cryptodev: support enqueue
> > > > > callback functions
> > > > >
> > > > >
> > > > > >
> > > > > > > >
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS int
> > > > > > > > > > > > > > > > +rte_cryptodev_rcu_qsbr_add(uint8_t
> > > > > > > > > > > > > > > > +dev_id, struct rte_rcu_qsbr
> > > > > > > > > > > > > > > > +*qsbr) {
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +	struct rte_cryptodev *dev;
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > > > > > > > > > > > > +		CDEV_LOG_ERR("Invalid dev_id=%"
> > > PRIu8,
> > > > > > > dev_id);
> > > > > > > > > > > > > > > > +		return -EINVAL;
> > > > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > > > > > > > > > > > > +	dev->qsbr = qsbr;
> > > > > > > > > > > > > > > > +	return 0; }
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > So if I understand your patch correctly you
> > > > > > > > > > > > > > > propose a new working model for
> > > > > > > > > > > > > > > crypto-devs:
> > > > > > > > > > > > > > > 1. Control-plane has to allocate/setup
> > > > > > > > > > > > > > > rcu_qsbr and do rte_cryptodev_rcu_qsbr_add().
> > > > > > > > > > > > > > > 2. Data-plane has somehow to obtain pointer
> > > > > > > > > > > > > > > to that rcu_qsbr and wrap
> > > > > > > > > > > > > > > cryptodev_enqueue()
> > > > > > > > > > > > > > >    with rcu_qsbr_quiescent()  or
> > > > > > > > > rcu_qsbr_online()/rcu_qsbr_offline().
> > > > > > > > > > > > > > Yes. I think, it is not a new model. It is
> > > > > > > > > > > > > > same as RCU integration with
> > > > > > > > > > > LPM.
> > > > > > > > > > > > > > Please refer:
> > > > > > > > > > > > > > https://patches.dpdk.org/cover/73673/
> > > > > > > > > > > > >
> > > > > > > > > > > > > I am talking about new working model for
> > > > > > > > > > > > > crypto-dev
> > > > > > > > > enqueue/dequeue.
> > > > > > > > > > > > > As I said above now it becomes data-plane thread
> > > > > > > > > > > > > responsibility
> > > > > to:
> > > > > > > > > > > > >  -somehow to obtain pointer to that rcu_qsbr for
> > > > > > > > > > > > > each cryptodev it is
> > > > > > > > > > > using.
> > > > > > > > > > > > >  -call rcu sync functions
> > > > > > > > > > > > > (quiescent/online/offline) on a regular
> > > > > > > basis.
> > > > > > > > > > > > It is not on regular basis. When data plane comes
> > > > > > > > > > > > up, they report
> > > > > > > online.
> > > > > > > > > > > > They report quiescent when they are done with
> > > > > > > > > > > > critical section or shared
> > > > > > > > > > > structure.
> > > > > > > > > > >
> > > > > > > > > > > I understand that, but it means all existing apps
> > > > > > > > > > > have to be changed that
> > > > > > > > > way.
> > > > > > > > > > >
> > > > > > > > > > > > All though, there is some dataplane changes
> > > > > > > > > > > > involved here, I don't think, it
> > > > > > > > > > > is major.
> > > > > > > > > > >
> > > > > > > > > > > I still think our goal here should be to make no
> > > > > > > > > > > visible changes to the dataplane.
> > > > > > > > > > > I.E. all necessary data-plane changes need to be
> > > > > > > > > > > hidden inside CB invocation part.
> > > > > > > > > > Please note that this is being implemented using the
> > > > > > > > > > memory reclamation framework documented at
> > > > > > > > > > https://doc.dpdk.org/guides/prog_guide/rcu_lib.html#re
> > > > > > > > > > sour
> > > > > > > > > > ce-r
> > > > > > > > > > ecla
> > > > > > > > > > mati
> > > > > > > > > > on-framework-for-dpdk
> > > > > > > > > >
> > > > > > > > > > While using RCU there are couple of trade-offs that
> > > > > > > > > > applications have to
> > > > > > > > > consider:
> > > > > > > > > > 1) Performance - reporting the quiescent state too
> > > > > > > > > > often results in performance impact on data plane
> > > > > > > > > > 2) Amount of outstanding memory to reclaim - reporting
> > > > > > > > > > less often results in more outstanding memory to
> > > > > > > > > > reclaim
> > > > > > > > > >
> > > > > > > > > > Hence, the quiescent state reporting is left to the application.
> > > > > > > > > > The application decides how often it reports the
> > > > > > > > > > quiescent state and has control
> > > > > > > > > over the data plane performance and the outstanding
> > > > > > > > > memory to
> > > > > reclaim.
> > > > > > > > > >
> > > > > > > > > > When you say "new working model for crypto-dev
> > > > > > > > > > enqueue/dequeue",
> > > > > > > > > >
> > > > > > > > > > 1) are you comparing these with existing crypto-dev
> > > > > > > > > > enqueue/dequeue
> > > > > > > > > APIs? If yes, these are new APIs, it is not breaking anything.
> > > > > > > > > > 2) are you comparing these with existing call back
> > > > > > > > > > functions in ethdev enqueue/dequeue APIs? If yes,
> > > > > > > > > > agree that this is a new model. But, it is
> > > > > > > > > possible to support what ethdev supports along with the
> > > > > > > > > RCU method used in this patch.
> > > > > > > > >
> > > > > > > > > What I am talking about:
> > > > > > > > >
> > > > > > > > > Existing cryptodev enqueue/dequeue model doesn't require
> > > > > > > > > for the user to manage any RCU QSBR state manually.
> > > > > > > > > I believe that addition of ability to add/remove
> > > > > > > > > enqueue/dequeue callbacks shouldn't change existing
> > > > > > > > > working
> > > model.
> > > > > > > > > I think that adding/removing such callbacks has to be
> > > > > > > > > opaque to the user DP code and shouldn't require user to change
> it.
> > > > > > > > > Same as we have now for ethdev callback implementation.
> > > > > > > > The ethdev callback implementation conveniently leaves the
> > > > > > > > problem of
> > > > > > > freeing memory to the user to resolve, it does not handle the issue.
> > > > > > > > Hence, it "looks" to be opaque to the DP code. However, if
> > > > > > > > the application has to implement a safe way to free the
> > > > > > > > call back memory, its
> > > > > > > DP is affected based on call backs are being used or not.
> > > > > > >
> > > > > > > Yes, I think that's big drawback in initial ethdev callback
> > > > > > > implementation - it simply ignores DP/CP sync problem completely.
> > > > > > > Though I think it is possible to have both here:
> > > > > > >  keep callback "opaque" to DP code and provide some sync
> > > > > > > mechanism between DP/CP.
> > > > > > > Hopefully one day we can fix ethdev callbacks too.
> > > > > > The solution we develop can be used in ethdev too.
> > > > > >
> > > > > > >
> > > > > > > > > I think that forcing DP code to be aware that callbacks
> > > > > > > > > are present or not and to modify its behaviour depending
> > > > > > > > > on that nearly voids the purpose of having callbacks at all.
> > > > > > > > > In that case DP can just invoke callback function
> > > > > > > > > directly from it's
> > > > > > > codepath .
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > Note that now data-plane thread would have to do
> > > > > > > > > > > > > that always
> > > > > > > > > > > > > - even if there are now callbacks installed for
> > > > > > > > > > > > > that cryptodev queue
> > > > > > > > > right now.
> > > > > > > > > > > > > All that changes behaviour of existing apps and
> > > > > > > > > > > > > I presume would reduce adoption of  that fature.
> > > > > > > > > > If I understand this correct, you are talking about a
> > > > > > > > > > case where in the application might be
> > > > > > > > > > registering/unregistering multiple times during its
> > > > > > > > > > lifetime. In this case, yes, the application might be
> > > > > > > > > > reporting the
> > > > > > > > > quiescent state even when it has not registered the call backs.
> > > > > > > > > But, it has the flexibility to not report it if it
> > > > > > > > > implements additional
> > > logic.
> > > > > > > > > > Note that we are assuming that the application has to
> > > > > > > > > > report quiescent state only for using callback functions.
> > > > > > > > > > Most probably the application has
> > > > > > > > > other requirements to use RCU.
> > > > > > > > > > Why not support what is done for ethdev call back
> > > > > > > > > > functions along with
> > > > > > > > > providing RCU method?
> > > > > > > > > >
> > > > > > > > > > > > There is always trade off involved!
> > > > > > > > > > > > In the previous patch, you suggested that some
> > > > > > > > > > > > lazy app may not free up the memory allocated by add cb.
> > > > > > > > > > > > For such apps, this patch has sync mechanism with
> > > > > > > > > > > > some additional cost of CP & DP
> > > > > > > changes.
> > > > > > > > > > >
> > > > > > > > > > > Sigh, it is not about laziness of the app.
> > > > > > > > > > > The problem with current ethedev cb mechanism and
> > > > > > > > > > > yours
> > > > > > > > > > > V1 (which was just a clone of it) - CP doesn't know
> > > > > > > > > > > when it is safe after CB removal to free related memory.
> > > > > > > > > > >
> > > > > > > > > > > > > I still think all this callback mechanism should
> > > > > > > > > > > > > be totally opaque to data-plane threads - user
> > > > > > > > > > > > > shouldn't change his app code depending on would
> > > > > > > > > > > > > some enqueue/dequeue callbacks be
> > > > > > > > > installed or not.
> > > > > > > > > > > > I am not sure, how that can be implemented with
> > > > > > > > > > > > existing RCU
> > > > > > > design.
> > > > > > > > > > >
> > > > > > > > > > > As I said below the simplest way - with calling rcu
> > > > > > > > > > > onine/offline inside CB invocation block.
> > > > > > > > > > > That's why I asked you - did you try that approach
> > > > > > > > > > > and what is the perf numbers?
> > > > > > > > > > > I presume with no callbacks installed the perf
> > > > > > > > > > > change should be nearly
> > > > > > > > > zero.
> > > > > > > > > > >
> > > > > > > > > > > > @Honnappa Nagarahalli, Do you have any suggestions?
> > > > > > > > > > Reporting quiescent state in the call back functions
> > > > > > > > > > has several
> > > > > > > > > disadvantages:
> > > > > > > > > > 1) it will have performance impacts and the impacts
> > > > > > > > > > will increase as the
> > > > > > > > > number of data plane threads increase.
> > > > > > > > > > 2) It will require additional configuration parameters
> > > > > > > > > > to control how often the quiescent state is reported
> > > > > > > > > > to control the performance
> > > > > > > impact.
> > > > > > > > > > 3) Does not take advantage of the fact that most
> > > > > > > > > > probably the application is using RCU already
> > > > > > > > > > 4) There are few difficulties as well, please see below.
> > > > > > > > >
> > > > > > > > > I suggested Abhinandan to use RCU library because it is
> > > > > > > > > already there, and I thought it would be good not to
> > > > > > > > > re-implement
> > > the wheel.
> > > > > > > > > Though if you feel librte_rcu doesn't match that task -
> > > > > > > > > fine, let's do it without librte_rcu.
> > > > > > > > > After all, what we need here - just an atomic ref count
> > > > > > > > > per queue that we are going to increment at entering and
> > > > > > > > > leaving list of callbacks inside enqueue/dequeue.
> > > > > > > > Ok, looks like I missed the point that a queue is used by
> > > > > > > > a single data plane
> > > > > > > thread.
> > > > > > > > Along with ref count increment you need the memory
> > > > > > > > orderings to avoid
> > > > > > > race conditions. These will be the same ones used in RCU.
> > > > > > > > On the control plane, you need to read this counter and
> > > > > > > > poll for the
> > > > > > > counter updates. All this is same cost as in RCU.
> > > > > > >
> > > > > > > Agree.
> > > > > > >
> > > > > > > > To control the cost, you
> > > > > > > > will have to control the rate of quiescent state reporting
> > > > > > > > and might have to
> > > > > > > expose this as a configuration parameter.
> > > > > > > >
> > > > > > > > The other important information you have to consider is if
> > > > > > > > the thread is making any blocking calls, which may be in
> > > > > > > > some other library. The thread is supposed to call
> > > > > > > > rcu_qsbr_thread_offline API before calling a
> > > > > > > blocking call. This allows the RCU to know that this
> > > > > > > particular thread will not report quiescent state. The
> > > > > > > cryptodev library might not have
> > > > > that information.
> > > > > > > >
> > > > > > > > If you want to go ahead with this design, you can still
> > > > > > > > use RCU with single thread configuration (like you have
> > > > > > > > mentioned
> > > > > > > > below) and hide the
> > > > > > > details from the application.
> > > > > > >
> > > > > > > Yes,  same thought here -  use rcu_qsbr online/offline for
> > > > > > > DP part and hide actual sync details inside callback code.
> > > > > > We can give it a try. If we can have the performance numbers,
> > > > > > we can decide about how to control how often these APIs are
> > > > > > called on the data
> > > > > plane.
> > > > >
> > > > > To avoid misunderstanding: I am talking about calling
> > > > > online/offline with every
> > > > > cryptodev_enqueue() traversal over CB list.
> > > > >
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > That seems quite a big change and I don't
> > > > > > > > > > > > > > > think it is acceptable for most users.
> > > > > > > > > > > > > > > From my perspective adding/installing
> > > > > > > > > > > > > > > call-backs to the dev has to be opaque to the data-
> plane code.
> > > > > > > > > > > > > > > Also note that different callbacks can be
> > > > > > > > > > > > > > > installed by different entities (libs) and
> > > > > > > > > > > > > > > might have no idea about each
> > > > > > > other.
> > > > > > > > > > > > > > > That's why I thought it would be better to
> > > > > > > > > > > > > > > make all this RCU stuff internal inside cryptodev:
> > > > > > > > > > > > > > >     hide all this rcu_qsbr allocation/setup
> > > > > > > > > > > > > > > inside cryptod somehow to
> > > > > > > > > > > > > obtain pointer to that rcu_qsbr ev init/queue
> > > > > > > > > > > > > setup
> > > > > > > > > > > > > > >     invoke
> > > > > > > > > > > > > > > rcu_qsbr_online()/rcu_qsbr_offline()
> > > > > > > > > > > > > > > inside
> > > > > > > > > > > > > cryptodev_enqueue().
> > > > > > > > > > This will bring in the application related information
> > > > > > > > > > such as the thread ID
> > > > > > > > > into the library.
> > > > > > > > >
> > > > > > > > > I don't think it would.
> > > > > > > > > Cryptodev enqueue/dequeue functions are not supposed to
> > > > > > > > > be thread safe (same as rx/tx burst).
> > > > > > > > > So we can always use RCU with just one thread(thread_id = 0).
> > > > > > > > Agree, the memory that needs to be freed is accessed by a
> > > > > > > > single thread
> > > > > > > on the data plane. RCU with one thread would suffice.
> > > > > > > >
> > > > > > > > > But as I said above - if you feel RCU lib is an overhead
> > > > > > > > > here, that's fine - I think it would be easy enough to
> > > > > > > > > do without
> > > librte_rcu.
> > > > > > > > >
> > > > > > > > > > If the same API calls are being made from multiple
> > > > > > > > > > data plane threads, you need a way to configure that
> > > > > > > > > > information to the library. So, it is better to leave
> > > > > > > > > > those details for the application to
> > > > > handle.
> > > > > > > > > >
> > > > > > > > > > > > > > I have already tried exploring above stuffs.
> > > > > > > > > > > > > > There are too many
> > > > > > > > > > > constraints.
> > > > > > > > > > > > > > The changes don't fit in, as per RCU design.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Hmm could you be more specific here - what
> > > > > > > > > > > > > constraints are you referring to?
> > > > > > > > > > > > >
> > > > > > > > > > > > > > Moreover, having rcu api under enqueue_burst()
> > > > > > > > > > > > > > will affect the
> > > > > > > > > > > > > performance.
> > > > > > > > > > > > >
> > > > > > > > > > > > > It most likely will. Though my expectation it
> > > > > > > > > > > > > will affect performance only when some callbacks
> > > > > > > > > > > > > are installed. My thought
> > > > > > > > > here:
> > > > > > > > > > > > > callback function by itself will affect
> > > > > > > > > > > > > cryptdev_enqueue performance anyway,
> > > > > > > > > > > > With existing callback design, I have measured the
> > > > > > > > > > > > performance(with
> > > > > > > > > > > crypto perf test) on xeon.
> > > > > > > > > > > > It was almost negligible and same was shared with Declan.
> > > > > > > > > > >
> > > > > > > > > > > I am asking about different thing: did you try
> > > > > > > > > > > alternate approach I described, that wouldn't
> > > > > > > > > > > require changes in the user data-
> > > > > > > plane code.
> > > > > > > > > > >
> > > > > > > > > > > > That is one of the reasons, I didn't want to add
> > > > > > > > > > > > to many stuffs in to the
> > > > > > > > > > > callback.
> > > > > > > > > > > > The best part of existing design is crypto lib is
> > > > > > > > > > > > not much
> > > modified.
> > > > > > > > > > > > The changes are either pushed to CP or DP.
> > > > > > > > > > > >
> > > > > > > > > > > > so adding extra overhead for sync is probably ok here.
> > > > > > > > > > >
> > > > > > > > > > > I think that extra overhead when callbacks are
> > > > > > > > > > > present is expected and probably acceptable.
> > > > > > > > > > > Changes in the upper-layer data-plane code - probably not.
> > > > > > > > > > >
> > > > > > > > > > > > > Though for situation when no callbacks are
> > > > > > > > > > > > > installed
> > > > > > > > > > > > > - perfomance should be left unaffected (or
> > > > > > > > > > > > > impact should be as small
> > > > > > > > > as possible).
> > > > > > > > > > > > >
> > > > > > > > > > > > > > The changes are more on control plane side,
> > > > > > > > > > > > > > which is one
> > > > > time.
> > > > > > > > > > > > > > The data plane changes are minimal.
> > > > > > > > > > > > >
> > > > > > > > > > > > > I still think upper layer data-plane code should
> > > > > > > > > > > > > stay unaffected (zero changes).
> > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > >
> > > > > > > > <snip>
Ananyev, Konstantin Oct. 12, 2020, 1:02 p.m. UTC | #21
Hi Abhinandan,
 
> Hi Konstantin,
> 
> > -----Original Message-----
> > From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > Sent: Monday, October 12, 2020 4:33 AM
> > To: Gujjar, Abhinandan S <abhinandan.gujjar@intel.com>; Honnappa
> > Nagarahalli <Honnappa.Nagarahalli@arm.com>; dev@dpdk.org; Doherty,
> > Declan <declan.doherty@intel.com>
> > Cc: jerinj@marvell.com; Akhil.goyal@nxp.com; Vangati, Narender
> > <narender.vangati@intel.com>; nd <nd@arm.com>; nd <nd@arm.com>
> > Subject: RE: [dpdk-dev] [v2 1/2] cryptodev: support enqueue callback functions
> >
> >
> >
> > Hi Abhinandan,
> >
> > >
> > > Hi Konstantin,
> > >
> > >
> > > > -----Original Message-----
> > > > From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > > > Sent: Friday, October 9, 2020 8:10 PM
> > > > To: Gujjar, Abhinandan S <abhinandan.gujjar@intel.com>; Honnappa
> > > > Nagarahalli <Honnappa.Nagarahalli@arm.com>; dev@dpdk.org; Doherty,
> > > > Declan <declan.doherty@intel.com>
> > > > Cc: jerinj@marvell.com; Akhil.goyal@nxp.com; Vangati, Narender
> > > > <narender.vangati@intel.com>; nd <nd@arm.com>; nd <nd@arm.com>
> > > > Subject: RE: [dpdk-dev] [v2 1/2] cryptodev: support enqueue callback
> > > > functions
> > > >
> > > > Hi Abhinandan,
> > > >
> > > > >
> > > > > Hi Konstantin & Honnappa,
> > > > >
> > > > > Thanks for all the inputs and feedback.
> > > > >
> > > > > @Ananyev, Konstantin,
> > > > > I have measured the perf with and without callback on xeon. Here
> > > > > are the
> > > > numbers:
> > > > >
> > > > > ./app/dpdk-test-crypto-perf -l 6-7
> > > > > --vdev="crypto_openssl0,socket_id=0,max_nb_sessions=128" --
> > > > > --ptest throughput --devtype crypto_openssl --optype
> > > > > cipher-then-auth --cipher-algo aes-cbc --cipher-op encrypt
> > > > > --cipher-key-sz 16 --auth-algo sha1-hmac --auth-op generate
> > > > > --auth-key-sz 64 --digest-sz
> > > > > 12 --total-ops 10000000 --burst-sz 32 --buffer-sz 64
> > > > >
> > > > > With callback(+ RCU - totally opaque to data-plane threads)
> > > > >     lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq
> > Failed
> > > > Deq        MOps        Gbps  Cycles/Buf
> > > > >            7          64          32                10000000    10000000           0           0
> > > > 0.8129      0.4162     2694.09
> > > > >            7          64          32                10000000    10000000           0           0
> > > > 0.8152      0.4174     2686.31
> > > > >            7          64          32                10000000    10000000           0           0
> > > > 0.8198      0.4197     2671.48
> > > > >
> > > > > Without callback:
> > > > >     lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq
> > Failed
> > > > Deq        MOps        Gbps  Cycles/Buf
> > > > >
> > > > >            7          64          32               10000000    10000000           0           0
> > > > 0.8234      0.4216     2659.81
> > > > >            7          64          32               10000000    10000000           0           0
> > > > 0.8247      0.4222     2655.63
> > > > >            7          64          32               10000000    10000000           0           0
> > > > 0.8123      0.4159     2695.90
> > > >
> > > >
> > > > Just to cofirm:
> > > > You implemented crypto enqueue callbacks using RCU QSBR online
> > > > /offline (as suggested below) and numbers above are for:
> > > > 1) callback code in place and some dummy callback installed
> > > That's right. (+ RCU calling online + offline APIs inside
> > > rte_cryptodev_enqueue_burst())
> > > > 2) callback code in place but no callbacks installed
> > > No callback code. i.e. Original code.
> >
> > Ok, and if I get things right - difference between mean values is ~15 cycles?
> Yes. May be, number are more stable on isolated core. Let's consider worst case too.

Ok.

> > That's seems like very good result to me.
> > Can I suggest to run one more test for your new callback code in place, but no
> > actual callbacks installed?
>     lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq  Failed Deq        MOps        Gbps  Cycles/Buf
> 
>            7          64          32    10000000    10000000           0           0      0.8220      0.4209     2664.12
>            7          64          32    10000000    10000000           0           0      0.8245      0.4221     2656.14
>            7          64          32    10000000    10000000           0           0      0.8261      0.4229     2651.15

So, if I can read numbers properly for not-armed callback impact is neglectable.
It is hard to say much without seeing the actual code, but from the numbers above,
I think it is a good result and we can go ahead with that approach.
Honnappa, Akhil, Jerin do you have any objections to such approach in principle?
Konstantin

> > Thanks
> > Konstantin
> >
> > > >
> > > > Is my understanding correct here?
> > > > Thanks
> > > > Konstantin
> > > >
> > > >
> > > > >
> > > > > Regards
> > > > > Abhinandan
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > > > > > Sent: Tuesday, September 29, 2020 2:33 PM
> > > > > > To: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Gujjar,
> > > > > > Abhinandan S <abhinandan.gujjar@intel.com>; dev@dpdk.org;
> > > > > > Doherty, Declan <declan.doherty@intel.com>
> > > > > > Cc: jerinj@marvell.com; Akhil.goyal@nxp.com; Vangati, Narender
> > > > > > <narender.vangati@intel.com>; nd <nd@arm.com>; nd <nd@arm.com>
> > > > > > Subject: RE: [dpdk-dev] [v2 1/2] cryptodev: support enqueue
> > > > > > callback functions
> > > > > >
> > > > > >
> > > > > > >
> > > > > > > > >
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS int
> > > > > > > > > > > > > > > > > +rte_cryptodev_rcu_qsbr_add(uint8_t
> > > > > > > > > > > > > > > > > +dev_id, struct rte_rcu_qsbr
> > > > > > > > > > > > > > > > > +*qsbr) {
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +	struct rte_cryptodev *dev;
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > > > > > > > > > > > > > +		CDEV_LOG_ERR("Invalid dev_id=%"
> > > > PRIu8,
> > > > > > > > dev_id);
> > > > > > > > > > > > > > > > > +		return -EINVAL;
> > > > > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > > > > > > > > > > > > > +	dev->qsbr = qsbr;
> > > > > > > > > > > > > > > > > +	return 0; }
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > So if I understand your patch correctly you
> > > > > > > > > > > > > > > > propose a new working model for
> > > > > > > > > > > > > > > > crypto-devs:
> > > > > > > > > > > > > > > > 1. Control-plane has to allocate/setup
> > > > > > > > > > > > > > > > rcu_qsbr and do rte_cryptodev_rcu_qsbr_add().
> > > > > > > > > > > > > > > > 2. Data-plane has somehow to obtain pointer
> > > > > > > > > > > > > > > > to that rcu_qsbr and wrap
> > > > > > > > > > > > > > > > cryptodev_enqueue()
> > > > > > > > > > > > > > > >    with rcu_qsbr_quiescent()  or
> > > > > > > > > > rcu_qsbr_online()/rcu_qsbr_offline().
> > > > > > > > > > > > > > > Yes. I think, it is not a new model. It is
> > > > > > > > > > > > > > > same as RCU integration with
> > > > > > > > > > > > LPM.
> > > > > > > > > > > > > > > Please refer:
> > > > > > > > > > > > > > > https://patches.dpdk.org/cover/73673/
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > I am talking about new working model for
> > > > > > > > > > > > > > crypto-dev
> > > > > > > > > > enqueue/dequeue.
> > > > > > > > > > > > > > As I said above now it becomes data-plane thread
> > > > > > > > > > > > > > responsibility
> > > > > > to:
> > > > > > > > > > > > > >  -somehow to obtain pointer to that rcu_qsbr for
> > > > > > > > > > > > > > each cryptodev it is
> > > > > > > > > > > > using.
> > > > > > > > > > > > > >  -call rcu sync functions
> > > > > > > > > > > > > > (quiescent/online/offline) on a regular
> > > > > > > > basis.
> > > > > > > > > > > > > It is not on regular basis. When data plane comes
> > > > > > > > > > > > > up, they report
> > > > > > > > online.
> > > > > > > > > > > > > They report quiescent when they are done with
> > > > > > > > > > > > > critical section or shared
> > > > > > > > > > > > structure.
> > > > > > > > > > > >
> > > > > > > > > > > > I understand that, but it means all existing apps
> > > > > > > > > > > > have to be changed that
> > > > > > > > > > way.
> > > > > > > > > > > >
> > > > > > > > > > > > > All though, there is some dataplane changes
> > > > > > > > > > > > > involved here, I don't think, it
> > > > > > > > > > > > is major.
> > > > > > > > > > > >
> > > > > > > > > > > > I still think our goal here should be to make no
> > > > > > > > > > > > visible changes to the dataplane.
> > > > > > > > > > > > I.E. all necessary data-plane changes need to be
> > > > > > > > > > > > hidden inside CB invocation part.
> > > > > > > > > > > Please note that this is being implemented using the
> > > > > > > > > > > memory reclamation framework documented at
> > > > > > > > > > > https://doc.dpdk.org/guides/prog_guide/rcu_lib.html#re
> > > > > > > > > > > sour
> > > > > > > > > > > ce-r
> > > > > > > > > > > ecla
> > > > > > > > > > > mati
> > > > > > > > > > > on-framework-for-dpdk
> > > > > > > > > > >
> > > > > > > > > > > While using RCU there are couple of trade-offs that
> > > > > > > > > > > applications have to
> > > > > > > > > > consider:
> > > > > > > > > > > 1) Performance - reporting the quiescent state too
> > > > > > > > > > > often results in performance impact on data plane
> > > > > > > > > > > 2) Amount of outstanding memory to reclaim - reporting
> > > > > > > > > > > less often results in more outstanding memory to
> > > > > > > > > > > reclaim
> > > > > > > > > > >
> > > > > > > > > > > Hence, the quiescent state reporting is left to the application.
> > > > > > > > > > > The application decides how often it reports the
> > > > > > > > > > > quiescent state and has control
> > > > > > > > > > over the data plane performance and the outstanding
> > > > > > > > > > memory to
> > > > > > reclaim.
> > > > > > > > > > >
> > > > > > > > > > > When you say "new working model for crypto-dev
> > > > > > > > > > > enqueue/dequeue",
> > > > > > > > > > >
> > > > > > > > > > > 1) are you comparing these with existing crypto-dev
> > > > > > > > > > > enqueue/dequeue
> > > > > > > > > > APIs? If yes, these are new APIs, it is not breaking anything.
> > > > > > > > > > > 2) are you comparing these with existing call back
> > > > > > > > > > > functions in ethdev enqueue/dequeue APIs? If yes,
> > > > > > > > > > > agree that this is a new model. But, it is
> > > > > > > > > > possible to support what ethdev supports along with the
> > > > > > > > > > RCU method used in this patch.
> > > > > > > > > >
> > > > > > > > > > What I am talking about:
> > > > > > > > > >
> > > > > > > > > > Existing cryptodev enqueue/dequeue model doesn't require
> > > > > > > > > > for the user to manage any RCU QSBR state manually.
> > > > > > > > > > I believe that addition of ability to add/remove
> > > > > > > > > > enqueue/dequeue callbacks shouldn't change existing
> > > > > > > > > > working
> > > > model.
> > > > > > > > > > I think that adding/removing such callbacks has to be
> > > > > > > > > > opaque to the user DP code and shouldn't require user to change
> > it.
> > > > > > > > > > Same as we have now for ethdev callback implementation.
> > > > > > > > > The ethdev callback implementation conveniently leaves the
> > > > > > > > > problem of
> > > > > > > > freeing memory to the user to resolve, it does not handle the issue.
> > > > > > > > > Hence, it "looks" to be opaque to the DP code. However, if
> > > > > > > > > the application has to implement a safe way to free the
> > > > > > > > > call back memory, its
> > > > > > > > DP is affected based on call backs are being used or not.
> > > > > > > >
> > > > > > > > Yes, I think that's big drawback in initial ethdev callback
> > > > > > > > implementation - it simply ignores DP/CP sync problem completely.
> > > > > > > > Though I think it is possible to have both here:
> > > > > > > >  keep callback "opaque" to DP code and provide some sync
> > > > > > > > mechanism between DP/CP.
> > > > > > > > Hopefully one day we can fix ethdev callbacks too.
> > > > > > > The solution we develop can be used in ethdev too.
> > > > > > >
> > > > > > > >
> > > > > > > > > > I think that forcing DP code to be aware that callbacks
> > > > > > > > > > are present or not and to modify its behaviour depending
> > > > > > > > > > on that nearly voids the purpose of having callbacks at all.
> > > > > > > > > > In that case DP can just invoke callback function
> > > > > > > > > > directly from it's
> > > > > > > > codepath .
> > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > Note that now data-plane thread would have to do
> > > > > > > > > > > > > > that always
> > > > > > > > > > > > > > - even if there are now callbacks installed for
> > > > > > > > > > > > > > that cryptodev queue
> > > > > > > > > > right now.
> > > > > > > > > > > > > > All that changes behaviour of existing apps and
> > > > > > > > > > > > > > I presume would reduce adoption of  that fature.
> > > > > > > > > > > If I understand this correct, you are talking about a
> > > > > > > > > > > case where in the application might be
> > > > > > > > > > > registering/unregistering multiple times during its
> > > > > > > > > > > lifetime. In this case, yes, the application might be
> > > > > > > > > > > reporting the
> > > > > > > > > > quiescent state even when it has not registered the call backs.
> > > > > > > > > > But, it has the flexibility to not report it if it
> > > > > > > > > > implements additional
> > > > logic.
> > > > > > > > > > > Note that we are assuming that the application has to
> > > > > > > > > > > report quiescent state only for using callback functions.
> > > > > > > > > > > Most probably the application has
> > > > > > > > > > other requirements to use RCU.
> > > > > > > > > > > Why not support what is done for ethdev call back
> > > > > > > > > > > functions along with
> > > > > > > > > > providing RCU method?
> > > > > > > > > > >
> > > > > > > > > > > > > There is always trade off involved!
> > > > > > > > > > > > > In the previous patch, you suggested that some
> > > > > > > > > > > > > lazy app may not free up the memory allocated by add cb.
> > > > > > > > > > > > > For such apps, this patch has sync mechanism with
> > > > > > > > > > > > > some additional cost of CP & DP
> > > > > > > > changes.
> > > > > > > > > > > >
> > > > > > > > > > > > Sigh, it is not about laziness of the app.
> > > > > > > > > > > > The problem with current ethedev cb mechanism and
> > > > > > > > > > > > yours
> > > > > > > > > > > > V1 (which was just a clone of it) - CP doesn't know
> > > > > > > > > > > > when it is safe after CB removal to free related memory.
> > > > > > > > > > > >
> > > > > > > > > > > > > > I still think all this callback mechanism should
> > > > > > > > > > > > > > be totally opaque to data-plane threads - user
> > > > > > > > > > > > > > shouldn't change his app code depending on would
> > > > > > > > > > > > > > some enqueue/dequeue callbacks be
> > > > > > > > > > installed or not.
> > > > > > > > > > > > > I am not sure, how that can be implemented with
> > > > > > > > > > > > > existing RCU
> > > > > > > > design.
> > > > > > > > > > > >
> > > > > > > > > > > > As I said below the simplest way - with calling rcu
> > > > > > > > > > > > onine/offline inside CB invocation block.
> > > > > > > > > > > > That's why I asked you - did you try that approach
> > > > > > > > > > > > and what is the perf numbers?
> > > > > > > > > > > > I presume with no callbacks installed the perf
> > > > > > > > > > > > change should be nearly
> > > > > > > > > > zero.
> > > > > > > > > > > >
> > > > > > > > > > > > > @Honnappa Nagarahalli, Do you have any suggestions?
> > > > > > > > > > > Reporting quiescent state in the call back functions
> > > > > > > > > > > has several
> > > > > > > > > > disadvantages:
> > > > > > > > > > > 1) it will have performance impacts and the impacts
> > > > > > > > > > > will increase as the
> > > > > > > > > > number of data plane threads increase.
> > > > > > > > > > > 2) It will require additional configuration parameters
> > > > > > > > > > > to control how often the quiescent state is reported
> > > > > > > > > > > to control the performance
> > > > > > > > impact.
> > > > > > > > > > > 3) Does not take advantage of the fact that most
> > > > > > > > > > > probably the application is using RCU already
> > > > > > > > > > > 4) There are few difficulties as well, please see below.
> > > > > > > > > >
> > > > > > > > > > I suggested Abhinandan to use RCU library because it is
> > > > > > > > > > already there, and I thought it would be good not to
> > > > > > > > > > re-implement
> > > > the wheel.
> > > > > > > > > > Though if you feel librte_rcu doesn't match that task -
> > > > > > > > > > fine, let's do it without librte_rcu.
> > > > > > > > > > After all, what we need here - just an atomic ref count
> > > > > > > > > > per queue that we are going to increment at entering and
> > > > > > > > > > leaving list of callbacks inside enqueue/dequeue.
> > > > > > > > > Ok, looks like I missed the point that a queue is used by
> > > > > > > > > a single data plane
> > > > > > > > thread.
> > > > > > > > > Along with ref count increment you need the memory
> > > > > > > > > orderings to avoid
> > > > > > > > race conditions. These will be the same ones used in RCU.
> > > > > > > > > On the control plane, you need to read this counter and
> > > > > > > > > poll for the
> > > > > > > > counter updates. All this is same cost as in RCU.
> > > > > > > >
> > > > > > > > Agree.
> > > > > > > >
> > > > > > > > > To control the cost, you
> > > > > > > > > will have to control the rate of quiescent state reporting
> > > > > > > > > and might have to
> > > > > > > > expose this as a configuration parameter.
> > > > > > > > >
> > > > > > > > > The other important information you have to consider is if
> > > > > > > > > the thread is making any blocking calls, which may be in
> > > > > > > > > some other library. The thread is supposed to call
> > > > > > > > > rcu_qsbr_thread_offline API before calling a
> > > > > > > > blocking call. This allows the RCU to know that this
> > > > > > > > particular thread will not report quiescent state. The
> > > > > > > > cryptodev library might not have
> > > > > > that information.
> > > > > > > > >
> > > > > > > > > If you want to go ahead with this design, you can still
> > > > > > > > > use RCU with single thread configuration (like you have
> > > > > > > > > mentioned
> > > > > > > > > below) and hide the
> > > > > > > > details from the application.
> > > > > > > >
> > > > > > > > Yes,  same thought here -  use rcu_qsbr online/offline for
> > > > > > > > DP part and hide actual sync details inside callback code.
> > > > > > > We can give it a try. If we can have the performance numbers,
> > > > > > > we can decide about how to control how often these APIs are
> > > > > > > called on the data
> > > > > > plane.
> > > > > >
> > > > > > To avoid misunderstanding: I am talking about calling
> > > > > > online/offline with every
> > > > > > cryptodev_enqueue() traversal over CB list.
> > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > That seems quite a big change and I don't
> > > > > > > > > > > > > > > > think it is acceptable for most users.
> > > > > > > > > > > > > > > > From my perspective adding/installing
> > > > > > > > > > > > > > > > call-backs to the dev has to be opaque to the data-
> > plane code.
> > > > > > > > > > > > > > > > Also note that different callbacks can be
> > > > > > > > > > > > > > > > installed by different entities (libs) and
> > > > > > > > > > > > > > > > might have no idea about each
> > > > > > > > other.
> > > > > > > > > > > > > > > > That's why I thought it would be better to
> > > > > > > > > > > > > > > > make all this RCU stuff internal inside cryptodev:
> > > > > > > > > > > > > > > >     hide all this rcu_qsbr allocation/setup
> > > > > > > > > > > > > > > > inside cryptod somehow to
> > > > > > > > > > > > > > obtain pointer to that rcu_qsbr ev init/queue
> > > > > > > > > > > > > > setup
> > > > > > > > > > > > > > > >     invoke
> > > > > > > > > > > > > > > > rcu_qsbr_online()/rcu_qsbr_offline()
> > > > > > > > > > > > > > > > inside
> > > > > > > > > > > > > > cryptodev_enqueue().
> > > > > > > > > > > This will bring in the application related information
> > > > > > > > > > > such as the thread ID
> > > > > > > > > > into the library.
> > > > > > > > > >
> > > > > > > > > > I don't think it would.
> > > > > > > > > > Cryptodev enqueue/dequeue functions are not supposed to
> > > > > > > > > > be thread safe (same as rx/tx burst).
> > > > > > > > > > So we can always use RCU with just one thread(thread_id = 0).
> > > > > > > > > Agree, the memory that needs to be freed is accessed by a
> > > > > > > > > single thread
> > > > > > > > on the data plane. RCU with one thread would suffice.
> > > > > > > > >
> > > > > > > > > > But as I said above - if you feel RCU lib is an overhead
> > > > > > > > > > here, that's fine - I think it would be easy enough to
> > > > > > > > > > do without
> > > > librte_rcu.
> > > > > > > > > >
> > > > > > > > > > > If the same API calls are being made from multiple
> > > > > > > > > > > data plane threads, you need a way to configure that
> > > > > > > > > > > information to the library. So, it is better to leave
> > > > > > > > > > > those details for the application to
> > > > > > handle.
> > > > > > > > > > >
> > > > > > > > > > > > > > > I have already tried exploring above stuffs.
> > > > > > > > > > > > > > > There are too many
> > > > > > > > > > > > constraints.
> > > > > > > > > > > > > > > The changes don't fit in, as per RCU design.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Hmm could you be more specific here - what
> > > > > > > > > > > > > > constraints are you referring to?
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Moreover, having rcu api under enqueue_burst()
> > > > > > > > > > > > > > > will affect the
> > > > > > > > > > > > > > performance.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > It most likely will. Though my expectation it
> > > > > > > > > > > > > > will affect performance only when some callbacks
> > > > > > > > > > > > > > are installed. My thought
> > > > > > > > > > here:
> > > > > > > > > > > > > > callback function by itself will affect
> > > > > > > > > > > > > > cryptdev_enqueue performance anyway,
> > > > > > > > > > > > > With existing callback design, I have measured the
> > > > > > > > > > > > > performance(with
> > > > > > > > > > > > crypto perf test) on xeon.
> > > > > > > > > > > > > It was almost negligible and same was shared with Declan.
> > > > > > > > > > > >
> > > > > > > > > > > > I am asking about different thing: did you try
> > > > > > > > > > > > alternate approach I described, that wouldn't
> > > > > > > > > > > > require changes in the user data-
> > > > > > > > plane code.
> > > > > > > > > > > >
> > > > > > > > > > > > > That is one of the reasons, I didn't want to add
> > > > > > > > > > > > > to many stuffs in to the
> > > > > > > > > > > > callback.
> > > > > > > > > > > > > The best part of existing design is crypto lib is
> > > > > > > > > > > > > not much
> > > > modified.
> > > > > > > > > > > > > The changes are either pushed to CP or DP.
> > > > > > > > > > > > >
> > > > > > > > > > > > > so adding extra overhead for sync is probably ok here.
> > > > > > > > > > > >
> > > > > > > > > > > > I think that extra overhead when callbacks are
> > > > > > > > > > > > present is expected and probably acceptable.
> > > > > > > > > > > > Changes in the upper-layer data-plane code - probably not.
> > > > > > > > > > > >
> > > > > > > > > > > > > > Though for situation when no callbacks are
> > > > > > > > > > > > > > installed
> > > > > > > > > > > > > > - perfomance should be left unaffected (or
> > > > > > > > > > > > > > impact should be as small
> > > > > > > > > > as possible).
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > The changes are more on control plane side,
> > > > > > > > > > > > > > > which is one
> > > > > > time.
> > > > > > > > > > > > > > > The data plane changes are minimal.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > I still think upper layer data-plane code should
> > > > > > > > > > > > > > stay unaffected (zero changes).
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >
> > > > > > > > >
> > > > > > > > > <snip>
Honnappa Nagarahalli Oct. 19, 2020, 10:04 p.m. UTC | #22
<snip>

> > > > > Hi Abhinandan,
> > > > >
> > > > > >
> > > > > > Hi Konstantin & Honnappa,
> > > > > >
> > > > > > Thanks for all the inputs and feedback.
> > > > > >
> > > > > > @Ananyev, Konstantin,
> > > > > > I have measured the perf with and without callback on xeon.
> > > > > > Here are the
> > > > > numbers:
> > > > > >
> > > > > > ./app/dpdk-test-crypto-perf -l 6-7
> > > > > > --vdev="crypto_openssl0,socket_id=0,max_nb_sessions=128" --
> > > > > > --ptest throughput --devtype crypto_openssl --optype
> > > > > > cipher-then-auth --cipher-algo aes-cbc --cipher-op encrypt
> > > > > > --cipher-key-sz 16 --auth-algo sha1-hmac --auth-op generate
> > > > > > --auth-key-sz 64 --digest-sz
> > > > > > 12 --total-ops 10000000 --burst-sz 32 --buffer-sz 64
> > > > > >
> > > > > > With callback(+ RCU - totally opaque to data-plane threads)
> > > > > >     lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq
> > > Failed
> > > > > Deq        MOps        Gbps  Cycles/Buf
> > > > > >            7          64          32                10000000    10000000           0           0
> > > > > 0.8129      0.4162     2694.09
> > > > > >            7          64          32                10000000    10000000           0           0
> > > > > 0.8152      0.4174     2686.31
> > > > > >            7          64          32                10000000    10000000           0           0
> > > > > 0.8198      0.4197     2671.48
> > > > > >
> > > > > > Without callback:
> > > > > >     lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq
> > > Failed
> > > > > Deq        MOps        Gbps  Cycles/Buf
> > > > > >
> > > > > >            7          64          32               10000000    10000000           0           0
> > > > > 0.8234      0.4216     2659.81
> > > > > >            7          64          32               10000000    10000000           0           0
> > > > > 0.8247      0.4222     2655.63
> > > > > >            7          64          32               10000000    10000000           0           0
> > > > > 0.8123      0.4159     2695.90
> > > > >
> > > > >
> > > > > Just to cofirm:
> > > > > You implemented crypto enqueue callbacks using RCU QSBR online
> > > > > /offline (as suggested below) and numbers above are for:
> > > > > 1) callback code in place and some dummy callback installed
> > > > That's right. (+ RCU calling online + offline APIs inside
> > > > rte_cryptodev_enqueue_burst())
> > > > > 2) callback code in place but no callbacks installed
> > > > No callback code. i.e. Original code.
> > >
> > > Ok, and if I get things right - difference between mean values is ~15 cycles?
> > Yes. May be, number are more stable on isolated core. Let's consider worst
> case too.
> 
> Ok.
> 
> > > That's seems like very good result to me.
> > > Can I suggest to run one more test for your new callback code in
> > > place, but no actual callbacks installed?
> >     lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq  Failed Deq
> MOps        Gbps  Cycles/Buf
> >
> >            7          64          32    10000000    10000000           0           0      0.8220
> 0.4209     2664.12
> >            7          64          32    10000000    10000000           0           0      0.8245
> 0.4221     2656.14
> >            7          64          32    10000000    10000000           0           0      0.8261
> 0.4229     2651.15
> 
> So, if I can read numbers properly for not-armed callback impact is neglectable.
> It is hard to say much without seeing the actual code, but from the numbers
> above, I think it is a good result and we can go ahead with that approach.
> Honnappa, Akhil, Jerin do you have any objections to such approach in principle?
The numbers look good. I guess this needs to be tested on Arm platforms as well. It would be good to get the next version of the patch (along with the test case), others can test from there.

> Konstantin
> 
> > > Thanks
> > > Konstantin
> > >
> > > > >
> > > > > Is my understanding correct here?
> > > > > Thanks
> > > > > Konstantin
> > > > >
> > > > >
> > > > > >
> > > > > > Regards
> > > > > > Abhinandan
> > > > > >
> > > > > > > -----Original Message-----
> > > > > > > From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > > > > > > Sent: Tuesday, September 29, 2020 2:33 PM
> > > > > > > To: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>;
> > > > > > > Gujjar, Abhinandan S <abhinandan.gujjar@intel.com>;
> > > > > > > dev@dpdk.org; Doherty, Declan <declan.doherty@intel.com>
> > > > > > > Cc: jerinj@marvell.com; Akhil.goyal@nxp.com; Vangati,
> > > > > > > Narender <narender.vangati@intel.com>; nd <nd@arm.com>; nd
> > > > > > > <nd@arm.com>
> > > > > > > Subject: RE: [dpdk-dev] [v2 1/2] cryptodev: support enqueue
> > > > > > > callback functions
> > > > > > >
> > > > > > >
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS int
> > > > > > > > > > > > > > > > > > +rte_cryptodev_rcu_qsbr_add(uint8_t
> > > > > > > > > > > > > > > > > > +dev_id, struct rte_rcu_qsbr
> > > > > > > > > > > > > > > > > > +*qsbr) {
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +	struct rte_cryptodev *dev;
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > > > > > > > > > > > > > > +		CDEV_LOG_ERR("Invalid dev_id=%"
> > > > > PRIu8,
> > > > > > > > > dev_id);
> > > > > > > > > > > > > > > > > > +		return -EINVAL;
> > > > > > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > > > > > > > > > > > > > > +	dev->qsbr = qsbr;
> > > > > > > > > > > > > > > > > > +	return 0; }
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > So if I understand your patch correctly
> > > > > > > > > > > > > > > > > you propose a new working model for
> > > > > > > > > > > > > > > > > crypto-devs:
> > > > > > > > > > > > > > > > > 1. Control-plane has to allocate/setup
> > > > > > > > > > > > > > > > > rcu_qsbr and do rte_cryptodev_rcu_qsbr_add().
> > > > > > > > > > > > > > > > > 2. Data-plane has somehow to obtain
> > > > > > > > > > > > > > > > > pointer to that rcu_qsbr and wrap
> > > > > > > > > > > > > > > > > cryptodev_enqueue()
> > > > > > > > > > > > > > > > >    with rcu_qsbr_quiescent()  or
> > > > > > > > > > > rcu_qsbr_online()/rcu_qsbr_offline().
> > > > > > > > > > > > > > > > Yes. I think, it is not a new model. It is
> > > > > > > > > > > > > > > > same as RCU integration with
> > > > > > > > > > > > > LPM.
> > > > > > > > > > > > > > > > Please refer:
> > > > > > > > > > > > > > > > https://patches.dpdk.org/cover/73673/
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > I am talking about new working model for
> > > > > > > > > > > > > > > crypto-dev
> > > > > > > > > > > enqueue/dequeue.
> > > > > > > > > > > > > > > As I said above now it becomes data-plane
> > > > > > > > > > > > > > > thread responsibility
> > > > > > > to:
> > > > > > > > > > > > > > >  -somehow to obtain pointer to that rcu_qsbr
> > > > > > > > > > > > > > > for each cryptodev it is
> > > > > > > > > > > > > using.
> > > > > > > > > > > > > > >  -call rcu sync functions
> > > > > > > > > > > > > > > (quiescent/online/offline) on a regular
> > > > > > > > > basis.
> > > > > > > > > > > > > > It is not on regular basis. When data plane
> > > > > > > > > > > > > > comes up, they report
> > > > > > > > > online.
> > > > > > > > > > > > > > They report quiescent when they are done with
> > > > > > > > > > > > > > critical section or shared
> > > > > > > > > > > > > structure.
> > > > > > > > > > > > >
> > > > > > > > > > > > > I understand that, but it means all existing
> > > > > > > > > > > > > apps have to be changed that
> > > > > > > > > > > way.
> > > > > > > > > > > > >
> > > > > > > > > > > > > > All though, there is some dataplane changes
> > > > > > > > > > > > > > involved here, I don't think, it
> > > > > > > > > > > > > is major.
> > > > > > > > > > > > >
> > > > > > > > > > > > > I still think our goal here should be to make no
> > > > > > > > > > > > > visible changes to the dataplane.
> > > > > > > > > > > > > I.E. all necessary data-plane changes need to be
> > > > > > > > > > > > > hidden inside CB invocation part.
> > > > > > > > > > > > Please note that this is being implemented using
> > > > > > > > > > > > the memory reclamation framework documented at
> > > > > > > > > > > > https://doc.dpdk.org/guides/prog_guide/rcu_lib.htm
> > > > > > > > > > > > l#re
> > > > > > > > > > > > sour
> > > > > > > > > > > > ce-r
> > > > > > > > > > > > ecla
> > > > > > > > > > > > mati
> > > > > > > > > > > > on-framework-for-dpdk
> > > > > > > > > > > >
> > > > > > > > > > > > While using RCU there are couple of trade-offs
> > > > > > > > > > > > that applications have to
> > > > > > > > > > > consider:
> > > > > > > > > > > > 1) Performance - reporting the quiescent state too
> > > > > > > > > > > > often results in performance impact on data plane
> > > > > > > > > > > > 2) Amount of outstanding memory to reclaim -
> > > > > > > > > > > > reporting less often results in more outstanding
> > > > > > > > > > > > memory to reclaim
> > > > > > > > > > > >
> > > > > > > > > > > > Hence, the quiescent state reporting is left to the application.
> > > > > > > > > > > > The application decides how often it reports the
> > > > > > > > > > > > quiescent state and has control
> > > > > > > > > > > over the data plane performance and the outstanding
> > > > > > > > > > > memory to
> > > > > > > reclaim.
> > > > > > > > > > > >
> > > > > > > > > > > > When you say "new working model for crypto-dev
> > > > > > > > > > > > enqueue/dequeue",
> > > > > > > > > > > >
> > > > > > > > > > > > 1) are you comparing these with existing
> > > > > > > > > > > > crypto-dev enqueue/dequeue
> > > > > > > > > > > APIs? If yes, these are new APIs, it is not breaking anything.
> > > > > > > > > > > > 2) are you comparing these with existing call back
> > > > > > > > > > > > functions in ethdev enqueue/dequeue APIs? If yes,
> > > > > > > > > > > > agree that this is a new model. But, it is
> > > > > > > > > > > possible to support what ethdev supports along with
> > > > > > > > > > > the RCU method used in this patch.
> > > > > > > > > > >
> > > > > > > > > > > What I am talking about:
> > > > > > > > > > >
> > > > > > > > > > > Existing cryptodev enqueue/dequeue model doesn't
> > > > > > > > > > > require for the user to manage any RCU QSBR state manually.
> > > > > > > > > > > I believe that addition of ability to add/remove
> > > > > > > > > > > enqueue/dequeue callbacks shouldn't change existing
> > > > > > > > > > > working
> > > > > model.
> > > > > > > > > > > I think that adding/removing such callbacks has to
> > > > > > > > > > > be opaque to the user DP code and shouldn't require
> > > > > > > > > > > user to change
> > > it.
> > > > > > > > > > > Same as we have now for ethdev callback implementation.
> > > > > > > > > > The ethdev callback implementation conveniently leaves
> > > > > > > > > > the problem of
> > > > > > > > > freeing memory to the user to resolve, it does not handle the
> issue.
> > > > > > > > > > Hence, it "looks" to be opaque to the DP code.
> > > > > > > > > > However, if the application has to implement a safe
> > > > > > > > > > way to free the call back memory, its
> > > > > > > > > DP is affected based on call backs are being used or not.
> > > > > > > > >
> > > > > > > > > Yes, I think that's big drawback in initial ethdev
> > > > > > > > > callback implementation - it simply ignores DP/CP sync problem
> completely.
> > > > > > > > > Though I think it is possible to have both here:
> > > > > > > > >  keep callback "opaque" to DP code and provide some sync
> > > > > > > > > mechanism between DP/CP.
> > > > > > > > > Hopefully one day we can fix ethdev callbacks too.
> > > > > > > > The solution we develop can be used in ethdev too.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > > I think that forcing DP code to be aware that
> > > > > > > > > > > callbacks are present or not and to modify its
> > > > > > > > > > > behaviour depending on that nearly voids the purpose of
> having callbacks at all.
> > > > > > > > > > > In that case DP can just invoke callback function
> > > > > > > > > > > directly from it's
> > > > > > > > > codepath .
> > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Note that now data-plane thread would have
> > > > > > > > > > > > > > > to do that always
> > > > > > > > > > > > > > > - even if there are now callbacks installed
> > > > > > > > > > > > > > > for that cryptodev queue
> > > > > > > > > > > right now.
> > > > > > > > > > > > > > > All that changes behaviour of existing apps
> > > > > > > > > > > > > > > and I presume would reduce adoption of  that fature.
> > > > > > > > > > > > If I understand this correct, you are talking
> > > > > > > > > > > > about a case where in the application might be
> > > > > > > > > > > > registering/unregistering multiple times during
> > > > > > > > > > > > its lifetime. In this case, yes, the application
> > > > > > > > > > > > might be reporting the
> > > > > > > > > > > quiescent state even when it has not registered the call backs.
> > > > > > > > > > > But, it has the flexibility to not report it if it
> > > > > > > > > > > implements additional
> > > > > logic.
> > > > > > > > > > > > Note that we are assuming that the application has
> > > > > > > > > > > > to report quiescent state only for using callback functions.
> > > > > > > > > > > > Most probably the application has
> > > > > > > > > > > other requirements to use RCU.
> > > > > > > > > > > > Why not support what is done for ethdev call back
> > > > > > > > > > > > functions along with
> > > > > > > > > > > providing RCU method?
> > > > > > > > > > > >
> > > > > > > > > > > > > > There is always trade off involved!
> > > > > > > > > > > > > > In the previous patch, you suggested that some
> > > > > > > > > > > > > > lazy app may not free up the memory allocated by add cb.
> > > > > > > > > > > > > > For such apps, this patch has sync mechanism
> > > > > > > > > > > > > > with some additional cost of CP & DP
> > > > > > > > > changes.
> > > > > > > > > > > > >
> > > > > > > > > > > > > Sigh, it is not about laziness of the app.
> > > > > > > > > > > > > The problem with current ethedev cb mechanism
> > > > > > > > > > > > > and yours
> > > > > > > > > > > > > V1 (which was just a clone of it) - CP doesn't
> > > > > > > > > > > > > know when it is safe after CB removal to free related
> memory.
> > > > > > > > > > > > >
> > > > > > > > > > > > > > > I still think all this callback mechanism
> > > > > > > > > > > > > > > should be totally opaque to data-plane
> > > > > > > > > > > > > > > threads - user shouldn't change his app code
> > > > > > > > > > > > > > > depending on would some enqueue/dequeue
> > > > > > > > > > > > > > > callbacks be
> > > > > > > > > > > installed or not.
> > > > > > > > > > > > > > I am not sure, how that can be implemented
> > > > > > > > > > > > > > with existing RCU
> > > > > > > > > design.
> > > > > > > > > > > > >
> > > > > > > > > > > > > As I said below the simplest way - with calling
> > > > > > > > > > > > > rcu onine/offline inside CB invocation block.
> > > > > > > > > > > > > That's why I asked you - did you try that
> > > > > > > > > > > > > approach and what is the perf numbers?
> > > > > > > > > > > > > I presume with no callbacks installed the perf
> > > > > > > > > > > > > change should be nearly
> > > > > > > > > > > zero.
> > > > > > > > > > > > >
> > > > > > > > > > > > > > @Honnappa Nagarahalli, Do you have any suggestions?
> > > > > > > > > > > > Reporting quiescent state in the call back
> > > > > > > > > > > > functions has several
> > > > > > > > > > > disadvantages:
> > > > > > > > > > > > 1) it will have performance impacts and the
> > > > > > > > > > > > impacts will increase as the
> > > > > > > > > > > number of data plane threads increase.
> > > > > > > > > > > > 2) It will require additional configuration
> > > > > > > > > > > > parameters to control how often the quiescent
> > > > > > > > > > > > state is reported to control the performance
> > > > > > > > > impact.
> > > > > > > > > > > > 3) Does not take advantage of the fact that most
> > > > > > > > > > > > probably the application is using RCU already
> > > > > > > > > > > > 4) There are few difficulties as well, please see below.
> > > > > > > > > > >
> > > > > > > > > > > I suggested Abhinandan to use RCU library because it
> > > > > > > > > > > is already there, and I thought it would be good not
> > > > > > > > > > > to re-implement
> > > > > the wheel.
> > > > > > > > > > > Though if you feel librte_rcu doesn't match that
> > > > > > > > > > > task - fine, let's do it without librte_rcu.
> > > > > > > > > > > After all, what we need here - just an atomic ref
> > > > > > > > > > > count per queue that we are going to increment at
> > > > > > > > > > > entering and leaving list of callbacks inside enqueue/dequeue.
> > > > > > > > > > Ok, looks like I missed the point that a queue is used
> > > > > > > > > > by a single data plane
> > > > > > > > > thread.
> > > > > > > > > > Along with ref count increment you need the memory
> > > > > > > > > > orderings to avoid
> > > > > > > > > race conditions. These will be the same ones used in RCU.
> > > > > > > > > > On the control plane, you need to read this counter
> > > > > > > > > > and poll for the
> > > > > > > > > counter updates. All this is same cost as in RCU.
> > > > > > > > >
> > > > > > > > > Agree.
> > > > > > > > >
> > > > > > > > > > To control the cost, you will have to control the rate
> > > > > > > > > > of quiescent state reporting and might have to
> > > > > > > > > expose this as a configuration parameter.
> > > > > > > > > >
> > > > > > > > > > The other important information you have to consider
> > > > > > > > > > is if the thread is making any blocking calls, which
> > > > > > > > > > may be in some other library. The thread is supposed
> > > > > > > > > > to call rcu_qsbr_thread_offline API before calling a
> > > > > > > > > blocking call. This allows the RCU to know that this
> > > > > > > > > particular thread will not report quiescent state. The
> > > > > > > > > cryptodev library might not have
> > > > > > > that information.
> > > > > > > > > >
> > > > > > > > > > If you want to go ahead with this design, you can
> > > > > > > > > > still use RCU with single thread configuration (like
> > > > > > > > > > you have mentioned
> > > > > > > > > > below) and hide the
> > > > > > > > > details from the application.
> > > > > > > > >
> > > > > > > > > Yes,  same thought here -  use rcu_qsbr online/offline
> > > > > > > > > for DP part and hide actual sync details inside callback code.
> > > > > > > > We can give it a try. If we can have the performance
> > > > > > > > numbers, we can decide about how to control how often
> > > > > > > > these APIs are called on the data
> > > > > > > plane.
> > > > > > >
> > > > > > > To avoid misunderstanding: I am talking about calling
> > > > > > > online/offline with every
> > > > > > > cryptodev_enqueue() traversal over CB list.
> > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > That seems quite a big change and I
> > > > > > > > > > > > > > > > > don't think it is acceptable for most users.
> > > > > > > > > > > > > > > > > From my perspective adding/installing
> > > > > > > > > > > > > > > > > call-backs to the dev has to be opaque
> > > > > > > > > > > > > > > > > to the data-
> > > plane code.
> > > > > > > > > > > > > > > > > Also note that different callbacks can
> > > > > > > > > > > > > > > > > be installed by different entities
> > > > > > > > > > > > > > > > > (libs) and might have no idea about each
> > > > > > > > > other.
> > > > > > > > > > > > > > > > > That's why I thought it would be better
> > > > > > > > > > > > > > > > > to make all this RCU stuff internal inside cryptodev:
> > > > > > > > > > > > > > > > >     hide all this rcu_qsbr
> > > > > > > > > > > > > > > > > allocation/setup inside cryptod somehow
> > > > > > > > > > > > > > > > > to
> > > > > > > > > > > > > > > obtain pointer to that rcu_qsbr ev
> > > > > > > > > > > > > > > init/queue setup
> > > > > > > > > > > > > > > > >     invoke
> > > > > > > > > > > > > > > > > rcu_qsbr_online()/rcu_qsbr_offline()
> > > > > > > > > > > > > > > > > inside
> > > > > > > > > > > > > > > cryptodev_enqueue().
> > > > > > > > > > > > This will bring in the application related
> > > > > > > > > > > > information such as the thread ID
> > > > > > > > > > > into the library.
> > > > > > > > > > >
> > > > > > > > > > > I don't think it would.
> > > > > > > > > > > Cryptodev enqueue/dequeue functions are not supposed
> > > > > > > > > > > to be thread safe (same as rx/tx burst).
> > > > > > > > > > > So we can always use RCU with just one thread(thread_id = 0).
> > > > > > > > > > Agree, the memory that needs to be freed is accessed
> > > > > > > > > > by a single thread
> > > > > > > > > on the data plane. RCU with one thread would suffice.
> > > > > > > > > >
> > > > > > > > > > > But as I said above - if you feel RCU lib is an
> > > > > > > > > > > overhead here, that's fine - I think it would be
> > > > > > > > > > > easy enough to do without
> > > > > librte_rcu.
> > > > > > > > > > >
> > > > > > > > > > > > If the same API calls are being made from multiple
> > > > > > > > > > > > data plane threads, you need a way to configure
> > > > > > > > > > > > that information to the library. So, it is better
> > > > > > > > > > > > to leave those details for the application to
> > > > > > > handle.
> > > > > > > > > > > >
> > > > > > > > > > > > > > > > I have already tried exploring above stuffs.
> > > > > > > > > > > > > > > > There are too many
> > > > > > > > > > > > > constraints.
> > > > > > > > > > > > > > > > The changes don't fit in, as per RCU design.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > Hmm could you be more specific here - what
> > > > > > > > > > > > > > > constraints are you referring to?
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > Moreover, having rcu api under
> > > > > > > > > > > > > > > > enqueue_burst() will affect the
> > > > > > > > > > > > > > > performance.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > It most likely will. Though my expectation
> > > > > > > > > > > > > > > it will affect performance only when some
> > > > > > > > > > > > > > > callbacks are installed. My thought
> > > > > > > > > > > here:
> > > > > > > > > > > > > > > callback function by itself will affect
> > > > > > > > > > > > > > > cryptdev_enqueue performance anyway,
> > > > > > > > > > > > > > With existing callback design, I have measured
> > > > > > > > > > > > > > the performance(with
> > > > > > > > > > > > > crypto perf test) on xeon.
> > > > > > > > > > > > > > It was almost negligible and same was shared with
> Declan.
> > > > > > > > > > > > >
> > > > > > > > > > > > > I am asking about different thing: did you try
> > > > > > > > > > > > > alternate approach I described, that wouldn't
> > > > > > > > > > > > > require changes in the user data-
> > > > > > > > > plane code.
> > > > > > > > > > > > >
> > > > > > > > > > > > > > That is one of the reasons, I didn't want to
> > > > > > > > > > > > > > add to many stuffs in to the
> > > > > > > > > > > > > callback.
> > > > > > > > > > > > > > The best part of existing design is crypto lib
> > > > > > > > > > > > > > is not much
> > > > > modified.
> > > > > > > > > > > > > > The changes are either pushed to CP or DP.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > so adding extra overhead for sync is probably ok here.
> > > > > > > > > > > > >
> > > > > > > > > > > > > I think that extra overhead when callbacks are
> > > > > > > > > > > > > present is expected and probably acceptable.
> > > > > > > > > > > > > Changes in the upper-layer data-plane code - probably not.
> > > > > > > > > > > > >
> > > > > > > > > > > > > > > Though for situation when no callbacks are
> > > > > > > > > > > > > > > installed
> > > > > > > > > > > > > > > - perfomance should be left unaffected (or
> > > > > > > > > > > > > > > impact should be as small
> > > > > > > > > > > as possible).
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > The changes are more on control plane
> > > > > > > > > > > > > > > > side, which is one
> > > > > > > time.
> > > > > > > > > > > > > > > > The data plane changes are minimal.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > I still think upper layer data-plane code
> > > > > > > > > > > > > > > should stay unaffected (zero changes).
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > <snip>
Abhinandan Gujjar Oct. 20, 2020, 9:25 a.m. UTC | #23
Hi Honnappa,

I will send out the next version of the patch soon.

Regards
Abhinandan

> -----Original Message-----
> From: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Sent: Tuesday, October 20, 2020 3:34 AM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Gujjar, Abhinandan S
> <abhinandan.gujjar@intel.com>; dev@dpdk.org; Doherty, Declan
> <declan.doherty@intel.com>
> Cc: jerinj@marvell.com; Akhil.goyal@nxp.com; Vangati, Narender
> <narender.vangati@intel.com>; nd <nd@arm.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>
> Subject: RE: [dpdk-dev] [v2 1/2] cryptodev: support enqueue callback functions
> 
> <snip>
> 
> > > > > > Hi Abhinandan,
> > > > > >
> > > > > > >
> > > > > > > Hi Konstantin & Honnappa,
> > > > > > >
> > > > > > > Thanks for all the inputs and feedback.
> > > > > > >
> > > > > > > @Ananyev, Konstantin,
> > > > > > > I have measured the perf with and without callback on xeon.
> > > > > > > Here are the
> > > > > > numbers:
> > > > > > >
> > > > > > > ./app/dpdk-test-crypto-perf -l 6-7
> > > > > > > --vdev="crypto_openssl0,socket_id=0,max_nb_sessions=128" --
> > > > > > > --ptest throughput --devtype crypto_openssl --optype
> > > > > > > cipher-then-auth --cipher-algo aes-cbc --cipher-op encrypt
> > > > > > > --cipher-key-sz 16 --auth-algo sha1-hmac --auth-op generate
> > > > > > > --auth-key-sz 64 --digest-sz
> > > > > > > 12 --total-ops 10000000 --burst-sz 32 --buffer-sz 64
> > > > > > >
> > > > > > > With callback(+ RCU - totally opaque to data-plane threads)
> > > > > > >     lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq
> > > > Failed
> > > > > > Deq        MOps        Gbps  Cycles/Buf
> > > > > > >            7          64          32                10000000    10000000           0
> 0
> > > > > > 0.8129      0.4162     2694.09
> > > > > > >            7          64          32                10000000    10000000           0
> 0
> > > > > > 0.8152      0.4174     2686.31
> > > > > > >            7          64          32                10000000    10000000           0
> 0
> > > > > > 0.8198      0.4197     2671.48
> > > > > > >
> > > > > > > Without callback:
> > > > > > >     lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq
> > > > Failed
> > > > > > Deq        MOps        Gbps  Cycles/Buf
> > > > > > >
> > > > > > >            7          64          32               10000000    10000000           0
> 0
> > > > > > 0.8234      0.4216     2659.81
> > > > > > >            7          64          32               10000000    10000000           0
> 0
> > > > > > 0.8247      0.4222     2655.63
> > > > > > >            7          64          32               10000000    10000000           0
> 0
> > > > > > 0.8123      0.4159     2695.90
> > > > > >
> > > > > >
> > > > > > Just to cofirm:
> > > > > > You implemented crypto enqueue callbacks using RCU QSBR online
> > > > > > /offline (as suggested below) and numbers above are for:
> > > > > > 1) callback code in place and some dummy callback installed
> > > > > That's right. (+ RCU calling online + offline APIs inside
> > > > > rte_cryptodev_enqueue_burst())
> > > > > > 2) callback code in place but no callbacks installed
> > > > > No callback code. i.e. Original code.
> > > >
> > > > Ok, and if I get things right - difference between mean values is ~15
> cycles?
> > > Yes. May be, number are more stable on isolated core. Let's consider
> > > worst
> > case too.
> >
> > Ok.
> >
> > > > That's seems like very good result to me.
> > > > Can I suggest to run one more test for your new callback code in
> > > > place, but no actual callbacks installed?
> > >     lcore id    Buf Size  Burst Size    Enqueued    Dequeued  Failed Enq  Failed
> Deq
> > MOps        Gbps  Cycles/Buf
> > >
> > >            7          64          32    10000000    10000000           0           0
> 0.8220
> > 0.4209     2664.12
> > >            7          64          32    10000000    10000000           0           0
> 0.8245
> > 0.4221     2656.14
> > >            7          64          32    10000000    10000000           0           0
> 0.8261
> > 0.4229     2651.15
> >
> > So, if I can read numbers properly for not-armed callback impact is
> neglectable.
> > It is hard to say much without seeing the actual code, but from the
> > numbers above, I think it is a good result and we can go ahead with that
> approach.
> > Honnappa, Akhil, Jerin do you have any objections to such approach in
> principle?
> The numbers look good. I guess this needs to be tested on Arm platforms as
> well. It would be good to get the next version of the patch (along with the test
> case), others can test from there.
> 
> > Konstantin
> >
> > > > Thanks
> > > > Konstantin
> > > >
> > > > > >
> > > > > > Is my understanding correct here?
> > > > > > Thanks
> > > > > > Konstantin
> > > > > >
> > > > > >
> > > > > > >
> > > > > > > Regards
> > > > > > > Abhinandan
> > > > > > >
> > > > > > > > -----Original Message-----
> > > > > > > > From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > > > > > > > Sent: Tuesday, September 29, 2020 2:33 PM
> > > > > > > > To: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>;
> > > > > > > > Gujjar, Abhinandan S <abhinandan.gujjar@intel.com>;
> > > > > > > > dev@dpdk.org; Doherty, Declan <declan.doherty@intel.com>
> > > > > > > > Cc: jerinj@marvell.com; Akhil.goyal@nxp.com; Vangati,
> > > > > > > > Narender <narender.vangati@intel.com>; nd <nd@arm.com>; nd
> > > > > > > > <nd@arm.com>
> > > > > > > > Subject: RE: [dpdk-dev] [v2 1/2] cryptodev: support
> > > > > > > > enqueue callback functions
> > > > > > > >
> > > > > > > >
> > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > > > +#ifdef RTE_CRYPTODEV_CALLBACKS int
> > > > > > > > > > > > > > > > > > > +rte_cryptodev_rcu_qsbr_add(uint8_t
> > > > > > > > > > > > > > > > > > > +dev_id, struct rte_rcu_qsbr
> > > > > > > > > > > > > > > > > > > +*qsbr) {
> > > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > > +	struct rte_cryptodev *dev;
> > > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > > +	if
> (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
> > > > > > > > > > > > > > > > > > > +		CDEV_LOG_ERR("Invalid
> dev_id=%"
> > > > > > PRIu8,
> > > > > > > > > > dev_id);
> > > > > > > > > > > > > > > > > > > +		return -EINVAL;
> > > > > > > > > > > > > > > > > > > +	}
> > > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > > +	dev = &rte_crypto_devices[dev_id];
> > > > > > > > > > > > > > > > > > > +	dev->qsbr = qsbr;
> > > > > > > > > > > > > > > > > > > +	return 0; }
> > > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > > So if I understand your patch
> > > > > > > > > > > > > > > > > > correctly you propose a new working
> > > > > > > > > > > > > > > > > > model for
> > > > > > > > > > > > > > > > > > crypto-devs:
> > > > > > > > > > > > > > > > > > 1. Control-plane has to allocate/setup
> > > > > > > > > > > > > > > > > > rcu_qsbr and do rte_cryptodev_rcu_qsbr_add().
> > > > > > > > > > > > > > > > > > 2. Data-plane has somehow to obtain
> > > > > > > > > > > > > > > > > > pointer to that rcu_qsbr and wrap
> > > > > > > > > > > > > > > > > > cryptodev_enqueue()
> > > > > > > > > > > > > > > > > >    with rcu_qsbr_quiescent()  or
> > > > > > > > > > > > rcu_qsbr_online()/rcu_qsbr_offline().
> > > > > > > > > > > > > > > > > Yes. I think, it is not a new model. It
> > > > > > > > > > > > > > > > > is same as RCU integration with
> > > > > > > > > > > > > > LPM.
> > > > > > > > > > > > > > > > > Please refer:
> > > > > > > > > > > > > > > > > https://patches.dpdk.org/cover/73673/
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > I am talking about new working model for
> > > > > > > > > > > > > > > > crypto-dev
> > > > > > > > > > > > enqueue/dequeue.
> > > > > > > > > > > > > > > > As I said above now it becomes data-plane
> > > > > > > > > > > > > > > > thread responsibility
> > > > > > > > to:
> > > > > > > > > > > > > > > >  -somehow to obtain pointer to that
> > > > > > > > > > > > > > > > rcu_qsbr for each cryptodev it is
> > > > > > > > > > > > > > using.
> > > > > > > > > > > > > > > >  -call rcu sync functions
> > > > > > > > > > > > > > > > (quiescent/online/offline) on a regular
> > > > > > > > > > basis.
> > > > > > > > > > > > > > > It is not on regular basis. When data plane
> > > > > > > > > > > > > > > comes up, they report
> > > > > > > > > > online.
> > > > > > > > > > > > > > > They report quiescent when they are done
> > > > > > > > > > > > > > > with critical section or shared
> > > > > > > > > > > > > > structure.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > I understand that, but it means all existing
> > > > > > > > > > > > > > apps have to be changed that
> > > > > > > > > > > > way.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > All though, there is some dataplane changes
> > > > > > > > > > > > > > > involved here, I don't think, it
> > > > > > > > > > > > > > is major.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > I still think our goal here should be to make
> > > > > > > > > > > > > > no visible changes to the dataplane.
> > > > > > > > > > > > > > I.E. all necessary data-plane changes need to
> > > > > > > > > > > > > > be hidden inside CB invocation part.
> > > > > > > > > > > > > Please note that this is being implemented using
> > > > > > > > > > > > > the memory reclamation framework documented at
> > > > > > > > > > > > > https://doc.dpdk.org/guides/prog_guide/rcu_lib.h
> > > > > > > > > > > > > tm
> > > > > > > > > > > > > l#re
> > > > > > > > > > > > > sour
> > > > > > > > > > > > > ce-r
> > > > > > > > > > > > > ecla
> > > > > > > > > > > > > mati
> > > > > > > > > > > > > on-framework-for-dpdk
> > > > > > > > > > > > >
> > > > > > > > > > > > > While using RCU there are couple of trade-offs
> > > > > > > > > > > > > that applications have to
> > > > > > > > > > > > consider:
> > > > > > > > > > > > > 1) Performance - reporting the quiescent state
> > > > > > > > > > > > > too often results in performance impact on data
> > > > > > > > > > > > > plane
> > > > > > > > > > > > > 2) Amount of outstanding memory to reclaim -
> > > > > > > > > > > > > reporting less often results in more outstanding
> > > > > > > > > > > > > memory to reclaim
> > > > > > > > > > > > >
> > > > > > > > > > > > > Hence, the quiescent state reporting is left to the
> application.
> > > > > > > > > > > > > The application decides how often it reports the
> > > > > > > > > > > > > quiescent state and has control
> > > > > > > > > > > > over the data plane performance and the
> > > > > > > > > > > > outstanding memory to
> > > > > > > > reclaim.
> > > > > > > > > > > > >
> > > > > > > > > > > > > When you say "new working model for crypto-dev
> > > > > > > > > > > > > enqueue/dequeue",
> > > > > > > > > > > > >
> > > > > > > > > > > > > 1) are you comparing these with existing
> > > > > > > > > > > > > crypto-dev enqueue/dequeue
> > > > > > > > > > > > APIs? If yes, these are new APIs, it is not breaking anything.
> > > > > > > > > > > > > 2) are you comparing these with existing call
> > > > > > > > > > > > > back functions in ethdev enqueue/dequeue APIs?
> > > > > > > > > > > > > If yes, agree that this is a new model. But, it
> > > > > > > > > > > > > is
> > > > > > > > > > > > possible to support what ethdev supports along
> > > > > > > > > > > > with the RCU method used in this patch.
> > > > > > > > > > > >
> > > > > > > > > > > > What I am talking about:
> > > > > > > > > > > >
> > > > > > > > > > > > Existing cryptodev enqueue/dequeue model doesn't
> > > > > > > > > > > > require for the user to manage any RCU QSBR state
> manually.
> > > > > > > > > > > > I believe that addition of ability to add/remove
> > > > > > > > > > > > enqueue/dequeue callbacks shouldn't change
> > > > > > > > > > > > existing working
> > > > > > model.
> > > > > > > > > > > > I think that adding/removing such callbacks has to
> > > > > > > > > > > > be opaque to the user DP code and shouldn't
> > > > > > > > > > > > require user to change
> > > > it.
> > > > > > > > > > > > Same as we have now for ethdev callback implementation.
> > > > > > > > > > > The ethdev callback implementation conveniently
> > > > > > > > > > > leaves the problem of
> > > > > > > > > > freeing memory to the user to resolve, it does not
> > > > > > > > > > handle the
> > issue.
> > > > > > > > > > > Hence, it "looks" to be opaque to the DP code.
> > > > > > > > > > > However, if the application has to implement a safe
> > > > > > > > > > > way to free the call back memory, its
> > > > > > > > > > DP is affected based on call backs are being used or not.
> > > > > > > > > >
> > > > > > > > > > Yes, I think that's big drawback in initial ethdev
> > > > > > > > > > callback implementation - it simply ignores DP/CP sync
> > > > > > > > > > problem
> > completely.
> > > > > > > > > > Though I think it is possible to have both here:
> > > > > > > > > >  keep callback "opaque" to DP code and provide some
> > > > > > > > > > sync mechanism between DP/CP.
> > > > > > > > > > Hopefully one day we can fix ethdev callbacks too.
> > > > > > > > > The solution we develop can be used in ethdev too.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > > > I think that forcing DP code to be aware that
> > > > > > > > > > > > callbacks are present or not and to modify its
> > > > > > > > > > > > behaviour depending on that nearly voids the
> > > > > > > > > > > > purpose of
> > having callbacks at all.
> > > > > > > > > > > > In that case DP can just invoke callback function
> > > > > > > > > > > > directly from it's
> > > > > > > > > > codepath .
> > > > > > > > > > > >
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > Note that now data-plane thread would have
> > > > > > > > > > > > > > > > to do that always
> > > > > > > > > > > > > > > > - even if there are now callbacks
> > > > > > > > > > > > > > > > installed for that cryptodev queue
> > > > > > > > > > > > right now.
> > > > > > > > > > > > > > > > All that changes behaviour of existing
> > > > > > > > > > > > > > > > apps and I presume would reduce adoption of  that
> fature.
> > > > > > > > > > > > > If I understand this correct, you are talking
> > > > > > > > > > > > > about a case where in the application might be
> > > > > > > > > > > > > registering/unregistering multiple times during
> > > > > > > > > > > > > its lifetime. In this case, yes, the application
> > > > > > > > > > > > > might be reporting the
> > > > > > > > > > > > quiescent state even when it has not registered the call
> backs.
> > > > > > > > > > > > But, it has the flexibility to not report it if it
> > > > > > > > > > > > implements additional
> > > > > > logic.
> > > > > > > > > > > > > Note that we are assuming that the application
> > > > > > > > > > > > > has to report quiescent state only for using callback
> functions.
> > > > > > > > > > > > > Most probably the application has
> > > > > > > > > > > > other requirements to use RCU.
> > > > > > > > > > > > > Why not support what is done for ethdev call
> > > > > > > > > > > > > back functions along with
> > > > > > > > > > > > providing RCU method?
> > > > > > > > > > > > >
> > > > > > > > > > > > > > > There is always trade off involved!
> > > > > > > > > > > > > > > In the previous patch, you suggested that
> > > > > > > > > > > > > > > some lazy app may not free up the memory allocated
> by add cb.
> > > > > > > > > > > > > > > For such apps, this patch has sync mechanism
> > > > > > > > > > > > > > > with some additional cost of CP & DP
> > > > > > > > > > changes.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Sigh, it is not about laziness of the app.
> > > > > > > > > > > > > > The problem with current ethedev cb mechanism
> > > > > > > > > > > > > > and yours
> > > > > > > > > > > > > > V1 (which was just a clone of it) - CP doesn't
> > > > > > > > > > > > > > know when it is safe after CB removal to free
> > > > > > > > > > > > > > related
> > memory.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > I still think all this callback mechanism
> > > > > > > > > > > > > > > > should be totally opaque to data-plane
> > > > > > > > > > > > > > > > threads - user shouldn't change his app
> > > > > > > > > > > > > > > > code depending on would some
> > > > > > > > > > > > > > > > enqueue/dequeue callbacks be
> > > > > > > > > > > > installed or not.
> > > > > > > > > > > > > > > I am not sure, how that can be implemented
> > > > > > > > > > > > > > > with existing RCU
> > > > > > > > > > design.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > As I said below the simplest way - with
> > > > > > > > > > > > > > calling rcu onine/offline inside CB invocation block.
> > > > > > > > > > > > > > That's why I asked you - did you try that
> > > > > > > > > > > > > > approach and what is the perf numbers?
> > > > > > > > > > > > > > I presume with no callbacks installed the perf
> > > > > > > > > > > > > > change should be nearly
> > > > > > > > > > > > zero.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > @Honnappa Nagarahalli, Do you have any
> suggestions?
> > > > > > > > > > > > > Reporting quiescent state in the call back
> > > > > > > > > > > > > functions has several
> > > > > > > > > > > > disadvantages:
> > > > > > > > > > > > > 1) it will have performance impacts and the
> > > > > > > > > > > > > impacts will increase as the
> > > > > > > > > > > > number of data plane threads increase.
> > > > > > > > > > > > > 2) It will require additional configuration
> > > > > > > > > > > > > parameters to control how often the quiescent
> > > > > > > > > > > > > state is reported to control the performance
> > > > > > > > > > impact.
> > > > > > > > > > > > > 3) Does not take advantage of the fact that most
> > > > > > > > > > > > > probably the application is using RCU already
> > > > > > > > > > > > > 4) There are few difficulties as well, please see below.
> > > > > > > > > > > >
> > > > > > > > > > > > I suggested Abhinandan to use RCU library because
> > > > > > > > > > > > it is already there, and I thought it would be
> > > > > > > > > > > > good not to re-implement
> > > > > > the wheel.
> > > > > > > > > > > > Though if you feel librte_rcu doesn't match that
> > > > > > > > > > > > task - fine, let's do it without librte_rcu.
> > > > > > > > > > > > After all, what we need here - just an atomic ref
> > > > > > > > > > > > count per queue that we are going to increment at
> > > > > > > > > > > > entering and leaving list of callbacks inside
> enqueue/dequeue.
> > > > > > > > > > > Ok, looks like I missed the point that a queue is
> > > > > > > > > > > used by a single data plane
> > > > > > > > > > thread.
> > > > > > > > > > > Along with ref count increment you need the memory
> > > > > > > > > > > orderings to avoid
> > > > > > > > > > race conditions. These will be the same ones used in RCU.
> > > > > > > > > > > On the control plane, you need to read this counter
> > > > > > > > > > > and poll for the
> > > > > > > > > > counter updates. All this is same cost as in RCU.
> > > > > > > > > >
> > > > > > > > > > Agree.
> > > > > > > > > >
> > > > > > > > > > > To control the cost, you will have to control the
> > > > > > > > > > > rate of quiescent state reporting and might have to
> > > > > > > > > > expose this as a configuration parameter.
> > > > > > > > > > >
> > > > > > > > > > > The other important information you have to consider
> > > > > > > > > > > is if the thread is making any blocking calls, which
> > > > > > > > > > > may be in some other library. The thread is supposed
> > > > > > > > > > > to call rcu_qsbr_thread_offline API before calling a
> > > > > > > > > > blocking call. This allows the RCU to know that this
> > > > > > > > > > particular thread will not report quiescent state. The
> > > > > > > > > > cryptodev library might not have
> > > > > > > > that information.
> > > > > > > > > > >
> > > > > > > > > > > If you want to go ahead with this design, you can
> > > > > > > > > > > still use RCU with single thread configuration (like
> > > > > > > > > > > you have mentioned
> > > > > > > > > > > below) and hide the
> > > > > > > > > > details from the application.
> > > > > > > > > >
> > > > > > > > > > Yes,  same thought here -  use rcu_qsbr online/offline
> > > > > > > > > > for DP part and hide actual sync details inside callback code.
> > > > > > > > > We can give it a try. If we can have the performance
> > > > > > > > > numbers, we can decide about how to control how often
> > > > > > > > > these APIs are called on the data
> > > > > > > > plane.
> > > > > > > >
> > > > > > > > To avoid misunderstanding: I am talking about calling
> > > > > > > > online/offline with every
> > > > > > > > cryptodev_enqueue() traversal over CB list.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > >
> > > > > > > > > > > > >
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > > That seems quite a big change and I
> > > > > > > > > > > > > > > > > > don't think it is acceptable for most users.
> > > > > > > > > > > > > > > > > > From my perspective adding/installing
> > > > > > > > > > > > > > > > > > call-backs to the dev has to be opaque
> > > > > > > > > > > > > > > > > > to the data-
> > > > plane code.
> > > > > > > > > > > > > > > > > > Also note that different callbacks can
> > > > > > > > > > > > > > > > > > be installed by different entities
> > > > > > > > > > > > > > > > > > (libs) and might have no idea about
> > > > > > > > > > > > > > > > > > each
> > > > > > > > > > other.
> > > > > > > > > > > > > > > > > > That's why I thought it would be
> > > > > > > > > > > > > > > > > > better to make all this RCU stuff internal inside
> cryptodev:
> > > > > > > > > > > > > > > > > >     hide all this rcu_qsbr
> > > > > > > > > > > > > > > > > > allocation/setup inside cryptod
> > > > > > > > > > > > > > > > > > somehow to
> > > > > > > > > > > > > > > > obtain pointer to that rcu_qsbr ev
> > > > > > > > > > > > > > > > init/queue setup
> > > > > > > > > > > > > > > > > >     invoke
> > > > > > > > > > > > > > > > > > rcu_qsbr_online()/rcu_qsbr_offline()
> > > > > > > > > > > > > > > > > > inside
> > > > > > > > > > > > > > > > cryptodev_enqueue().
> > > > > > > > > > > > > This will bring in the application related
> > > > > > > > > > > > > information such as the thread ID
> > > > > > > > > > > > into the library.
> > > > > > > > > > > >
> > > > > > > > > > > > I don't think it would.
> > > > > > > > > > > > Cryptodev enqueue/dequeue functions are not
> > > > > > > > > > > > supposed to be thread safe (same as rx/tx burst).
> > > > > > > > > > > > So we can always use RCU with just one thread(thread_id =
> 0).
> > > > > > > > > > > Agree, the memory that needs to be freed is accessed
> > > > > > > > > > > by a single thread
> > > > > > > > > > on the data plane. RCU with one thread would suffice.
> > > > > > > > > > >
> > > > > > > > > > > > But as I said above - if you feel RCU lib is an
> > > > > > > > > > > > overhead here, that's fine - I think it would be
> > > > > > > > > > > > easy enough to do without
> > > > > > librte_rcu.
> > > > > > > > > > > >
> > > > > > > > > > > > > If the same API calls are being made from
> > > > > > > > > > > > > multiple data plane threads, you need a way to
> > > > > > > > > > > > > configure that information to the library. So,
> > > > > > > > > > > > > it is better to leave those details for the
> > > > > > > > > > > > > application to
> > > > > > > > handle.
> > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > I have already tried exploring above stuffs.
> > > > > > > > > > > > > > > > > There are too many
> > > > > > > > > > > > > > constraints.
> > > > > > > > > > > > > > > > > The changes don't fit in, as per RCU design.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > Hmm could you be more specific here - what
> > > > > > > > > > > > > > > > constraints are you referring to?
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > Moreover, having rcu api under
> > > > > > > > > > > > > > > > > enqueue_burst() will affect the
> > > > > > > > > > > > > > > > performance.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > It most likely will. Though my expectation
> > > > > > > > > > > > > > > > it will affect performance only when some
> > > > > > > > > > > > > > > > callbacks are installed. My thought
> > > > > > > > > > > > here:
> > > > > > > > > > > > > > > > callback function by itself will affect
> > > > > > > > > > > > > > > > cryptdev_enqueue performance anyway,
> > > > > > > > > > > > > > > With existing callback design, I have
> > > > > > > > > > > > > > > measured the performance(with
> > > > > > > > > > > > > > crypto perf test) on xeon.
> > > > > > > > > > > > > > > It was almost negligible and same was shared
> > > > > > > > > > > > > > > with
> > Declan.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > I am asking about different thing: did you try
> > > > > > > > > > > > > > alternate approach I described, that wouldn't
> > > > > > > > > > > > > > require changes in the user data-
> > > > > > > > > > plane code.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > That is one of the reasons, I didn't want to
> > > > > > > > > > > > > > > add to many stuffs in to the
> > > > > > > > > > > > > > callback.
> > > > > > > > > > > > > > > The best part of existing design is crypto
> > > > > > > > > > > > > > > lib is not much
> > > > > > modified.
> > > > > > > > > > > > > > > The changes are either pushed to CP or DP.
> > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > so adding extra overhead for sync is probably ok
> here.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > I think that extra overhead when callbacks are
> > > > > > > > > > > > > > present is expected and probably acceptable.
> > > > > > > > > > > > > > Changes in the upper-layer data-plane code - probably
> not.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > Though for situation when no callbacks are
> > > > > > > > > > > > > > > > installed
> > > > > > > > > > > > > > > > - perfomance should be left unaffected (or
> > > > > > > > > > > > > > > > impact should be as small
> > > > > > > > > > > > as possible).
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > The changes are more on control plane
> > > > > > > > > > > > > > > > > side, which is one
> > > > > > > > time.
> > > > > > > > > > > > > > > > > The data plane changes are minimal.
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > I still think upper layer data-plane code
> > > > > > > > > > > > > > > > should stay unaffected (zero changes).
> > > > > > > > > > > > > > > >
> > > > > > > > > > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > <snip>

Patch
diff mbox series

diff --git a/config/common_base b/config/common_base
index fbf0ee7..f5ebde4 100644
--- a/config/common_base
+++ b/config/common_base
@@ -599,6 +599,7 @@  CONFIG_RTE_LIBRTE_PMD_BBDEV_FPGA_5GNR_FEC=y
 #
 CONFIG_RTE_LIBRTE_CRYPTODEV=y
 CONFIG_RTE_CRYPTO_MAX_DEVS=64
+CONFIG_RTE_CRYPTODEV_CALLBACKS=y
 
 #
 # Compile PMD for ARMv8 Crypto device
diff --git a/lib/librte_cryptodev/Makefile b/lib/librte_cryptodev/Makefile
index 73e77a2..514d552 100644
--- a/lib/librte_cryptodev/Makefile
+++ b/lib/librte_cryptodev/Makefile
@@ -10,7 +10,7 @@  LIB = librte_cryptodev.a
 CFLAGS += -O3
 CFLAGS += $(WERROR_FLAGS)
 LDLIBS += -lrte_eal -lrte_mempool -lrte_ring -lrte_mbuf
-LDLIBS += -lrte_kvargs
+LDLIBS += -lrte_kvargs -lrte_rcu
 
 # library source files
 SRCS-y += rte_cryptodev.c rte_cryptodev_pmd.c cryptodev_trace_points.c
diff --git a/lib/librte_cryptodev/rte_cryptodev.c b/lib/librte_cryptodev/rte_cryptodev.c
index 1dd795b..2fb3e35 100644
--- a/lib/librte_cryptodev/rte_cryptodev.c
+++ b/lib/librte_cryptodev/rte_cryptodev.c
@@ -38,6 +38,7 @@ 
 #include <rte_string_fns.h>
 #include <rte_compat.h>
 #include <rte_function_versioning.h>
+#include <rte_rcu_qsbr.h>
 
 #include "rte_crypto.h"
 #include "rte_cryptodev.h"
@@ -499,6 +500,10 @@  struct rte_cryptodev_sym_session_pool_private_data {
 	return 0;
 }
 
+#ifdef RTE_CRYPTODEV_CALLBACKS
+/* spinlock for crypto device enq callbacks */
+static rte_spinlock_t rte_cryptodev_enq_cb_lock = RTE_SPINLOCK_INITIALIZER;
+#endif
 
 const char *
 rte_cryptodev_get_feature_name(uint64_t flag)
@@ -1449,6 +1454,158 @@  struct rte_cryptodev *
 	rte_spinlock_unlock(&rte_cryptodev_cb_lock);
 }
 
+#ifdef RTE_CRYPTODEV_CALLBACKS
+int
+rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct rte_rcu_qsbr *qsbr)
+{
+
+	struct rte_cryptodev *dev;
+
+	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
+		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
+		return -EINVAL;
+	}
+
+	dev = &rte_crypto_devices[dev_id];
+	dev->qsbr = qsbr;
+	return 0;
+}
+
+struct rte_cryptodev_enq_callback *
+rte_cryptodev_add_enq_callback(uint8_t dev_id,
+			       uint16_t qp_id,
+			       rte_cryptodev_enq_cb_fn cb_fn,
+			       void *cb_arg)
+{
+	struct rte_cryptodev *dev;
+	struct rte_cryptodev_enq_callback *cb, *tail;
+
+	if (!cb_fn)
+		return NULL;
+
+	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
+		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
+		return NULL;
+	}
+
+	dev = &rte_crypto_devices[dev_id];
+	if (qp_id >= dev->data->nb_queue_pairs) {
+		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
+		return NULL;
+	}
+
+	cb = rte_zmalloc(NULL, sizeof(*cb), 0);
+	if (cb == NULL) {
+		CDEV_LOG_ERR("Failed to allocate memory for callback on "
+			     "dev=%d, queue_pair_id=%d", dev_id, qp_id);
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+
+	cb->fn = cb_fn;
+	cb->arg = cb_arg;
+
+	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
+	if (dev->enq_cbs == NULL) {
+		dev->enq_cbs = rte_zmalloc(NULL, sizeof(cb) *
+					   dev->data->nb_queue_pairs, 0);
+		if (dev->enq_cbs == NULL) {
+			CDEV_LOG_ERR("Failed to allocate memory for callbacks");
+			rte_errno = ENOMEM;
+			rte_free(cb);
+			return NULL;
+		}
+	}
+
+	/* Add the callbacks in fifo order. */
+	tail = dev->enq_cbs[qp_id];
+	if (tail) {
+		while (tail->next)
+			tail = tail->next;
+		tail->next = cb;
+	} else
+		dev->enq_cbs[qp_id] = cb;
+
+	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
+
+	return cb;
+}
+
+int
+rte_cryptodev_remove_enq_callback(uint8_t dev_id,
+				  uint16_t qp_id,
+				  struct rte_cryptodev_enq_callback *cb)
+{
+	struct rte_cryptodev *dev;
+	struct rte_cryptodev_enq_callback **prev_cb, *curr_cb;
+	uint16_t qp;
+	int free_mem;
+	int ret;
+
+	free_mem = 1;
+
+	if (!cb) {
+		CDEV_LOG_ERR("cb is NULL");
+		return -EINVAL;
+	}
+
+	if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) {
+		CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id);
+		return -EINVAL;
+	}
+
+	dev = &rte_crypto_devices[dev_id];
+	if (qp_id >= dev->data->nb_queue_pairs) {
+		CDEV_LOG_ERR("Invalid queue_pair_id=%d", qp_id);
+		return -EINVAL;
+	}
+
+	if (!dev->qsbr) {
+		CDEV_LOG_ERR("Rcu qsbr is NULL");
+		return -EINVAL;
+	}
+
+	rte_spinlock_lock(&rte_cryptodev_enq_cb_lock);
+	if (dev->enq_cbs == NULL) {
+		rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
+		return -EINVAL;
+	}
+
+	prev_cb = &dev->enq_cbs[qp_id];
+	for (; *prev_cb != NULL; prev_cb = &curr_cb->next) {
+		curr_cb = *prev_cb;
+		if (curr_cb == cb) {
+			/* Remove the user cb from the callback list. */
+			*prev_cb = curr_cb->next;
+			ret = 0;
+			break;
+		}
+	}
+
+	if (!ret) {
+		/* Call sync with invalid thread id as this is part of
+		 * control plane API */
+		rte_rcu_qsbr_synchronize(dev->qsbr,
+					 RTE_QSBR_THRID_INVALID);
+		rte_free(cb);
+	}
+
+	for (qp = 0; qp < dev->data->nb_queue_pairs; qp++)
+		if (dev->enq_cbs[qp] != NULL) {
+			free_mem = 0;
+			break;
+		}
+
+	if (free_mem) {
+		rte_free(dev->enq_cbs);
+		dev->enq_cbs = NULL;
+	}
+
+	rte_spinlock_unlock(&rte_cryptodev_enq_cb_lock);
+
+	return ret;
+}
+#endif
 
 int
 rte_cryptodev_sym_session_init(uint8_t dev_id,
diff --git a/lib/librte_cryptodev/rte_cryptodev.h b/lib/librte_cryptodev/rte_cryptodev.h
index 7b3ebc2..2c7a47b 100644
--- a/lib/librte_cryptodev/rte_cryptodev.h
+++ b/lib/librte_cryptodev/rte_cryptodev.h
@@ -530,6 +530,32 @@  struct rte_cryptodev_qp_conf {
 };
 
 /**
+ * Function type used for pre processing crypto ops when enqueue burst is
+ * called.
+ *
+ * The callback function is called on enqueue burst immediately
+ * before the crypto ops are put onto the hardware queue for processing.
+ *
+ * @param	dev_id		The identifier of the device.
+ * @param	qp_id		The index of the queue pair in which ops are
+ *				to be enqueued for processing. The value
+ *				must be in the range [0, nb_queue_pairs - 1]
+ *				previously supplied to
+ *				*rte_cryptodev_configure*.
+ * @param	ops		The address of an array of *nb_ops* pointers
+ *				to *rte_crypto_op* structures which contain
+ *				the crypto operations to be processed.
+ * @param	nb_ops		The number of operations to process.
+ * @param	user_param	The arbitrary user parameter passed in by the
+ *				application when the callback was originally
+ *				registered.
+ * @return			The number of ops to be enqueued to the
+ *				crypto device.
+ */
+typedef uint16_t (*rte_cryptodev_enq_cb_fn)(uint16_t dev_id, uint16_t qp_id,
+		struct rte_crypto_op **ops, uint16_t nb_ops, void *user_param);
+
+/**
  * Typedef for application callback function to be registered by application
  * software for notification of device events
  *
@@ -853,7 +879,6 @@  struct rte_cryptodev_config {
 		enum rte_cryptodev_event_type event,
 		rte_cryptodev_cb_fn cb_fn, void *cb_arg);
 
-
 typedef uint16_t (*dequeue_pkt_burst_t)(void *qp,
 		struct rte_crypto_op **ops,	uint16_t nb_ops);
 /**< Dequeue processed packets from queue pair of a device. */
@@ -870,6 +895,17 @@  typedef uint16_t (*enqueue_pkt_burst_t)(void *qp,
 /** Structure to keep track of registered callbacks */
 TAILQ_HEAD(rte_cryptodev_cb_list, rte_cryptodev_callback);
 
+/**
+ * @internal
+ * Structure used to hold information about the callbacks to be called for a
+ * queue pair on enqueue.
+ */
+struct rte_cryptodev_enq_callback {
+	struct rte_cryptodev_enq_callback *next;
+	rte_cryptodev_enq_cb_fn fn;
+	void *arg;
+};
+
 /** The data structure associated with each crypto device. */
 struct rte_cryptodev {
 	dequeue_pkt_burst_t dequeue_burst;
@@ -898,6 +934,14 @@  struct rte_cryptodev {
 	__extension__
 	uint8_t attached : 1;
 	/**< Flag indicating the device is attached */
+
+#ifdef RTE_CRYPTODEV_CALLBACKS
+	struct rte_cryptodev_enq_callback **enq_cbs;
+	/**< User application callback for pre enqueue processing */
+
+	struct rte_rcu_qsbr *qsbr;
+	/** < RCU QSBR variable for rte_cryptodev_enq_callback */
+#endif
 } __rte_cache_aligned;
 
 void *
@@ -1019,6 +1063,18 @@  struct rte_cryptodev_data {
 		struct rte_crypto_op **ops, uint16_t nb_ops)
 {
 	struct rte_cryptodev *dev = &rte_cryptodevs[dev_id];
+#ifdef RTE_CRYPTODEV_CALLBACKS
+	if (unlikely(dev->enq_cbs != NULL && dev->enq_cbs[qp_id] != NULL)) {
+		struct rte_cryptodev_enq_callback *cb =
+			dev->enq_cbs[qp_id];
+
+		do {
+			nb_ops = cb->fn(dev_id, qp_id, ops, nb_ops,
+					cb->arg);
+			cb = cb->next;
+		} while (cb != NULL);
+	}
+#endif
 
 	rte_cryptodev_trace_enqueue_burst(dev_id, qp_id, (void **)ops, nb_ops);
 	return (*dev->enqueue_burst)(
@@ -1351,6 +1407,102 @@  struct rte_cryptodev_asym_session *
 	struct rte_cryptodev_sym_session *sess, union rte_crypto_sym_ofs ofs,
 	struct rte_crypto_sym_vec *vec);
 
+#ifdef RTE_CRYPTODEV_CALLBACKS
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Add a user callback for a given crypto device and queue pair which will be
+ * called on crypto ops enqueue.
+ *
+ * This API configures a function to be called for each burst of crypto ops
+ * received on a given crypto device queue pair. The return value is a pointer
+ * that can be used later to remove the callback using
+ * rte_cryptodev_remove_enq_callback().
+ *
+ * Multiple functions are called in the order that they are added.
+ *
+ * @param	dev_id		The identifier of the device.
+ * @param	qp_id		The index of the queue pair in which ops are
+ *				to be enqueued for processing. The value
+ *				must be in the range [0, nb_queue_pairs - 1]
+ *				previously supplied to
+ *				*rte_cryptodev_configure*.
+ * @param	cb_fn		The callback function
+ * @param	cb_arg		A generic pointer parameter which will be passed
+ *				to each invocation of the callback function on
+ *				this crypto device and queue pair.
+ *
+ * @return
+ *   NULL on error.
+ *   On success, a pointer value which can later be used to remove the callback.
+ */
+
+__rte_experimental
+struct rte_cryptodev_enq_callback *
+rte_cryptodev_add_enq_callback(uint8_t dev_id,
+			       uint16_t qp_id,
+			       rte_cryptodev_enq_cb_fn cb_fn,
+			       void *cb_arg);
+
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Remove a user callback function for given crypto device and queue pair.
+ *
+ * This function is used to removed callbacks that were added to a crypto
+ * device queue pair using rte_cryptodev_add_enq_callback().
+ *
+ * Note: The callback expects a RCU QSBR to be configured to synchronize
+ * to free the memory. Application is expected to configure RCU QSBR after
+ * adding an enqueue callback.
+ *
+ *
+ * @param	dev_id		The identifier of the device.
+ * @param	qp_id		The index of the queue pair in which ops are
+ *				to be enqueued for processing. The value
+ *				must be in the range [0, nb_queue_pairs - 1]
+ *				previously supplied to
+ *				*rte_cryptodev_configure*.
+ * @param	cb		Pointer to user supplied callback created via
+ *				rte_cryptodev_add_enq_callback().
+ *
+ * @return
+ *   - 0: Success. Callback was removed.
+ *   - -EINVAL:  The dev_id or the qp_id is out of range, or the callback
+ *               is NULL or not found for the crypto device queue pair.
+ */
+
+__rte_experimental
+int rte_cryptodev_remove_enq_callback(uint8_t dev_id,
+				  uint16_t qp_id,
+				  struct rte_cryptodev_enq_callback *cb);
+
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Associate RCU QSBR variable with a cryptodev.
+ *
+ * This function is used to add RCU QSBR to a crypto device.
+ * The purpose of RCU is to help multiple threads to synchronize
+ * with each other before initiating adding/removing callback
+ * while dataplane threads are running enqueue callbacks.
+ *
+ * @param	dev_id		The identifier of the device.
+ * @param	qsr		RCU QSBR configuration
+ * @return
+ *   On success - 0
+ *   On error - EINVAL.
+ */
+
+__rte_experimental
+int rte_cryptodev_rcu_qsbr_add(uint8_t dev_id, struct rte_rcu_qsbr *qsbr);
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_cryptodev/rte_cryptodev_version.map b/lib/librte_cryptodev/rte_cryptodev_version.map
index 02f6dcf..46de3ca 100644
--- a/lib/librte_cryptodev/rte_cryptodev_version.map
+++ b/lib/librte_cryptodev/rte_cryptodev_version.map
@@ -64,6 +64,7 @@  DPDK_20.0 {
 	rte_cryptodev_sym_capability_get;
 };
 
+
 EXPERIMENTAL {
 	global:
 
@@ -105,4 +106,9 @@  EXPERIMENTAL {
 
 	# added in 20.08
 	rte_cryptodev_get_qp_status;
+
+	# added in 20.11
+	rte_cryptodev_add_enq_callback;
+	rte_cryptodev_remove_enq_callback;
+	rte_cryptodev_rcu_qsbr_add;
 };