[v4,1/4] cryptodev: add symmetric crypto data-path APIs

Message ID 20200703124942.29171-2-roy.fan.zhang@intel.com (mailing list archive)
State Changes Requested, archived
Delegated to: akhil goyal
Series cryptodev: add symmetric crypto data-path APIs

Checks

Context               Check    Description
ci/checkpatch         warning  coding style issues
ci/Intel-compilation  fail     apply issues

Commit Message

Fan Zhang July 3, 2020, 12:49 p.m. UTC
  This patch adds data-path APIs to cryptodev. The APIs are organized as
a data structure containing function pointers for different enqueue and
dequeue operations. A public API is added to obtain these function
pointers and the necessary queue pair data from the device queue pair.

This patch depends on patch-72157 ("cryptodev: add function to check
if qp was setup")

Signed-off-by: Fan Zhang <roy.fan.zhang@intel.com>
Signed-off-by: Piotr Bronowski <piotrx.bronowski@intel.com>
---
 lib/librte_cryptodev/rte_crypto_sym.h         |  48 +++++
 lib/librte_cryptodev/rte_cryptodev.c          |  22 +++
 lib/librte_cryptodev/rte_cryptodev.h          | 173 +++++++++++++++++-
 lib/librte_cryptodev/rte_cryptodev_pmd.h      |  12 +-
 .../rte_cryptodev_version.map                 |   4 +
 5 files changed, 255 insertions(+), 4 deletions(-)
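
For orientation, a minimal usage sketch of the proposed API (built only from
the definitions in the patch below; error handling is trimmed, and the
device, queue pair and AEAD session setup are assumed done elsewhere):

#include <rte_cryptodev.h>

/* Enqueue one AEAD job and poll for its completion. */
static int
hw_dp_aead_once(uint8_t dev_id, uint16_t qp_id,
		struct rte_cryptodev_sym_session *sess,
		struct rte_crypto_sym_job *job, void *user_ctx)
{
	struct rte_crypto_hw_ops hw_ops;
	uint64_t drv_data = 0;
	int status = 0;
	void *opaque;

	if (rte_cryptodev_sym_get_hw_ops(dev_id, qp_id, &hw_ops) < 0)
		return -1;

	/* A single-job burst: the job is both first and last, and the
	 * driver is asked to record user_ctx in the descriptor.
	 */
	if (hw_ops.enqueue_aead(hw_ops.qp, sess, job, user_ctx, &drv_data,
			RTE_CRYPTO_HW_ENQ_FLAG_START |
			RTE_CRYPTO_HW_ENQ_FLAG_SET_OPAQUE |
			RTE_CRYPTO_HW_ENQ_FLAG_END) < 0)
		return -1;

	/* Poll until the device reports the job done, then dequeue it. */
	while (hw_ops.query_processed(hw_ops.qp, 1) == 0)
		;
	drv_data = 0;
	opaque = hw_ops.dequeue_one(hw_ops.qp, &drv_data,
			RTE_CRYPTO_HW_DEQ_FLAG_START, &status);
	return (opaque == user_ctx && status == 1) ? 0 : -1;
}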
  

Comments

Akhil Goyal July 4, 2020, 6:16 p.m. UTC | #1
Hi Fan,

> +
> +/**
> + * Asynchronous operation job descriptor.
> + * Used by HW crypto devices' direct API calls that support such activity
> + **/
> +struct rte_crypto_sym_job {
> +	union {
> +		/**
> +		 * When RTE_CRYPTO_HW_ENQ_FLAG_IS_SGL bit is set in flags,
> +		 * sgl field is used as input data. Otherwise data_iova is
> +		 * used.
> +		 **/
> +		rte_iova_t data_iova;
> +		struct rte_crypto_sgl *sgl;
> +	};
> +	union {
> +		/**
> +		 * Unlike cryptodev ops, all ofs and len fields are in
> +		 * units of bytes (including Snow3G/Kasumi/Zuc).
> +		 **/
> +		struct {
> +			uint32_t cipher_ofs;
> +			uint32_t cipher_len;
> +		} cipher_only;
> +		struct {
> +			uint32_t auth_ofs;
> +			uint32_t auth_len;
> +			rte_iova_t digest_iova;
> +		} auth_only;
> +		struct {
> +			uint32_t aead_ofs;
> +			uint32_t aead_len;
> +			rte_iova_t tag_iova;
> +			uint8_t *aad;
> +			rte_iova_t aad_iova;
> +		} aead;
> +		struct {
> +			uint32_t cipher_ofs;
> +			uint32_t cipher_len;
> +			uint32_t auth_ofs;
> +			uint32_t auth_len;
> +			rte_iova_t digest_iova;
> +		} chain;
> +	};
> +	uint8_t *iv;
> +	rte_iova_t iv_iova;
> +};

NACK.
Why do you need these structure definitions again when you have similar ones
(the ones used in CPU crypto) available for the same purpose?
In the case of CPU crypto, there were two main requirements:
- a synchronous API instead of enqueue + dequeue
- raw buffers.

Now for this patchset, the requirements are:
- raw buffers
- asynchronous APIs

The data structures for raw buffers and crypto-related offsets are already
defined, so they should be reused.
And I believe that with some changes in rte_crypto_op and rte_crypto_sym_op,
we can support raw buffers with the same APIs.
Instead of m_src and m_dst, raw buffer data structures can be combined in a
union, and some of the fields in rte_crypto_op can be left NULL in the case
of raw buffers.
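
As a rough sketch of that suggestion (the names below are made up for
illustration, not an actual proposal):

/* Hypothetical layout: rte_crypto_sym_op keeps its fields, but the mbuf
 * pointers become unions with raw-buffer descriptors, so the existing
 * enqueue/dequeue APIs could carry either.
 */
struct sym_op_raw_sketch {
	union {
		struct rte_mbuf *m_src;		/* existing mbuf input */
		struct rte_crypto_sgl *raw_src;	/* raw-buffer input */
	};
	union {
		struct rte_mbuf *m_dst;		/* existing mbuf output */
		struct rte_crypto_sgl *raw_dst;	/* raw-buffer output */
	};
	/* ... remaining rte_crypto_sym_op fields unchanged ... */
};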


> +/* Struct for user to perform HW specific enqueue/dequeue function calls */
> +struct rte_crypto_hw_ops {
> +	/* Driver written queue pair data pointer, should NOT be altered by
> +	 * the user.
> +	 */
> +	void *qp;
> +	/* Function handler to enqueue AEAD job */
> +	rte_crypto_hw_enq_cb_fn enqueue_aead;
> +	/* Function handler to enqueue cipher only job */
> +	rte_crypto_hw_enq_cb_fn enqueue_cipher;
> +	/* Function handler to enqueue auth only job */
> +	rte_crypto_hw_enq_cb_fn enqueue_auth;
> +	/* Function handler to enqueue cipher + hash chaining job */
> +	rte_crypto_hw_enq_cb_fn enqueue_chain;
> +	/* Function handler to query processed jobs */
> +	rte_crypto_hw_query_processed query_processed;
> +	/* Function handler to dequeue one job and return opaque data stored */
> +	rte_crypto_hw_deq_one_cb_fn dequeue_one;
> +	/* Function handler to dequeue many jobs */
> +	rte_crypto_hw_deq_many_cb_fn dequeue_many;
> +	/* Reserved */
> +	void *reserved[8];
> +};

Why do we need such callbacks in the library?
These should be inside the drivers, or else we should do the same for the
legacy case as well. The pain of finding the correct enqueue function for
the appropriate crypto operation is already handled by all the drivers,
and we can reuse that, or else we modify it there as well.

We should not add a lot of data paths for the user, otherwise the
APIs will become centric to a particular vendor, and it will be very
difficult for the user to migrate from one vendor to another, which would
defeat the purpose of DPDK, which provides a uniform abstraction layer for
all the hardware vendors.

Adding other vendors to comment.

Regards,
Akhil
  
Fan Zhang July 6, 2020, 10:02 a.m. UTC | #2
Hi Akhil,

> -----Original Message-----
> From: Akhil Goyal <akhil.goyal@nxp.com>
> Sent: Saturday, July 4, 2020 7:16 PM
> To: Zhang, Roy Fan <roy.fan.zhang@intel.com>; dev@dpdk.org;
> anoobj@marvell.com; asomalap@amd.com; ruifeng.wang@arm.com;
> Nagadheeraj Rottela <rnagadheeraj@marvell.com>; Michael Shamis
> <michaelsh@marvell.com>; Ankur Dwivedi <adwivedi@marvell.com>; Jay
> Zhou <jianjay.zhou@huawei.com>; De Lara Guarch, Pablo
> <pablo.de.lara.guarch@intel.com>
> Cc: Trahe, Fiona <fiona.trahe@intel.com>; Bronowski, PiotrX
> <piotrx.bronowski@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; Thomas Monjalon
> <thomas@monjalon.net>
> Subject: RE: [dpdk-dev v4 1/4] cryptodev: add symmetric crypto data-path
> APIs
> 
> Hi Fan,
> 
> > +
> > +/**
> > + * Asynchronous operation job descriptor.
> > + * Used by HW crypto devices' direct API calls that support such activity
> > + **/
> > +struct rte_crypto_sym_job {
> > +	union {
> > +		/**
> > +		 * When RTE_CRYPTO_HW_ENQ_FLAG_IS_SGL bit is set in
> flags,
> > sgl
> > +		 * field is used as input data. Otherwise data_iova is
> > +		 * used.
> > +		 **/
> > +		rte_iova_t data_iova;
> > +		struct rte_crypto_sgl *sgl;
> > +	};
> > +	union {
> > +		/**
> > +		 * Unlike cryptodev ops, all ofs and len fields are in
> > +		 * units of bytes (including Snow3G/Kasumi/Zuc).
> > +		 **/
> > +		struct {
> > +			uint32_t cipher_ofs;
> > +			uint32_t cipher_len;
> > +		} cipher_only;
> > +		struct {
> > +			uint32_t auth_ofs;
> > +			uint32_t auth_len;
> > +			rte_iova_t digest_iova;
> > +		} auth_only;
> > +		struct {
> > +			uint32_t aead_ofs;
> > +			uint32_t aead_len;
> > +			rte_iova_t tag_iova;
> > +			uint8_t *aad;
> > +			rte_iova_t aad_iova;
> > +		} aead;
> > +		struct {
> > +			uint32_t cipher_ofs;
> > +			uint32_t cipher_len;
> > +			uint32_t auth_ofs;
> > +			uint32_t auth_len;
> > +			rte_iova_t digest_iova;
> > +		} chain;
> > +	};
> > +	uint8_t *iv;
> > +	rte_iova_t iv_iova;
> > +};
> 
> NACK,
> Why do you need this structure definitions again when you have similar ones
> (the ones used in CPU crypto) available for the same purpose.
> In case of CPU crypto, there were 2 main requirements
> - synchronous API instead of enq +deq
> - raw buffers.
> 

As you may have seen, the structure definition is HW-centric, with the
IOVA addresses all over. Also, as you will see from the patch series, the
operation is on a per-operation basis instead of operating in a burst. The
external application can know sooner when a specific enqueue has failed.

> Now for this patchset, the requirement is
> - raw buffers
> - asynchronous APIs
> 
> The data structure for raw buffers and crypto related offsets are already
> defined
> So they should be reused.
> And I believe with some changes in rte_crypto_op  and rte_crypto_sym_op,
> We can support raw buffers with the same APIs.
> Instead of m_src and m_dst, raw buffer data structures can be combined in a
> Union and some of the fields in the rte_crypto_op can be left NULL in case of
> raw buffers.
> 

This is a good point, but we would still face too many unnecessary fields to
be left NULL, such as the digest pointers. I have given a lot of thought to
this structure. Hopefully it covers all vendors' HW symmetric crypto needs,
and at the same time squeezes the required HW addresses into one cacheline,
instead of the three-cacheline footprint of rte_crypto_op +
rte_crypto_sym_op. Another purpose of the structure design is that the
structure buffer can be taken from the stack and reused to fill all jobs to
the PMD HW.
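
As a sketch of what that allows (illustration only; the hw_ops and session
setup, the SET_OPAQUE convention and the application-side arrays are all
assumed):

/* One stack-allocated rte_crypto_sym_job is reused for a whole burst --
 * no per-operation mempool alloc/free.
 */
static void
enqueue_aead_burst(struct rte_crypto_hw_ops *hw_ops,
		struct rte_cryptodev_sym_session *sess,
		rte_iova_t *src_iova, uint32_t *src_len,
		rte_iova_t *tag_iova, uint8_t **aad, rte_iova_t *aad_iova,
		uint8_t **iv, rte_iova_t *iv_iova,
		void **user_ctx, uint32_t n_jobs)
{
	struct rte_crypto_sym_job job;	/* taken from the stack */
	uint64_t drv_data = 0;
	uint32_t i;

	for (i = 0; i < n_jobs; i++) {
		job.data_iova = src_iova[i];
		job.aead.aead_ofs = 0;
		job.aead.aead_len = src_len[i];
		job.aead.tag_iova = tag_iova[i];
		job.aead.aad = aad[i];
		job.aead.aad_iova = aad_iova[i];
		job.iv = iv[i];
		job.iv_iova = iv_iova[i];

		/* Error handling trimmed for brevity. */
		hw_ops->enqueue_aead(hw_ops->qp, sess, &job, user_ctx[i],
			&drv_data,
			RTE_CRYPTO_HW_ENQ_FLAG_SET_OPAQUE |
			(i == 0 ? RTE_CRYPTO_HW_ENQ_FLAG_START : 0) |
			(i == n_jobs - 1 ? RTE_CRYPTO_HW_ENQ_FLAG_END : 0));
	}
}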

> 
> > +/* Struct for user to perform HW specific enqueue/dequeue function calls
> */
> > +struct rte_crypto_hw_ops {
> > +	/* Driver written queue pair data pointer, should NOT be altered by
> > +	 * the user.
> > +	 */
> > +	void *qp;
> > +	/* Function handler to enqueue AEAD job */
> > +	rte_crypto_hw_enq_cb_fn enqueue_aead;
> > +	/* Function handler to enqueue cipher only job */
> > +	rte_crypto_hw_enq_cb_fn enqueue_cipher;
> > +	/* Function handler to enqueue auth only job */
> > +	rte_crypto_hw_enq_cb_fn enqueue_auth;
> > +	/* Function handler to enqueue cipher + hash chaining job */
> > +	rte_crypto_hw_enq_cb_fn enqueue_chain;
> > +	/* Function handler to query processed jobs */
> > +	rte_crypto_hw_query_processed query_processed;
> > +	/* Function handler to dequeue one job and return opaque data
> stored
> > */
> > +	rte_crypto_hw_deq_one_cb_fn dequeue_one;
> > +	/* Function handler to dequeue many jobs */
> > +	rte_crypto_hw_deq_many_cb_fn dequeue_many;
> > +	/* Reserved */
> > +	void *reserved[8];
> > +};
> 
> Why do we need such callbacks in the library?
> These should be inside the drivers, or else we do the same for
> Legacy case as well. The pain of finding the correct enq function for
> Appropriate crypto operation is already handled by all the drivers
> And we can reuse that or else we modify it there as well.
> 

Providing different types of enqueue functions for specific operation types
could save a lot of branches for the driver to handle. As mentioned, this
data-path API is intended to be used as an advanced feature to provide
close-to-native performance to external libraries/applications that are not
mbuf centric. And I don't agree with classifying choosing one enqueue
function from four candidates as "pain".
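
For instance, the selection can be done once at session setup, leaving the
per-packet path a plain indirect call (a sketch; the xform inspection is
simplified):

static rte_crypto_hw_enq_cb_fn
select_enqueue(const struct rte_crypto_hw_ops *hw_ops,
		const struct rte_crypto_sym_xform *xform)
{
	/* Resolve the operation type once, outside the hot loop. */
	if (xform->type == RTE_CRYPTO_SYM_XFORM_AEAD)
		return hw_ops->enqueue_aead;
	if (xform->next != NULL)	/* cipher + auth chain */
		return hw_ops->enqueue_chain;
	if (xform->type == RTE_CRYPTO_SYM_XFORM_CIPHER)
		return hw_ops->enqueue_cipher;
	return hw_ops->enqueue_auth;
}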

> We should not add a lot of data paths for the user, otherwise the
> APIs will become centric to a particular vendor and it will be very difficult
> For the user to migrate from one vendor to another and would defeat the
> Purpose of DPDK which provide uniform abstraction layer for all the
> hardware
> Vendors.
> 

The purpose of adding this data-path for the user is performance for
applications/libraries whose data paths are not mbuf centric, while not
creating confusion. In this version we aim to provide a more friendly
data-path for them, one that aims to be useful to all vendors' PMDs. If
there is any place in the API that blocks a PMD, please let me know.

> Adding other vendors to comment.
> 
> Regards,
> Akhil
  
Akhil Goyal July 6, 2020, 12:13 p.m. UTC | #3
Hi Fan,

> Hi Akhil,
> 
> > > +
> > > +/**
> > > + * Asynchronous operation job descriptor.
> > > + * Used by HW crypto devices' direct API calls that support such activity
> > > + **/
> > > +struct rte_crypto_sym_job {
> > > +	union {
> > > +		/**
> > > +		 * When RTE_CRYPTO_HW_ENQ_FLAG_IS_SGL bit is set in
> > flags,
> > > sgl
> > > +		 * field is used as input data. Otherwise data_iova is
> > > +		 * used.
> > > +		 **/
> > > +		rte_iova_t data_iova;
> > > +		struct rte_crypto_sgl *sgl;
> > > +	};
> > > +	union {
> > > +		/**
> > > +		 * Unlike cryptodev ops, all ofs and len fields are in
> > > +		 * units of bytes (including Snow3G/Kasumi/Zuc).
> > > +		 **/
> > > +		struct {
> > > +			uint32_t cipher_ofs;
> > > +			uint32_t cipher_len;
> > > +		} cipher_only;
> > > +		struct {
> > > +			uint32_t auth_ofs;
> > > +			uint32_t auth_len;
> > > +			rte_iova_t digest_iova;
> > > +		} auth_only;
> > > +		struct {
> > > +			uint32_t aead_ofs;
> > > +			uint32_t aead_len;
> > > +			rte_iova_t tag_iova;
> > > +			uint8_t *aad;
> > > +			rte_iova_t aad_iova;
> > > +		} aead;
> > > +		struct {
> > > +			uint32_t cipher_ofs;
> > > +			uint32_t cipher_len;
> > > +			uint32_t auth_ofs;
> > > +			uint32_t auth_len;
> > > +			rte_iova_t digest_iova;
> > > +		} chain;
> > > +	};
> > > +	uint8_t *iv;
> > > +	rte_iova_t iv_iova;
> > > +};
> >
> > NACK,
> > Why do you need this structure definitions again when you have similar ones
> > (the ones used in CPU crypto) available for the same purpose.
> > In case of CPU crypto, there were 2 main requirements
> > - synchronous API instead of enq +deq
> > - raw buffers.
> >
> 
> As you may have seen the structure definition is hW centric with the
> IOVA addresses all over. Also as you will from the patch series the operation is
> Per operation basis instead of operating in a burst. The external application
> may sooner know when a specific enqueue is failed.

You may also need to save a virtual address as well, as some hardware is
able to convert virtual to physical addresses on its own, giving a
performance improvement.

I do not see an issue in using enqueue burst with burst size = 1, but since
you are doing optimizations: none of the hardware can perform well with
burst = 1; I think it is always greater than 1.
> 
> > Now for this patchset, the requirement is
> > - raw buffers
> > - asynchronous APIs
> >
> > The data structure for raw buffers and crypto related offsets are already
> > defined
> > So they should be reused.
> > And I believe with some changes in rte_crypto_op  and rte_crypto_sym_op,
> > We can support raw buffers with the same APIs.
> > Instead of m_src and m_dst, raw buffer data structures can be combined in a
> > Union and some of the fields in the rte_crypto_op can be left NULL in case of
> > raw buffers.
> >
> 
> This is a good point but we still face too many unnecessary fields to be NULL,
> such as
> digest pointers, I have given a lot thought to this structure. Hopefully it covers
> all vendor's HW symmetric crypto needs and in the same time it well squeeze
> the required HW addresses into 1 cacheline, instead of rte_crypto_op +
> rte_crypto_sym_op 3 cacheline footprint. Another purpose of the structure
> design
> is the structure buffer can be taken from stack and can be used to fill all
> jobs to the PMD HW.

Which fields do you think are not useful and should be set as NULL?
The digest pointers you are setting anyway in the new structure.
Your new struct supports neither session-less operation nor security
sessions. It does not take care of asymmetric crypto.
So whenever a vendor needs to support all of these, we would end up back at
the rte_crypto_op structure.
IMO, you only need to make m_src and m_dst a union with raw input/output
buffers. Everything else will remain relevant.

Have you done some profiling of using rte_crypto_op instead of this new struct?

> 
> >
> > > +/* Struct for user to perform HW specific enqueue/dequeue function calls
> > */
> > > +struct rte_crypto_hw_ops {
> > > +	/* Driver written queue pair data pointer, should NOT be altered by
> > > +	 * the user.
> > > +	 */
> > > +	void *qp;
> > > +	/* Function handler to enqueue AEAD job */
> > > +	rte_crypto_hw_enq_cb_fn enqueue_aead;
> > > +	/* Function handler to enqueue cipher only job */
> > > +	rte_crypto_hw_enq_cb_fn enqueue_cipher;
> > > +	/* Function handler to enqueue auth only job */
> > > +	rte_crypto_hw_enq_cb_fn enqueue_auth;
> > > +	/* Function handler to enqueue cipher + hash chaining job */
> > > +	rte_crypto_hw_enq_cb_fn enqueue_chain;
> > > +	/* Function handler to query processed jobs */
> > > +	rte_crypto_hw_query_processed query_processed;
> > > +	/* Function handler to dequeue one job and return opaque data
> > stored
> > > */
> > > +	rte_crypto_hw_deq_one_cb_fn dequeue_one;
> > > +	/* Function handler to dequeue many jobs */
> > > +	rte_crypto_hw_deq_many_cb_fn dequeue_many;
> > > +	/* Reserved */
> > > +	void *reserved[8];
> > > +};
> >
> > Why do we need such callbacks in the library?
> > These should be inside the drivers, or else we do the same for
> > Legacy case as well. The pain of finding the correct enq function for
> > Appropriate crypto operation is already handled by all the drivers
> > And we can reuse that or else we modify it there as well.
> >
> 
> Providing different types of enqueue functions for specific operation type
> could save a lot of branches for the driver to handle. As mentioned this
> data-path API is intended to be used as an advanced feature to provide
> close-to-native perf to external library/applications that are not mbuf
> centric. And I don't agree classifying choosing 1 enqueue function from
> 4 candidates as "pain".

My point is: why don't we have it in the legacy code path as well?
I think it is useful in both paths. Branching is a pain for the driver.

> 
> > We should not add a lot of data paths for the user, otherwise the
> > APIs will become centric to a particular vendor and it will be very difficult
> > For the user to migrate from one vendor to another and would defeat the
> > Purpose of DPDK which provide uniform abstraction layer for all the
> > hardware
> > Vendors.
> >
> 
> The purpose of adding data-path for the user is performance for non-mbuf
> data-path centric applications/libraries, in the meantime not creating
> confusion. In this version we aim to provide a more friendly data-path for

I do not see the new path as friendly.
Adding a parallel new datapath will create more confusion for the application
developer. It would be convenient if we could use the same path with minimal
changes so that people can migrate easily.

> them, and aims to be useful to all vendor's PMDs. If there is any place in
> the API that blocks a PMD please let me know.

As commented above: session-less operation, rte_security sessions, and
asymmetric crypto are not supported.


> 
> > Adding other vendors to comment.
> >
> > Regards,
> > Akhil
  
Fan Zhang July 7, 2020, 12:37 p.m. UTC | #4
Hi Akhil,

> -----Original Message-----
> From: Akhil Goyal <akhil.goyal@nxp.com>
> Sent: Monday, July 6, 2020 1:13 PM
> To: Zhang, Roy Fan <roy.fan.zhang@intel.com>; dev@dpdk.org;
> anoobj@marvell.com; asomalap@amd.com; ruifeng.wang@arm.com;
> Nagadheeraj Rottela <rnagadheeraj@marvell.com>; Michael Shamis
> <michaelsh@marvell.com>; Ankur Dwivedi <adwivedi@marvell.com>; Jay
> Zhou <jianjay.zhou@huawei.com>; De Lara Guarch, Pablo
> <pablo.de.lara.guarch@intel.com>
> Cc: Trahe, Fiona <fiona.trahe@intel.com>; Bronowski, PiotrX
> <piotrx.bronowski@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; Thomas Monjalon
> <thomas@monjalon.net>
> Subject: RE: [dpdk-dev v4 1/4] cryptodev: add symmetric crypto data-path
> APIs
> 
...
> >
> > As you may have seen the structure definition is hW centric with the
> > IOVA addresses all over. Also as you will from the patch series the
> operation is
> > Per operation basis instead of operating in a burst. The external application
> > may sooner know when a specific enqueue is failed.
> 
> You may also need to save a virtual address as well. As some hardware are
> able to
> Convert virtual to physical addresses on it's own giving a performance
> improvement.
> 
> I do not see an issue in using enqueue burst with burst size=1 , but since you
> are doing
> Optimizations, none of the hardware can perform well with burst = 1, I think
> it is always
> Greater than 1.

Shall I update rte_crypto_sym_vec as follows, so that the two problems can
be resolved?

struct rte_crypto_sym_vec {
	/** array of SGL vectors */
	struct rte_crypto_sgl *sgl;
	union {
		/* Supposed to be used with CPU crypto API call. */
		struct {
			/** array of pointers to IV */
			void **iv;
			/** array of pointers to AAD */
			void **aad;
			/** array of pointers to digest */
			void **digest;
			/**
			 * array of statuses for each operation:
			 *  - 0 on success
			 *  - errno on error
			 */
			int32_t *status;
		};

		/* Supposed to be used with HW crypto API call. */
		struct {
			/** array of pointers to IV */
			struct rte_crypto_vec *iv_hw;
			/** array of pointers to AAD */
			struct rte_crypto_vec *aad_hw;
			/** array of pointers to Digest */
			struct rte_crypto_vec *digest_hw;
		};

	};
	/** number of operations to perform */
	uint32_t num;
};
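
For example, a caller on the HW path might fill it like this (a sketch
against the proposal above only; the enqueue call that would consume it is
still to be defined):

/* Describe a burst of two jobs; each rte_crypto_vec carries the virtual
 * address, IOVA and length together for the IV/AAD/digest.
 */
struct rte_crypto_sgl sgl[2];	/* filled with the data segments */
struct rte_crypto_vec iv[2], aad[2], digest[2];
struct rte_crypto_sym_vec vec = {
	.sgl = sgl,
	.iv_hw = iv,
	.aad_hw = aad,
	.digest_hw = digest,
	.num = 2,
};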

> >
> > > Now for this patchset, the requirement is
> > > - raw buffers
> > > - asynchronous APIs
> > >
> > > The data structure for raw buffers and crypto related offsets are already
> > > defined
> > > So they should be reused.
> > > And I believe with some changes in rte_crypto_op  and
> rte_crypto_sym_op,
> > > We can support raw buffers with the same APIs.
> > > Instead of m_src and m_dst, raw buffer data structures can be combined
> in a
> > > Union and some of the fields in the rte_crypto_op can be left NULL in
> case of
> > > raw buffers.
> > >
> >
> > This is a good point but we still face too many unnecessary fields to be
> NULL,
> > such as
> > digest pointers, I have given a lot thought to this structure. Hopefully it
> covers
> > all vendor's HW symmetric crypto needs and in the same time it well
> squeeze
> > the required HW addresses into 1 cacheline, instead of rte_crypto_op +
> > rte_crypto_sym_op 3 cacheline footprint. Another purpose of the
> structure
> > design
> > is the structure buffer can be taken from stack and can be used to fill all
> > jobs to the PMD HW.
> 
> Which fields you think are not useful and should be set as NULL?
> Digest pointers you are anyways setting in the new structure.
> Your new struct does not support session less as well as security sessions.
> It does not take care of asymmetric crypto.
> So whenever, a vendor need to support all these, we would end up getting
> the rte_crypto_op structure.
> IMO, you only need to make m_src and m_dst as union to a raw
> input/output
> buffers. Everything else will be relevant.
> 

rte_crypto_op is designed to be allocated from a mempool with the HW
address info contained, so it is possible to deduce the IV and AAD physical
addresses from it. More importantly, rte_crypto_op is designed to be taken
from the heap and freed after dequeue, so it cannot be allocated from the
stack. For this reason I think rte_crypto_sym_vec is a better fit for the
patch, do you agree? (The proposed change is above.)

> Have you done some profiling with using rte_crypto_op instead of this new
> struct?
> 
Yes, the code is actually upstreamed in VPP
(https://gerrit.fd.io/r/c/vpp/+/18036), please try it out. If you have a
look at the enqueue/dequeue functions you should see the struggle we had
translating ops, and creating a second software ring to make sure we only
dequeue a frame of data. Luckily VPP has space to store mbufs, otherwise
the perf would be even worse.

> >
> > >
> > > > +/* Struct for user to perform HW specific enqueue/dequeue function
> calls
> > > */
> > > > +struct rte_crypto_hw_ops {
> > > > +	/* Driver written queue pair data pointer, should NOT be altered by
> > > > +	 * the user.
> > > > +	 */
> > > > +	void *qp;
> > > > +	/* Function handler to enqueue AEAD job */
> > > > +	rte_crypto_hw_enq_cb_fn enqueue_aead;
> > > > +	/* Function handler to enqueue cipher only job */
> > > > +	rte_crypto_hw_enq_cb_fn enqueue_cipher;
> > > > +	/* Function handler to enqueue auth only job */
> > > > +	rte_crypto_hw_enq_cb_fn enqueue_auth;
> > > > +	/* Function handler to enqueue cipher + hash chaining job */
> > > > +	rte_crypto_hw_enq_cb_fn enqueue_chain;
> > > > +	/* Function handler to query processed jobs */
> > > > +	rte_crypto_hw_query_processed query_processed;
> > > > +	/* Function handler to dequeue one job and return opaque data
> > > stored
> > > > */
> > > > +	rte_crypto_hw_deq_one_cb_fn dequeue_one;
> > > > +	/* Function handler to dequeue many jobs */
> > > > +	rte_crypto_hw_deq_many_cb_fn dequeue_many;
> > > > +	/* Reserved */
> > > > +	void *reserved[8];
> > > > +};
> > >
> > > Why do we need such callbacks in the library?
> > > These should be inside the drivers, or else we do the same for
> > > Legacy case as well. The pain of finding the correct enq function for
> > > Appropriate crypto operation is already handled by all the drivers
> > > And we can reuse that or else we modify it there as well.
> > >
> >
> > Providing different types of enqueue functions for specific operation type
> > could save a lot of branches for the driver to handle. As mentioned this
> > data-path API is intended to be used as an advanced feature to provide
> > close-to-native perf to external library/applications that are not mbuf
> > centric. And I don't agree classifying choosing 1 enqueue function from
> > 4 candidates as "pain".
> 
> My point is why don't we have it in the Legacy code path as well?
> I think it is useful in both the paths. Branching is a pain for the driver.
> 

That's a good point :-) We can definitely do something about it in future releases.

> >
> > > We should not add a lot of data paths for the user, otherwise the
> > > APIs will become centric to a particular vendor and it will be very difficult
> > > For the user to migrate from one vendor to another and would defeat
> the
> > > Purpose of DPDK which provide uniform abstraction layer for all the
> > > hardware
> > > Vendors.
> > >
> >
> > The purpose of adding data-path for the user is performance for non-mbuf
> > data-path centric applications/libraries, in the meantime not creating
> > confusion. In this version we aim to provide a more friendly data-path for
> 
> I do not see the new path as friendly.
> Adding a parallel new datapath with create more confusion for the
> application
> developer. It would be convenient, if we can use the same path with minimal
> changes so that people can migrate easily.
> 

We are working on the next version, which is based on rte_crypto_sym_vec
and a single enqueue-then-dequeue API. To be honest, it won't be that
friendly to application developers whose applications are mbuf-based or
already built on top of cryptodev; however, for applications that are not
mbuf-based and have their own data-path structures, it will surely be
friendlier than the existing enqueue and dequeue APIs: no dependency on
mbufs, mbuf and crypto op pools, and no need to consider how to adapt to
different working methods.

> > them, and aims to be useful to all vendor's PMDs. If there is any place in
> > the API that blocks a PMD please let me know.
> 
> As commented above, sessionless, rte_security sessions, asymmetric crypto
> Not supported.
> 
You are right -
Session-less support aims at usability and is not intended to be used in
high-throughput applications.
rte_security is built on top of mbuf and ethdev and is not intended for
"mbuf-independent" applications either.

> 
> >
> > > Adding other vendors to comment.
> > >
> > > Regards,
> > > Akhil

Regards,
Fan
  
Akhil Goyal July 7, 2020, 8:37 p.m. UTC | #5
Hi Fan,
> 
> Hi Akhil,
> 
> ...
> > >
> > > As you may have seen the structure definition is hW centric with the
> > > IOVA addresses all over. Also as you will from the patch series the
> > operation is
> > > Per operation basis instead of operating in a burst. The external application
> > > may sooner know when a specific enqueue is failed.
> >
> > You may also need to save a virtual address as well. As some hardware are
> > able to
> > Convert virtual to physical addresses on it's own giving a performance
> > improvement.
> >
> > I do not see an issue in using enqueue burst with burst size=1 , but since you
> > are doing
> > Optimizations, none of the hardware can perform well with burst = 1, I think
> > it is always
> > Greater than 1.
> 
> Shall I update the rte_crypto_sym_vec as the following - so the 2 problems can
> be
> resolved?
> 
> struct rte_crypto_sym_vec {
> 	/** array of SGL vectors */
> 	struct rte_crypto_sgl *sgl;
> 	union {
> 		/* Supposed to be used with CPU crypto API call. */
> 		struct {
> 			/** array of pointers to IV */
> 			void **iv;
> 			/** array of pointers to AAD */
> 			void **aad;
> 			/** array of pointers to digest */
> 			void **digest;
> 			/**
> 			 * array of statuses for each operation:
> 			 *  - 0 on success
> 			 *  - errno on error
> 			 */
> 			int32_t *status;
> 		};
> 
> 		/* Supposed to be used with HW crypto API call. */
> 		struct {
> 			/** array of pointers to IV */
> 			struct rte_crypto_vec *iv_hw;
> 			/** array of pointers to AAD */
> 			struct rte_crypto_vec *aad_hw;
> 			/** array of pointers to Digest */
> 			struct rte_crypto_vec *digest_hw;
> 		};
> 
> 	};
> 	/** number of operations to perform */
> 	uint32_t num;
> };

Yes, something of that sort can work.

The current use case was also discussed in the CPU crypto mail chain,
about the need for a non-mbuf use case for async enq/deq APIs.
http://patches.dpdk.org/patch/58528/#101995


> 
> > >
> > > > Now for this patchset, the requirement is
> > > > - raw buffers
> > > > - asynchronous APIs
> > > >
> > > > The data structure for raw buffers and crypto related offsets are already
> > > > defined
> > > > So they should be reused.
> > > > And I believe with some changes in rte_crypto_op  and
> > rte_crypto_sym_op,
> > > > We can support raw buffers with the same APIs.
> > > > Instead of m_src and m_dst, raw buffer data structures can be combined
> > in a
> > > > Union and some of the fields in the rte_crypto_op can be left NULL in
> > case of
> > > > raw buffers.
> > > >
> > >
> > > This is a good point but we still face too many unnecessary fields to be
> > NULL,
> > > such as
> > > digest pointers, I have given a lot thought to this structure. Hopefully it
> > covers
> > > all vendor's HW symmetric crypto needs and in the same time it well
> > squeeze
> > > the required HW addresses into 1 cacheline, instead of rte_crypto_op +
> > > rte_crypto_sym_op 3 cacheline footprint. Another purpose of the
> > structure
> > > design
> > > is the structure buffer can be taken from stack and can be used to fill all
> > > jobs to the PMD HW.
> >
> > Which fields you think are not useful and should be set as NULL?
> > Digest pointers you are anyways setting in the new structure.
> > Your new struct does not support session less as well as security sessions.
> > It does not take care of asymmetric crypto.
> > So whenever, a vendor need to support all these, we would end up getting
> > the rte_crypto_op structure.
> > IMO, you only need to make m_src and m_dst as union to a raw
> > input/output
> > buffers. Everything else will be relevant.
> >
> 
> Rte_crypto_op is designed to be allocated from mempool with HW address
> info contained so it is possible to deduct IV and AAD physical address from
> it. More importantly rte_crypto_op is designed to be taken from heap and
> being freed after dequeue. So they cannot be allocated from stack - for this
> reason I think rte_crypot_sym_vec is a better fit for the patch, do you agree?
> (the Proposed change is at above).

Agreed.

> 
> > Have you done some profiling with using rte_crypto_op instead of this new
> > struct?
> >
> Yes, the code are actually upstreamed in VPP
> https://gerrit.fd.io/r/c/vpp/+/18036, please try out. If you
> have a look at the
> enqueue/dequeue functions you should see the struggle we had to translate
> ops, and creating a second software ring to make sure we only dequeue a
> frame of data. Lucky VPP has space to store mbufs otherwise the perf will
> be even worse.
What performance gap do you see after making m_src and m_dst raw buffers?

> 
> > >
> > > >
> > > > > +/* Struct for user to perform HW specific enqueue/dequeue function
> > calls
> > > > */
> > > > > +struct rte_crypto_hw_ops {
> > > > > +	/* Driver written queue pair data pointer, should NOT be
> altered by
> > > > > +	 * the user.
> > > > > +	 */
> > > > > +	void *qp;
> > > > > +	/* Function handler to enqueue AEAD job */
> > > > > +	rte_crypto_hw_enq_cb_fn enqueue_aead;
> > > > > +	/* Function handler to enqueue cipher only job */
> > > > > +	rte_crypto_hw_enq_cb_fn enqueue_cipher;
> > > > > +	/* Function handler to enqueue auth only job */
> > > > > +	rte_crypto_hw_enq_cb_fn enqueue_auth;
> > > > > +	/* Function handler to enqueue cipher + hash chaining job */
> > > > > +	rte_crypto_hw_enq_cb_fn enqueue_chain;
> > > > > +	/* Function handler to query processed jobs */
> > > > > +	rte_crypto_hw_query_processed query_processed;
> > > > > +	/* Function handler to dequeue one job and return opaque data
> > > > stored
> > > > > */
> > > > > +	rte_crypto_hw_deq_one_cb_fn dequeue_one;
> > > > > +	/* Function handler to dequeue many jobs */
> > > > > +	rte_crypto_hw_deq_many_cb_fn dequeue_many;
> > > > > +	/* Reserved */
> > > > > +	void *reserved[8];
> > > > > +};
> > > >
> > > > Why do we need such callbacks in the library?
> > > > These should be inside the drivers, or else we do the same for
> > > > Legacy case as well. The pain of finding the correct enq function for
> > > > Appropriate crypto operation is already handled by all the drivers
> > > > And we can reuse that or else we modify it there as well.
> > > >
> > >
> > > Providing different types of enqueue functions for specific operation type
> > > could save a lot of branches for the driver to handle. As mentioned this
> > > data-path API is intended to be used as an advanced feature to provide
> > > close-to-native perf to external library/applications that are not mbuf
> > > centric. And I don't agree classifying choosing 1 enqueue function from
> > > 4 candidates as "pain".
> >
> > My point is why don't we have it in the Legacy code path as well?
> > I think it is useful in both the paths. Branching is a pain for the driver.
> >
> 
> That's a good point :-) we definitely can do something about it in future releases.
> 
> > >
> > > > We should not add a lot of data paths for the user, otherwise the
> > > > APIs will become centric to a particular vendor and it will be very difficult
> > > > For the user to migrate from one vendor to another and would defeat
> > the
> > > > Purpose of DPDK which provide uniform abstraction layer for all the
> > > > hardware
> > > > Vendors.
> > > >
> > >
> > > The purpose of adding data-path for the user is performance for non-mbuf
> > > data-path centric applications/libraries, in the meantime not creating
> > > confusion. In this version we aim to provide a more friendly data-path for
> >
> > I do not see the new path as friendly.
> > Adding a parallel new datapath with create more confusion for the
> > application
> > developer. It would be convenient, if we can use the same path with minimal
> > changes so that people can migrate easily.
> >
> 
> We are working on next version that is based on rte_crypto_sym_vec and single
> Enqueue-then-dequeue API. To be honest it won't be that of friendly to
> application
> developer that the applications are mbuf-based or already built on top of
> cryptodev,
> however for the applications that are not mbuf based and having their own
> data-path
> structures it will be surely friendlier than existing enqueue and dequeue APIs. No
> dependency to mbuf, mbuf and crypto op pool, and no need to consider how to
> adapt
> different working methods.

Agreed with your point. The intention is just not to create multiple copies
of the same information in multiple structures.

> 
> > > them, and aims to be useful to all vendor's PMDs. If there is any place in
> > > the API that blocks a PMD please let me know.
> >
> > As commented above, sessionless, rte_security sessions, asymmetric crypto
> > Not supported.
> >
> You are right -
> Sessionless support aims the usability and is not intended to be used in high-
> throughput
> Application.

There may be some cases where a limited number of control packets are sent,
which may be session-less. We cannot ask people to use a different data path
for such traffic, so we may need to support that too.

> Rte_security is built on top of mbuf and ethdev and is not intended to "mbuf-
> independent"
> applications either.

rte_security for lookaside protocol can be mbuf-independent, and NXP may
support it in the future, especially in the case of PDCP.

Regards,
Akhil
  
Fan Zhang July 8, 2020, 3:09 p.m. UTC | #6
Hi Akhil,

Thanks for the comments!

> -----Original Message-----
> From: Akhil Goyal <akhil.goyal@nxp.com>
> Sent: Tuesday, July 7, 2020 9:37 PM
> To: Zhang, Roy Fan <roy.fan.zhang@intel.com>; dev@dpdk.org;
> anoobj@marvell.com; asomalap@amd.com; ruifeng.wang@arm.com;
> Nagadheeraj Rottela <rnagadheeraj@marvell.com>; Michael Shamis
> <michaelsh@marvell.com>; Ankur Dwivedi <adwivedi@marvell.com>; Jay
> Zhou <jianjay.zhou@huawei.com>; De Lara Guarch, Pablo
> <pablo.de.lara.guarch@intel.com>; Hemant Agrawal
> <hemant.agrawal@nxp.com>
> Cc: Trahe, Fiona <fiona.trahe@intel.com>; Bronowski, PiotrX
> <piotrx.bronowski@intel.com>; Ananyev, Konstantin
> <konstantin.ananyev@intel.com>; Thomas Monjalon
> <thomas@monjalon.net>
> Subject: RE: [dpdk-dev v4 1/4] cryptodev: add symmetric crypto data-path
> APIs
> 
> Hi Fan,
> >
> > Hi Akhil,
> >
> > ...
> > > >
> > > > As you may have seen the structure definition is hW centric with the
> > > > IOVA addresses all over. Also as you will from the patch series the
> > > operation is
> > > > Per operation basis instead of operating in a burst. The external
> application
> > > > may sooner know when a specific enqueue is failed.
> > >
> > > You may also need to save a virtual address as well. As some hardware
> are
> > > able to
> > > Convert virtual to physical addresses on it's own giving a performance
> > > improvement.
> > >
> > > I do not see an issue in using enqueue burst with burst size=1 , but since
> you
> > > are doing
> > > Optimizations, none of the hardware can perform well with burst = 1, I
> think
> > > it is always
> > > Greater than 1.
> >
> > Shall I update the rte_crypto_sym_vec as the following - so the 2 problems
> can
> > be
> > resolved?
> >
> > struct rte_crypto_sym_vec {
> > 	/** array of SGL vectors */
> > 	struct rte_crypto_sgl *sgl;
> > 	union {
> > 		/* Supposed to be used with CPU crypto API call. */
> > 		struct {
> > 			/** array of pointers to IV */
> > 			void **iv;
> > 			/** array of pointers to AAD */
> > 			void **aad;
> > 			/** array of pointers to digest */
> > 			void **digest;
> > 			/**
> > 			 * array of statuses for each operation:
> > 			 *  - 0 on success
> > 			 *  - errno on error
> > 			 */
> > 			int32_t *status;
> > 		};
> >
> > 		/* Supposed to be used with HW crypto API call. */
> > 		struct {
> > 			/** array of pointers to IV */
> > 			struct rte_crypto_vec *iv_hw;
> > 			/** array of pointers to AAD */
> > 			struct rte_crypto_vec *aad_hw;
> > 			/** array of pointers to Digest */
> > 			struct rte_crypto_vec *digest_hw;
> > 		};
> >
> > 	};
> > 	/** number of operations to perform */
> > 	uint32_t num;
> > };
> 
> Yes something of that sort can work.
> 
Will change it in v5.

...
> >
> > > Have you done some profiling with using rte_crypto_op instead of this
> new
> > > struct?
> > >
> > Yes, the code are actually upstreamed in VPP
> > https://gerrit.fd.io/r/c/vpp/+/18036, please try out. If you
> > have a look at the
> > enqueue/dequeue functions you should see the struggle we had to
> translate
> > ops, and creating a second software ring to make sure we only dequeue a
> > frame of data. Lucky VPP has space to store mbufs otherwise the perf will
> > be even worse.
> What is the performance gap do you see after making m_src and m_dst as
> Raw buffers?
> 

Converting another project's data structures (such as the VPP crypto op)
into DPDK cryptodev operations introduces some performance degradation.

> 
> There may be some cases where a limited amount of control pkts can be sent
> Which may be session less. We cannot ask people to use a different data
> path
> For such traffic. So we may need to support that too.
> 

Here is our proposal for the enqueue-dequeue API:

typedef uint32_t (*cryptodev_sym_hw_dp_crypto_enqueue_dequeue_t)
	(struct rte_cryptodev *dev, uint16_t qp_id,
	struct rte_cryptodev_sym_session *sess,
	union rte_crypto_sym_ofs ofs, struct rte_crypto_sym_vec *vec,
	void **opaque, int *enqueued_num,
	rte_cryptodev_get_dequeue_count_t get_dequeue_count,
	rte_cryptodev_post_dequeue_t post_dequeue,
	uint32_t flags);

So the idea is a single API that does enqueue and/or dequeue combined.
If the user wants to do an enqueue, they should set
RTE_CRYPTO_HW_DP_FF_DO_ENQUEUE in the flags, or
RTE_CRYPTO_HW_DP_FF_DO_DEQUEUE if a dequeue is expected to be done.

Opaque could be a single pointer or an array, also specified by the flags,
so if the user wants to do the same as cryptodev_enqueue they can store the
crypto ops into opaque_in, and the dequeued opaque data will be stored in
opaque_out. There are two function pointers:
rte_cryptodev_get_dequeue_count_t: returns the number of jobs to dequeue,
which helps if the user wants to derive the dequeue count from the first
dequeued opaque data, or just returns a fixed number, the same as in
cryptodev enqueue/dequeue usage.
rte_cryptodev_post_dequeue_t: a user-provided function to run post
dequeue, useful for writing status to a user-specified data structure
(opaque).

To enable session-less operation we may add a union member to replace sess.
The union is either a session pointer or an xform pointer, which may be
specified by the flag too.

You may ask why we use a single function pointer for both enqueue and
dequeue, instead of two separate functions... I only intended to squeeze it
into rte_cryptodev_ops, combining it with cryptodev_sym_cpu_crypto_process_t
as a union, without expanding the rte_cryptodev_ops size.

struct rte_cryptodev_ops {
	...
	cryptodev_asym_free_session_t asym_session_clear;
	/**< Clear a Crypto sessions private data. */
	union {
		cryptodev_sym_cpu_crypto_process_t sym_cpu_process;
		/**< process input data synchronously (cpu-crypto). */
		cryptodev_sym_hw_crypto_enqueue_dequeue_t sym_hw_enq_deq;
		/**< Get HW crypto data-path call back functions and data */
	};
};
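
To make the callback contract concrete, a sketch (the two typedef
signatures below are guesses inferred from the description above; they are
not spelled out in the proposal):

/* Assumed shapes for the user-supplied helpers. */
typedef uint32_t (*rte_cryptodev_get_dequeue_count_t)(void *first_opaque);
typedef void (*rte_cryptodev_post_dequeue_t)(void *opaque, uint32_t index,
		uint8_t is_op_success);

struct app_op {		/* stand-in for the application's own descriptor */
	int status;
};

/* Burst-style usage: always report a fixed dequeue count... */
static uint32_t
fixed_count(void *first_opaque)
{
	(void)first_opaque;
	return 32;
}

/* ...and write each job's status back into the application descriptor. */
static void
write_status(void *opaque, uint32_t index, uint8_t is_op_success)
{
	struct app_op *ops = opaque;

	ops[index].status = is_op_success ? 0 : -1;
}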


> > Rte_security is built on top of mbuf and ethdev and is not intended to
> "mbuf-
> > independent"
> > applications either.

Again, we can replace sess with the union:
union rte_cryptodev_hw_dp_ctx {
	struct rte_cryptodev_sym_session *crypto_sess;
	struct rte_crypto_sym_xform *xform;
	struct rte_security_session *sec_sess;
};
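
A sketch of how a PMD might pick the valid member (the flag names here are
made up for illustration):

#define HW_DP_FF_SEC_SESSION	(1U << 0)	/* hypothetical flags */
#define HW_DP_FF_SESSIONLESS	(1U << 1)

static void *
resolve_ctx(union rte_cryptodev_hw_dp_ctx *ctx, uint32_t flags)
{
	if (flags & HW_DP_FF_SEC_SESSION)
		return ctx->sec_sess;
	if (flags & HW_DP_FF_SESSIONLESS)
		return ctx->xform;
	return ctx->crypto_sess;
}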

> 
> Rte_security for lookaside protocol can be mbuf independent and NXP may
> Support it in future especially in case of PDCP.
> 
> Regards,
> Akhil


What do you think?

Regards,
Fan
  

Patch

diff --git a/lib/librte_cryptodev/rte_crypto_sym.h b/lib/librte_cryptodev/rte_crypto_sym.h
index da961a19d..e237e3cfa 100644
--- a/lib/librte_cryptodev/rte_crypto_sym.h
+++ b/lib/librte_cryptodev/rte_crypto_sym.h
@@ -87,6 +87,54 @@  union rte_crypto_sym_ofs {
 	} ofs;
 };
 
+
+/**
+ * Asynchronous operation job descriptor.
+ * Used by HW crypto devices' direct API calls that support such activity
+ **/
+struct rte_crypto_sym_job {
+	union {
+		/**
+		 * When RTE_CRYPTO_HW_ENQ_FLAG_IS_SGL bit is set in flags, sgl
+		 * field is used as input data. Otherwise data_iova is
+		 * used.
+		 **/
+		rte_iova_t data_iova;
+		struct rte_crypto_sgl *sgl;
+	};
+	union {
+		/**
+		 * Unlike cryptodev ops, all ofs and len fields are in
+		 * units of bytes (including Snow3G/Kasumi/Zuc).
+		 **/
+		struct {
+			uint32_t cipher_ofs;
+			uint32_t cipher_len;
+		} cipher_only;
+		struct {
+			uint32_t auth_ofs;
+			uint32_t auth_len;
+			rte_iova_t digest_iova;
+		} auth_only;
+		struct {
+			uint32_t aead_ofs;
+			uint32_t aead_len;
+			rte_iova_t tag_iova;
+			uint8_t *aad;
+			rte_iova_t aad_iova;
+		} aead;
+		struct {
+			uint32_t cipher_ofs;
+			uint32_t cipher_len;
+			uint32_t auth_ofs;
+			uint32_t auth_len;
+			rte_iova_t digest_iova;
+		} chain;
+	};
+	uint8_t *iv;
+	rte_iova_t iv_iova;
+};
+
 /** Symmetric Cipher Algorithms */
 enum rte_crypto_cipher_algorithm {
 	RTE_CRYPTO_CIPHER_NULL = 1,
diff --git a/lib/librte_cryptodev/rte_cryptodev.c b/lib/librte_cryptodev/rte_cryptodev.c
index 705387b8b..5d5f84e27 100644
--- a/lib/librte_cryptodev/rte_cryptodev.c
+++ b/lib/librte_cryptodev/rte_cryptodev.c
@@ -1866,6 +1866,28 @@  rte_cryptodev_sym_cpu_crypto_process(uint8_t dev_id,
 	return dev->dev_ops->sym_cpu_process(dev, sess, ofs, vec);
 }
 
+int
+rte_cryptodev_sym_get_hw_ops(uint8_t dev_id, uint16_t qp_id,
+		struct rte_crypto_hw_ops *hw_ops)
+{
+	struct rte_cryptodev *dev;
+
+	if (!hw_ops)
+		return -EINVAL;
+
+	memset(hw_ops, 0, sizeof(*hw_ops));
+
+	if (!rte_cryptodev_get_qp_status(dev_id, qp_id))
+		return -EINVAL;
+
+	dev = rte_cryptodev_pmd_get_dev(dev_id);
+	if (!(dev->feature_flags & RTE_CRYPTODEV_FF_SYM_HW_DIRECT_API) ||
+			*dev->dev_ops->sym_get_hw_ops == NULL)
+		return -ENOTSUP;
+
+	return dev->dev_ops->sym_get_hw_ops(dev, qp_id, hw_ops);
+}
+
 /** Initialise rte_crypto_op mempool element */
 static void
 rte_crypto_op_init(struct rte_mempool *mempool,
diff --git a/lib/librte_cryptodev/rte_cryptodev.h b/lib/librte_cryptodev/rte_cryptodev.h
index d01a65825..7cd2095d7 100644
--- a/lib/librte_cryptodev/rte_cryptodev.h
+++ b/lib/librte_cryptodev/rte_cryptodev.h
@@ -466,7 +466,8 @@  rte_cryptodev_asym_get_xform_enum(enum rte_crypto_asym_xform_type *xform_enum,
 /**< Support symmetric session-less operations */
 #define RTE_CRYPTODEV_FF_NON_BYTE_ALIGNED_DATA		(1ULL << 23)
 /**< Support operations on data which is not byte aligned */
-
+#define RTE_CRYPTODEV_FF_SYM_HW_DIRECT_API		(1ULL << 24)
+/**< Support hardware accelerator specific raw data as input */
 
 /**
  * Get the name of a crypto device feature flag
@@ -737,7 +738,7 @@  rte_cryptodev_queue_pair_setup(uint8_t dev_id, uint16_t queue_pair_id,
  *   - 1: qp was configured
  *   - -ENODEV: device was not configured
  */
-int
+__rte_experimental int
 rte_cryptodev_get_qp_status(uint8_t dev_id, uint16_t queue_pair_id);
 
 /**
@@ -1348,6 +1349,170 @@  rte_cryptodev_sym_cpu_crypto_process(uint8_t dev_id,
 	struct rte_cryptodev_sym_session *sess, union rte_crypto_sym_ofs ofs,
 	struct rte_crypto_sym_vec *vec);
 
+
+/* HW direct symmetric crypto data-path APIs */
+
+/* Bit-masks used for enqueuing job */
+#define RTE_CRYPTO_HW_ENQ_FLAG_START		(1ULL << 0)
+/**< Bit-mask to indicate the first job in a burst. With this bit set the
+ *   driver may write but not read the drv_data buffer, otherwise the driver
+ *   shall read and update the drv_data.
+ */
+#define RTE_CRYPTO_HW_ENQ_FLAG_SET_OPAQUE	(1ULL << 1)
+/**< Bit-mask to indicate write opaque pointer into HW crypto descriptor. */
+#define RTE_CRYPTO_HW_ENQ_FLAG_END		(1ULL << 2)
+/**< Bit-mask to indicate the last job in a burst. With this bit set the
+ *   driver may read but not write the drv_data buffer, and kick the HW to
+ *   start processing all jobs written.
+ */
+#define RTE_CRYPTO_HW_ENQ_FLAG_IS_SGL		(1ULL << 3)
+/**< Bit-mask to indicate the input job is an SGL buffer */
+
+/* Bit-masks used for dequeuing job */
+#define RTE_CRYPTO_HW_DEQ_FLAG_START		(1ULL << 0)
+/**< Bit-mask to indicate the first job to be dequeued. With this bit set the
+ *   driver may write but not read the drv_data buffer, otherwise the driver
+ *   shall read and update the drv_data.
+ */
+#define RTE_CRYPTO_HW_DEQ_FLAG_EXHAUST		(1ULL << 1)
+/**< Bit-mask to indicate dequeuing as many as n jobs in dequeue-many function.
+ *   Without this bit, once the driver finds that fewer than n jobs are
+ *   ready to dequeue, it shall stop immediately, leave all processed jobs
+ *   in the queue, and return the number of ready jobs as a negative value.
+ *   With this bit set the function shall continue dequeuing all finished
+ *   jobs and return the dequeued job count as a positive value.
+ */
+
+/**
+ * Typedef for HW direct data-path enqueue callback function.
+ *
+ * @param	qp		Queue pair data.
+ * @param	sess		Cryptodev session.
+ * @param	job		Job data.
+ * @param	opaque		Opaque data to be written to the queue
+ *				descriptor when
+ *				RTE_CRYPTO_HW_ENQ_FLAG_SET_OPAQUE is set.
+ * @param	drv_data	User created temporary driver data for the
+ *				driver to store and update data used between
+ *				adjacent enqueue operations.
+ * @param	flags		Bitmask of RTE_CRYPTO_HW_ENQ_* flags
+ * @return
+ *  - On success return 0
+ *  - On fail return -1
+ **/
+typedef int (*rte_crypto_hw_enq_cb_fn)(void *qp,
+	struct rte_cryptodev_sym_session *sess,
+	struct rte_crypto_sym_job *job, void *opaque, uint64_t *drv_data,
+	uint64_t flags);
+
+/**
+ * Typedef for HW direct data-path dequeue one job callback function.
+ *
+ * @param	qp		Queue pair data.
+ * @param	drv_data	User created temporary driver data for the
+ *				driver to store and update data used between
+ *				adjacent enqueue operations.
+ * @param	flags		Bitmask of RTE_CRYPTO_HW_DEQ_* flags
+ * @param	status		The buffer for the driver to write operation
+ *				status.
+ * @return
+ *  - On success return the opaque data the user wrote at enqueue (if any):
+ *    - status written as 1 when the operation is successful.
+ *    - status written as -1 when the operation failed (e.g. bad MAC).
+ *  - Otherwise return NULL, with status written as 0 when the operation
+ *    is still under processing.
+ **/
+typedef void * (*rte_crypto_hw_deq_one_cb_fn)(void *qp, uint64_t *drv_data,
+		uint64_t flags, int *status);
+
+/**
+ * Typedef that the user provided to deal with jobs' status when
+ * dequeue in a bulk.
+ *
+ * @param	data		User provided data.
+ * @param	index		Index number of the processed job.
+ * @param	is_op_success	Driver filled operation status.
+ **/
+typedef void (*rte_crypto_hw_user_post_deq_cb_fn)(void *data, uint32_t index,
+		uint8_t is_op_success);
+
+/**
+ * Typedef for HW direct data-path dequeue bulk jobs callback function.
+ *
+ * @param	qp		Queue pair data.
+ * @param	drv_data	User created temporary driver data for the
+ *				driver to store and update data used between
+ *				adjacent enqueue operations.
+ * @param	user_data	User provided data to be passed into cb
+ *				function.
+ * @param	cb		User provided callback functions to deal with
+ *				driver returned job status.
+ * @param	n		The number of expected jobs to be dequeued.
+ * @param	flags		Bitmask of RTE_CRYPTO_HW_DEQ_* flags
+ * @param	n_fail		The buffer for driver to write the number of
+ *				failed jobs.
+ * @return
+ *  - Return the number of dequeued jobs.
+ **/
+typedef uint32_t (*rte_crypto_hw_deq_many_cb_fn)(void *qp, uint64_t *drv_data,
+		void *user_data, rte_crypto_hw_user_post_deq_cb_fn cb,
+		uint32_t n, uint64_t flags, uint32_t *n_fail);
+/**
+ * Typedef for querying HW the number of processed jobs.
+ *
+ * @param	qp		Queue pair data.
+ * @param	nb_jobs		The expected processed job number.
+ * @return
+ *  - If the nb_jobs ready, return 1.
+ *  - Otherwise return 0.
+ **/
+typedef int (*rte_crypto_hw_query_processed)(void *qp, uint32_t nb_jobs);
+
+/* Struct for user to perform HW specific enqueue/dequeue function calls */
+struct rte_crypto_hw_ops {
+	/* Driver written queue pair data pointer, should NOT be altered by
+	 * the user.
+	 */
+	void *qp;
+	/* Function handler to enqueue AEAD job */
+	rte_crypto_hw_enq_cb_fn enqueue_aead;
+	/* Function handler to enqueue cipher only job */
+	rte_crypto_hw_enq_cb_fn enqueue_cipher;
+	/* Function handler to enqueue auth only job */
+	rte_crypto_hw_enq_cb_fn enqueue_auth;
+	/* Function handler to enqueue cipher + hash chaining job */
+	rte_crypto_hw_enq_cb_fn enqueue_chain;
+	/* Function handler to query processed jobs */
+	rte_crypto_hw_query_processed query_processed;
+	/* Function handler to dequeue one job and return opaque data stored */
+	rte_crypto_hw_deq_one_cb_fn dequeue_one;
+	/* Function handler to dequeue many jobs */
+	rte_crypto_hw_deq_many_cb_fn dequeue_many;
+	/* Reserved */
+	void *reserved[8];
+};
+
+/**
+ * Get the symmetric crypto hardware ops function pointers and queue pair data.
+ *
+ * @param	dev_id	The device identifier.
+ * @param	qp_id	The index of the queue pair from which to retrieve
+ *			processed packets. The value must be in the range
+ *			[0, nb_queue_pair - 1] previously supplied to
+ *			rte_cryptodev_configure().
+ * @param	hw_ops	User provided rte_crypto_hw_ops buffer.
+ *
+ * @return
+ *  - On success hw_ops will be written the HW crypto device's queue pair data
+ *    and function pointers for data enqueue/dequeue.
+ *  - On fail hw_ops is cleared and negative integer is returned.
+ */
+__rte_experimental
+int
+rte_cryptodev_sym_get_hw_ops(
+		uint8_t dev_id, uint16_t qp_id,
+		struct rte_crypto_hw_ops *hw_ops);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_cryptodev/rte_cryptodev_pmd.h b/lib/librte_cryptodev/rte_cryptodev_pmd.h
index 81975d72b..28f75d1da 100644
--- a/lib/librte_cryptodev/rte_cryptodev_pmd.h
+++ b/lib/librte_cryptodev/rte_cryptodev_pmd.h
@@ -316,6 +316,10 @@  typedef uint32_t (*cryptodev_sym_cpu_crypto_process_t)
 	(struct rte_cryptodev *dev, struct rte_cryptodev_sym_session *sess,
 	union rte_crypto_sym_ofs ofs, struct rte_crypto_sym_vec *vec);
 
+struct rte_crypto_hw_ops;
+
+typedef int (*cryptodev_sym_hw_get_ops_t)(struct rte_cryptodev *dev,
+		uint16_t qp_id, struct rte_crypto_hw_ops *hw_ops);
 
 /** Crypto device operations function pointer table */
 struct rte_cryptodev_ops {
@@ -348,8 +352,12 @@  struct rte_cryptodev_ops {
 	/**< Clear a Crypto sessions private data. */
 	cryptodev_asym_free_session_t asym_session_clear;
 	/**< Clear a Crypto sessions private data. */
-	cryptodev_sym_cpu_crypto_process_t sym_cpu_process;
-	/**< process input data synchronously (cpu-crypto). */
+	union {
+		cryptodev_sym_cpu_crypto_process_t sym_cpu_process;
+		/**< process input data synchronously (cpu-crypto). */
+		cryptodev_sym_hw_get_ops_t sym_get_hw_ops;
+		/**< Get HW crypto data-path call back functions and data */
+	};
 };
 
 
diff --git a/lib/librte_cryptodev/rte_cryptodev_version.map b/lib/librte_cryptodev/rte_cryptodev_version.map
index 07a2d2f02..56f5684c8 100644
--- a/lib/librte_cryptodev/rte_cryptodev_version.map
+++ b/lib/librte_cryptodev/rte_cryptodev_version.map
@@ -85,6 +85,7 @@  EXPERIMENTAL {
 	rte_cryptodev_sym_session_set_user_data;
 	rte_crypto_asym_op_strings;
 	rte_crypto_asym_xform_strings;
+	rte_cryptodev_get_qp_status;
 
 	# added in 20.05
 	__rte_cryptodev_trace_configure;
@@ -103,4 +104,7 @@  EXPERIMENTAL {
 	__rte_cryptodev_trace_asym_session_clear;
 	__rte_cryptodev_trace_dequeue_burst;
 	__rte_cryptodev_trace_enqueue_burst;
+
+	# added in 20.08
+	rte_cryptodev_sym_get_hw_ops;
 };