[RFC,5/5] crypto/mlx5: add enqueue and dequeue operations

Message ID 20230418092325.2578712-6-suanmingm@nvidia.com (mailing list archive)
State Superseded, archived
Delegated to: akhil goyal
Series crypto/mlx5: support AES-GCM

Checks

Context               Check    Description
ci/checkpatch         warning  coding style issues
ci/Intel-compilation  success  Compilation OK
ci/intel-Testing      success  Testing PASS
ci/intel-Functional   success  Functional PASS

Commit Message

Suanming Mou April 18, 2023, 9:23 a.m. UTC
  The crypto operations are performed with crypto WQEs. If the input
buffers (AAD, mbuf, digest) are not contiguous, as required by the FW,
a UMR WQE is needed to provide a contiguous address space for the
crypto WQE. The UMR WQEs and crypto WQEs are handled in two different
QPs.
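
For illustration only, a minimal standalone sketch of that contiguity
rule, with simplified structs standing in for the rte_crypto_op/rte_mbuf
fields (gcm_layout and gcm_input_is_contiguous() are hypothetical names;
the real check is mlx5_crypto_is_gcm_input_continuous() in the patch
below):

#include <stdbool.h>
#include <stdint.h>

/* Simplified stand-in for the op layout; the real code reads these from
 * op->sym->aead.* and the source mbuf. */
struct gcm_layout {
	const uint8_t *aad;
	uint32_t aad_len;
	const uint8_t *payload;    /* data at aead.data.offset in m_src */
	uint32_t payload_len;
	const uint8_t *digest;     /* GCM tag */
	bool multi_seg;            /* m_src has more than one segment */
	bool out_of_place;         /* m_dst != m_src */
};

/* FW consumes AAD | payload | tag as one contiguous region, so the crypto
 * WQE can reference the buffers directly only when they sit back to back
 * in a single in-place segment; otherwise a UMR WQE is built. */
static bool
gcm_input_is_contiguous(const struct gcm_layout *l)
{
	return !l->out_of_place && !l->multi_seg &&
	       l->aad + l->aad_len == l->payload &&
	       l->payload + l->payload_len == l->digest;
}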

The QP for UMR operations contains two types of WQEs: UMR and SEND_EN.
These WQEs are built dynamically according to the crypto operation's
buffer addresses. A crypto operation with non-contiguous buffers gets
its own UMR WQE, while an operation with contiguous buffers does not
need one. Once all the operation WQEs of the enqueue burst have been
built, and if any UMR WQEs were built, an additional SEND_EN WQE is
appended as the final WQE of the burst in the UMR QP. The purpose of
that SEND_EN WQE is to trigger the crypto QP processing only after the
UMR WQEs have prepared the input memory address space buffers.
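
A toy model of how one burst maps to WQEs (burst_plan and plan_burst()
are illustrative names, not driver API; the real logic is in
mlx5_crypto_gcm_enqueue_burst() and mlx5_crypto_gcm_build_send_en()
below):

#include <stdbool.h>
#include <stdint.h>

/* How many WQEs of each kind one enqueue burst produces. */
struct burst_plan {
	uint16_t crypto_wqes;   /* always one per operation */
	uint16_t umr_wqes;      /* one per non-contiguous operation */
	uint16_t send_en_wqes;  /* 0 or 1, closes the UMR QP burst */
};

static struct burst_plan
plan_burst(const bool *contiguous, uint16_t nb_ops)
{
	struct burst_plan p = {0, 0, 0};
	uint16_t i;

	for (i = 0; i < nb_ops; i++) {
		p.crypto_wqes++;       /* crypto WQE slot is always used */
		if (!contiguous[i])
			p.umr_wqes++;  /* dynamic UMR WQE for this op */
	}
	/* A single SEND_EN WQE is appended only when UMR WQEs exist; it
	 * releases the crypto WQEs once the UMRs have completed. */
	if (p.umr_wqes)
		p.send_en_wqes = 1;
	return p;
}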

The QP for crypto operations contains only crypto WQEs, and those WQEs
are pre-built at QP setup time. The QP processing is triggered either
by a doorbell ring or by the SEND_EN WQE from the UMR QP.
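
The trigger decision at the end of a burst can be summarized as in the
sketch below (burst_doorbell and pick_doorbell() are again only
illustrative; the driver rings the selected queue's doorbell through
mlx5_doorbell_ring()):

#include <stdbool.h>

/* Which send queue is kicked at the end of the burst. */
enum burst_doorbell {
	DB_CRYPTO_QP,  /* no UMR in the burst: start the crypto QP directly */
	DB_UMR_QP,     /* UMR present: start the UMR QP; its final SEND_EN
			* WQE then triggers the crypto QP */
};

static enum burst_doorbell
pick_doorbell(bool burst_has_umr)
{
	return burst_has_umr ? DB_UMR_QP : DB_CRYPTO_QP;
}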

Signed-off-by: Suanming Mou <suanmingm@nvidia.com>
---
 drivers/common/mlx5/mlx5_prm.h        |   1 +
 drivers/crypto/mlx5/mlx5_crypto.h     |   2 +
 drivers/crypto/mlx5/mlx5_crypto_gcm.c | 401 ++++++++++++++++++++++++++
 3 files changed, 404 insertions(+)
  

Patch

diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index c8d73a8456..71000ebf02 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -613,6 +613,7 @@  struct mlx5_wqe_send_en_wqe {
 /* MMO metadata segment */
 
 #define	MLX5_OPCODE_MMO	0x2fu
+#define	MLX5_OPC_MOD_MMO_CRYPTO 0x6u
 #define	MLX5_OPC_MOD_MMO_REGEX 0x4u
 #define	MLX5_OPC_MOD_MMO_COMP 0x2u
 #define	MLX5_OPC_MOD_MMO_DECOMP 0x3u
diff --git a/drivers/crypto/mlx5/mlx5_crypto.h b/drivers/crypto/mlx5/mlx5_crypto.h
index 9945891ea8..0b0ef1a84d 100644
--- a/drivers/crypto/mlx5/mlx5_crypto.h
+++ b/drivers/crypto/mlx5/mlx5_crypto.h
@@ -66,8 +66,10 @@  struct mlx5_crypto_qp {
 	uint8_t *umr_wqe;
 	uint16_t umr_wqbbs;
 	uint16_t umr_pi;
+	uint16_t umr_last_pi;
 	uint16_t umr_ci;
 	uint32_t umr_errors;
+	bool has_umr;
 };
 
 struct mlx5_crypto_dek {
diff --git a/drivers/crypto/mlx5/mlx5_crypto_gcm.c b/drivers/crypto/mlx5/mlx5_crypto_gcm.c
index b67f22c591..40cf4c804e 100644
--- a/drivers/crypto/mlx5/mlx5_crypto_gcm.c
+++ b/drivers/crypto/mlx5/mlx5_crypto_gcm.c
@@ -9,6 +9,7 @@ 
 #include <rte_log.h>
 #include <bus_pci_driver.h>
 #include <rte_memory.h>
+#include <rte_io.h>
 
 #include <mlx5_glue.h>
 #include <mlx5_common.h>
@@ -18,6 +19,17 @@ 
 #include "mlx5_crypto_utils.h"
 #include "mlx5_crypto.h"
 
+#define MLX5_MMO_CRYPTO_OPC (MLX5_OPCODE_MMO | \
+	(MLX5_OPC_MOD_MMO_CRYPTO << WQE_CSEG_OPC_MOD_OFFSET))
+
+struct mlx5_crypto_gcm_data {
+	void *src_addr;
+	uint32_t src_bytes;
+	void *dst_addr;
+	uint32_t dst_bytes;
+	uint32_t mkey;
+};
+
 static struct rte_cryptodev_capabilities mlx5_crypto_gcm_caps[] = {
 	{
 		.op = RTE_CRYPTO_OP_TYPE_UNDEFINED,
@@ -246,6 +258,10 @@  mlx5_crypto_gcm_umr_qp_setup(struct rte_cryptodev *dev, struct mlx5_crypto_qp *q
 		DRV_LOG(ERR, "Failed to create UMR CQ.");
 		return -1;
 	}
+	/* Initialize the CQ to ones so it starts under HW ownership. */
+	qp->umr_cq_obj.cqes[0].op_own = MLX5_CQE_OWNER_MASK;
+	qp->umr_cq_obj.cqes[0].wqe_counter = rte_cpu_to_be_16(UINT16_MAX);
+	qp->umr_last_pi = UINT16_MAX;
 	/* Set UMR + SEND_EN WQE as maximum same with crypto. */
 	log_wqbb_n = rte_log2_u32(qp->entries_n *
 			(priv->wqe_set_size / MLX5_SEND_WQE_BB));
@@ -374,6 +390,389 @@  mlx5_crypto_gcm_qp_setup(struct rte_cryptodev *dev, uint16_t qp_id,
 	return -1;
 }
 
+static __rte_always_inline bool
+mlx5_crypto_is_gcm_input_continuous(struct rte_crypto_op *op)
+{
+	struct mlx5_crypto_session *sess = CRYPTODEV_GET_SYM_SESS_PRIV(op->sym->session);
+	struct rte_mbuf *m_src = op->sym->m_src;
+	void *aad_addr = op->sym->aead.aad.data;
+	void *tag_addr = op->sym->aead.digest.data;
+	void *pkt_addr = rte_pktmbuf_mtod_offset(m_src, void *, op->sym->aead.data.offset);
+
+	/* Out-of-place, multi-segment or non-adjacent AAD/digest buffers are never contiguous. */
+	if ((op->sym->m_dst && op->sym->m_dst != m_src) ||
+	    (m_src->nb_segs > 1) ||
+	    (RTE_PTR_ADD(aad_addr, sess->aad_len) != pkt_addr) ||
+	    (RTE_PTR_ADD(pkt_addr, op->sym->aead.data.length) != tag_addr))
+		return false;
+	return true;
+}
+
+static __rte_always_inline uint32_t
+mlx5_crypto_gcm_umr_klm_set(struct mlx5_crypto_qp *qp, struct rte_mbuf *mbuf,
+		    struct mlx5_klm *klm, uint32_t offset,
+		    uint32_t *remain)
+{
+	uint32_t data_len = (rte_pktmbuf_data_len(mbuf) - offset);
+	uintptr_t addr = rte_pktmbuf_mtod_offset(mbuf, uintptr_t, offset);
+
+	if (data_len > *remain)
+		data_len = *remain;
+	*remain -= data_len;
+	klm->byte_count = rte_cpu_to_be_32(data_len);
+	klm->address = rte_cpu_to_be_64(addr);
+	klm->mkey = mlx5_mr_mb2mr(&qp->mr_ctrl, mbuf);
+	return klm->mkey;
+}
+
+static __rte_always_inline int
+mlx5_crypto_gcm_build_klm(struct mlx5_crypto_qp *qp,
+		struct rte_crypto_op *op,
+		struct rte_mbuf *mbuf,
+		struct mlx5_klm *klm)
+{
+	struct mlx5_crypto_session *sess = CRYPTODEV_GET_SYM_SESS_PRIV(op->sym->session);
+	uint32_t remain_len = op->sym->aead.data.length;
+	uint32_t nb_segs = mbuf->nb_segs;
+	uint32_t klm_n = 0;
+
+	/* Set AAD. */
+	klm->byte_count = rte_cpu_to_be_32(sess->aad_len);
+	klm->address = rte_cpu_to_be_64((uintptr_t)op->sym->aead.aad.data);
+	klm->mkey = mlx5_mr_addr2mr_bh(&qp->mr_ctrl, (uintptr_t)op->sym->aead.aad.data);
+	klm_n++;
+	/* First mbuf needs to take the data offset. */
+	if (unlikely(mlx5_crypto_gcm_umr_klm_set(qp, mbuf, ++klm,
+		     op->sym->aead.data.offset, &remain_len) == UINT32_MAX)) {
+		op->status = RTE_CRYPTO_OP_STATUS_ERROR;
+		return 0;
+	}
+	klm_n++;
+	while (remain_len) {
+		nb_segs--;
+		mbuf = mbuf->next;
+		if (unlikely(mbuf == NULL || nb_segs == 0)) {
+			op->status = RTE_CRYPTO_OP_STATUS_INVALID_ARGS;
+			return 0;
+		}
+		if (unlikely(mlx5_crypto_gcm_umr_klm_set(qp, mbuf, ++klm, 0,
+						 &remain_len) == UINT32_MAX)) {
+			op->status = RTE_CRYPTO_OP_STATUS_ERROR;
+			return 0;
+		}
+		klm_n++;
+	}
+	/* Set TAG. */
+	klm++;
+	klm->byte_count = rte_cpu_to_be_32((uint32_t)sess->tag_len);
+	klm->address = rte_cpu_to_be_64((uintptr_t)op->sym->aead.digest.data);
+	klm->mkey = mlx5_mr_addr2mr_bh(&qp->mr_ctrl, (uintptr_t)op->sym->aead.digest.data);
+	klm_n++;
+	return klm_n;
+}
+
+static __rte_always_inline void*
+mlx5_crypto_gcm_get_umr_wqe(struct mlx5_crypto_qp *qp)
+{
+	struct mlx5_crypto_priv *priv = qp->priv;
+	uint32_t wqe_offset = qp->umr_pi & (qp->umr_wqbbs - 1);
+	uint32_t left_wqbbs = qp->umr_wqbbs - wqe_offset;
+	struct mlx5_wqe_cseg *wqe;
+
+	/* If UMR WQE is near the boundary. */
+	if (left_wqbbs < priv->umr_wqe_stride) {
+		/* Append NOP WQE as the left WQEBBS is not enough for UMR. */
+		wqe = (struct mlx5_wqe_cseg *)RTE_PTR_ADD(qp->umr_qp_obj.umem_buf,
+			wqe_offset * MLX5_SEND_WQE_BB);
+		wqe->opcode = RTE_BE32(MLX5_OPCODE_NOP | ((uint32_t)qp->umr_pi << 8));
+		wqe->sq_ds = rte_cpu_to_be_32((qp->umr_qp_obj.qp->id << 8) | (left_wqbbs << 2));
+		wqe->flags = RTE_BE32(0);
+		wqe->misc = RTE_BE32(0);
+		qp->umr_pi += left_wqbbs;
+		wqe_offset = qp->umr_pi & (qp->umr_wqbbs - 1);
+	}
+	wqe_offset *= MLX5_SEND_WQE_BB;
+	return RTE_PTR_ADD(qp->umr_qp_obj.umem_buf, wqe_offset);
+}
+
+static __rte_always_inline int
+mlx5_crypto_gcm_build_umr(struct mlx5_crypto_qp *qp,
+			  struct rte_crypto_op *op,
+			  uint32_t idx,
+			  struct mlx5_crypto_gcm_data *data)
+{
+	struct mlx5_crypto_priv *priv = qp->priv;
+	struct mlx5_crypto_session *sess = CRYPTODEV_GET_SYM_SESS_PRIV(op->sym->session);
+	struct mlx5_wqe_cseg *wqe;
+	struct mlx5_wqe_umr_ctrl_seg *ucseg;
+	struct mlx5_wqe_mkey_context_seg *mkc;
+	struct mlx5_klm *iklm;
+	struct mlx5_klm *klm = &qp->klm_array[idx * priv->max_segs_num];
+	uint16_t klm_size, klm_align;
+	uint16_t klm_src = 0, klm_dst = 0;
+	uint32_t total_len = op->sym->aead.data.length + sess->aad_len + sess->tag_len;
+	uint32_t i;
+
+	/* Build KLM base on the op. */
+	klm_src = mlx5_crypto_gcm_build_klm(qp, op, op->sym->m_src, klm);
+	if (!klm_src)
+		return -EINVAL;
+	if (op->sym->m_dst && op->sym->m_dst != op->sym->m_src) {
+		klm_dst = mlx5_crypto_gcm_build_klm(qp, op, op->sym->m_dst, klm + klm_src);
+		if (!klm_dst)
+			return -EINVAL;
+		total_len *= 2;
+	}
+	klm_size = klm_src + klm_dst;
+	klm_align = RTE_ALIGN(klm_size, 4);
+	/* Get UMR WQE memory. */
+	wqe = (struct mlx5_wqe_cseg *)mlx5_crypto_gcm_get_umr_wqe(qp);
+	memset(wqe, 0, priv->umr_wqe_size);
+	/* Set WQE control seg. Non-inline KLM UMR WQE size must be 9 WQE_DS. */
+	wqe->opcode = RTE_BE32(MLX5_OPCODE_UMR | ((uint32_t)qp->umr_pi << 8));
+	wqe->sq_ds = rte_cpu_to_be_32((qp->umr_qp_obj.qp->id << 8) | 9);
+	wqe->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR << MLX5_COMP_MODE_OFFSET);
+	wqe->misc = rte_cpu_to_be_32(qp->mkey[idx]->id);
+	/* Set UMR WQE control seg. */
+	ucseg = (struct mlx5_wqe_umr_ctrl_seg *)(wqe + 1);
+	ucseg->mkey_mask |= rte_cpu_to_be_64(MLX5_WQE_UMR_CTRL_MKEY_MASK_LEN);
+	ucseg->klm_octowords = rte_cpu_to_be_16(klm_align);
+	/* Set mkey context seg. */
+	mkc = (struct mlx5_wqe_mkey_context_seg *)(ucseg + 1);
+	mkc->len = rte_cpu_to_be_64(total_len);
+	mkc->qpn_mkey = rte_cpu_to_be_32(0xffffff00 | (qp->mkey[idx]->id & 0xff));
+	/* Set UMR pointer to data seg. */
+	iklm = (struct mlx5_klm *)(mkc + 1);
+	iklm->address = rte_cpu_to_be_64((uintptr_t)((char *)klm));
+	iklm->mkey = rte_cpu_to_be_32(qp->klm_mr.lkey);
+	iklm->byte_count = rte_cpu_to_be_32(klm_align);
+	data->mkey = rte_cpu_to_be_32(qp->mkey[idx]->id);
+	data->src_addr = 0;
+	data->src_bytes = sess->aad_len + op->sym->aead.data.length;
+	data->dst_bytes = data->src_bytes;
+	if (klm_dst)
+		data->dst_addr = (void *)(uintptr_t)(data->src_bytes + sess->tag_len);
+	else
+		data->dst_addr = 0;
+	if (sess->op_type == MLX5_CRYPTO_OP_TYPE_ENCRYPTION)
+		data->dst_bytes += sess->tag_len;
+	else
+		data->src_bytes += sess->tag_len;
+	/* Clear the padding memory. */
+	for (i = klm_size; i < klm_align; i++) {
+		klm[i].mkey = UINT32_MAX;
+		klm[i].address = 0;
+		klm[i].byte_count = 0;
+	}
+	/* Update PI and WQE */
+	qp->umr_pi += priv->umr_wqe_stride;
+	qp->umr_wqe = (uint8_t *)wqe;
+	return 0;
+}
+
+static __rte_always_inline void
+mlx5_crypto_gcm_build_send_en(struct mlx5_crypto_qp *qp)
+{
+	uint32_t wqe_offset = (qp->umr_pi & (qp->umr_wqbbs - 1)) * MLX5_SEND_WQE_BB;
+	struct mlx5_wqe_cseg *cs = RTE_PTR_ADD(qp->umr_qp_obj.wqes, wqe_offset);
+	struct mlx5_wqe_qseg *qs = RTE_PTR_ADD(cs, sizeof(struct mlx5_wqe_cseg));
+
+	cs->opcode = RTE_BE32(MLX5_OPCODE_SEND_EN | ((uint32_t)qp->umr_pi << 8));
+	cs->sq_ds = rte_cpu_to_be_32((qp->umr_qp_obj.qp->id << 8) | 2);
+	cs->flags = RTE_BE32((MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET) |
+			MLX5_WQE_CTRL_FENCE);
+	cs->misc = RTE_BE32(0);
+	qs->max_index = rte_cpu_to_be_32(qp->pi);
+	qs->qpn_cqn = rte_cpu_to_be_32(qp->qp_obj.qp->id);
+	qp->umr_wqe = (uint8_t *)cs;
+	qp->umr_pi += 1;
+}
+
+static __rte_always_inline void
+mlx5_crypto_gcm_wqe_set(struct mlx5_crypto_qp *qp,
+			struct rte_crypto_op *op,
+			uint32_t idx,
+			struct mlx5_crypto_gcm_data *data)
+{
+	struct mlx5_crypto_session *sess = CRYPTODEV_GET_SYM_SESS_PRIV(op->sym->session);
+	struct mlx5_gga_wqe *wqe = &((struct mlx5_gga_wqe *)qp->qp_obj.wqes)[idx];
+	union mlx5_gga_crypto_opaque *opaq = qp->opaque_mr.addr;
+
+	memcpy(opaq[idx].cp.iv,
+		rte_crypto_op_ctod_offset(op, uint8_t *, sess->iv_offset), sess->iv_len);
+	opaq[idx].cp.tag_size = rte_cpu_to_be_32((uint32_t)sess->tag_len);
+	opaq[idx].cp.aad_size = rte_cpu_to_be_32((uint32_t)sess->aad_len);
+	/* Update control seg. */
+	wqe->opcode = rte_cpu_to_be_32(MLX5_MMO_CRYPTO_OPC + (qp->pi << 8));
+	wqe->gga_ctrl1 = sess->mmo_ctrl;
+	wqe->gga_ctrl2 = sess->dek_id;
+	/* Update input seg. */
+	wqe->gather.bcount = rte_cpu_to_be_32(data->src_bytes);
+	wqe->gather.lkey = data->mkey;
+	wqe->gather.pbuf = rte_cpu_to_be_64((uintptr_t)data->src_addr);
+	/* Update output seg. */
+	wqe->scatter.bcount = rte_cpu_to_be_32(data->dst_bytes);
+	wqe->scatter.lkey = data->mkey;
+	wqe->scatter.pbuf = rte_cpu_to_be_64((uintptr_t)data->dst_addr);
+	qp->wqe = (uint8_t *)wqe;
+}
+
+static uint16_t
+mlx5_crypto_gcm_enqueue_burst(void *queue_pair,
+			      struct rte_crypto_op **ops,
+			      uint16_t nb_ops)
+{
+	struct mlx5_crypto_qp *qp = queue_pair;
+	struct mlx5_crypto_session *sess;
+	struct mlx5_crypto_priv *priv = qp->priv;
+	struct mlx5_crypto_gcm_data gcm_data;
+	struct rte_crypto_op *op;
+	uint16_t mask = qp->entries_n - 1;
+	uint16_t remain = qp->entries_n - (qp->pi - qp->ci);
+	uint32_t idx;
+	uint16_t umr_cnt = 0;
+
+	if (remain < nb_ops)
+		nb_ops = remain;
+	else
+		remain = nb_ops;
+	if (unlikely(remain == 0))
+		return 0;
+	do {
+		op = *ops++;
+		sess = CRYPTODEV_GET_SYM_SESS_PRIV(op->sym->session);
+		idx = qp->pi & mask;
+		if (mlx5_crypto_is_gcm_input_continuous(op)) {
+			gcm_data.src_addr = op->sym->aead.aad.data;
+			gcm_data.src_bytes = op->sym->aead.data.length + sess->aad_len;
+			gcm_data.dst_addr = gcm_data.src_addr;
+			gcm_data.dst_bytes = gcm_data.src_bytes;
+			if (sess->op_type == MLX5_CRYPTO_OP_TYPE_ENCRYPTION)
+				gcm_data.dst_bytes += sess->tag_len;
+			else
+				gcm_data.src_bytes += sess->tag_len;
+			gcm_data.mkey = mlx5_mr_mb2mr(&qp->mr_ctrl, op->sym->m_src);
+		} else {
+			if (unlikely(mlx5_crypto_gcm_build_umr(qp, op, idx, &gcm_data))) {
+				qp->stats.enqueue_err_count++;
+				if (remain != nb_ops) {
+					qp->stats.enqueued_count -= remain;
+					break;
+				}
+				return 0;
+			}
+			umr_cnt++;
+		}
+		mlx5_crypto_gcm_wqe_set(qp, op, idx, &gcm_data);
+		qp->ops[idx] = op;
+		qp->pi++;
+	} while (--remain);
+	qp->stats.enqueued_count += nb_ops;
+	if (!umr_cnt) {
+		mlx5_doorbell_ring(&priv->uar.bf_db, *(volatile uint64_t *)qp->wqe,
+				   qp->pi, &qp->qp_obj.db_rec[MLX5_SND_DBR],
+				   !priv->uar.dbnc);
+	} else {
+		mlx5_crypto_gcm_build_send_en(qp);
+		mlx5_doorbell_ring(&priv->uar.bf_db, *(volatile uint64_t *)qp->umr_wqe,
+				   qp->umr_pi, &qp->umr_qp_obj.db_rec[MLX5_SND_DBR],
+				   !priv->uar.dbnc);
+	}
+	qp->has_umr = !!umr_cnt;
+	return nb_ops;
+}
+
+static __rte_noinline void
+mlx5_crypto_gcm_cqe_err_handle(struct mlx5_crypto_qp *qp, struct rte_crypto_op *op)
+{
+	const uint32_t idx = qp->ci & (qp->entries_n - 1);
+	volatile struct mlx5_err_cqe *cqe = (volatile struct mlx5_err_cqe *)
+							&qp->cq_obj.cqes[idx];
+
+	if (op)
+		op->status = RTE_CRYPTO_OP_STATUS_ERROR;
+	qp->stats.dequeue_err_count++;
+	DRV_LOG(ERR, "CQE ERR:%x.\n", rte_be_to_cpu_32(cqe->syndrome));
+}
+
+static __rte_always_inline void
+mlx5_crypto_gcm_umr_cq_poll(struct mlx5_crypto_qp *qp)
+{
+	union {
+		struct {
+			uint16_t wqe_counter;
+			uint8_t rsvd5;
+			uint8_t op_own;
+		};
+		uint32_t word;
+	} last_word;
+	uint16_t cur_wqe_counter;
+
+	if (!qp->has_umr)
+		return;
+	last_word.word = rte_read32(&qp->umr_cq_obj.cqes[0].wqe_counter);
+	cur_wqe_counter = rte_be_to_cpu_16(last_word.wqe_counter);
+	if (cur_wqe_counter == qp->umr_last_pi)
+		return;
+	MLX5_ASSERT(MLX5_CQE_OPCODE(last_word.op_own) !=
+			MLX5_CQE_INVALID);
+	if (unlikely((MLX5_CQE_OPCODE(last_word.op_own) ==
+			   MLX5_CQE_RESP_ERR ||
+			   MLX5_CQE_OPCODE(last_word.op_own) ==
+			   MLX5_CQE_REQ_ERR)))
+		qp->umr_errors++;
+	qp->umr_last_pi = cur_wqe_counter;
+	qp->umr_ci++;
+	rte_io_wmb();
+	/* Ring CQ doorbell record. */
+	qp->umr_cq_obj.db_rec[0] = rte_cpu_to_be_32(qp->umr_ci);
+	qp->has_umr = false;
+}
+
+static uint16_t
+mlx5_crypto_gcm_dequeue_burst(void *queue_pair,
+			      struct rte_crypto_op **ops,
+			      uint16_t nb_ops)
+{
+	struct mlx5_crypto_qp *qp = queue_pair;
+	volatile struct mlx5_cqe *restrict cqe;
+	struct rte_crypto_op *restrict op;
+	const unsigned int cq_size = qp->entries_n;
+	const unsigned int mask = cq_size - 1;
+	uint32_t idx;
+	uint32_t next_idx = qp->ci & mask;
+	const uint16_t max = RTE_MIN((uint16_t)(qp->pi - qp->ci), nb_ops);
+	uint16_t i = 0;
+	int ret;
+
+	if (unlikely(max == 0))
+		return 0;
+	/* Handle the UMR CQE first. */
+	mlx5_crypto_gcm_umr_cq_poll(qp);
+	do {
+		idx = next_idx;
+		next_idx = (qp->ci + 1) & mask;
+		op = qp->ops[idx];
+		cqe = &qp->cq_obj.cqes[idx];
+		ret = check_cqe(cqe, cq_size, qp->ci);
+		rte_io_rmb();
+		if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
+			if (unlikely(ret != MLX5_CQE_STATUS_HW_OWN))
+				mlx5_crypto_gcm_cqe_err_handle(qp, op);
+			break;
+		}
+		op->status = RTE_CRYPTO_OP_STATUS_SUCCESS;
+		ops[i++] = op;
+		qp->ci++;
+	} while (i < max);
+	if (likely(i != 0)) {
+		rte_io_wmb();
+		qp->cq_obj.db_rec[0] = rte_cpu_to_be_32(qp->ci);
+		qp->stats.dequeued_count += i;
+	}
+	return i;
+}
+
 int
 mlx5_crypto_gcm_init(struct mlx5_crypto_priv *priv)
 {
@@ -386,6 +785,8 @@  mlx5_crypto_gcm_init(struct mlx5_crypto_priv *priv)
 	dev_ops->sym_session_configure = mlx5_crypto_sym_gcm_session_configure;
 	dev_ops->queue_pair_setup = mlx5_crypto_gcm_qp_setup;
 	dev_ops->queue_pair_release = mlx5_crypto_gcm_qp_release;
+	crypto_dev->dequeue_burst = mlx5_crypto_gcm_dequeue_burst;
+	crypto_dev->enqueue_burst = mlx5_crypto_gcm_enqueue_burst;
 	/* Generate GCM capability. */
 	ret = mlx5_crypto_generate_gcm_cap(&cdev->config.hca_attr.crypto_mmo,
 					   mlx5_crypto_gcm_caps);