[v2] regex/mlx5: add dynamic memory registration to datapath
diff mbox series

Message ID 1600948546-124153-1-git-send-email-yuvalav@nvidia.com
State Superseded
Delegated to: Thomas Monjalon
Headers show
Series
  • [v2] regex/mlx5: add dynamic memory registration to datapath
Related show

Checks

Context Check Description
ci/travis-robot warning Travis build: failed
ci/Intel-compilation success Compilation OK
ci/Performance-Testing fail build patch failure
ci/checkpatch warning coding style issues

Commit Message

Yuval Avnery Sept. 24, 2020, 11:55 a.m. UTC
From: Yuval Avnery <yuvalav@mellanox.com>

Currently, job data is copied into a pre-registered buffer.
To avoid this memcpy on the datapath, use dynamic memory registration.

This change reduces latency when sending regex jobs. The first few
jobs may see high latency due to registration, but as long as all
subsequent mbufs come from the same mempool/hugepage, no further
memory registration is needed.

Signed-off-by: Yuval Avnery <yuvalav@mellanox.com>
Acked-by: Ori Kam <orika@mellanox.com>

---
 drivers/regex/mlx5/mlx5_regex.c          | 13 ++++++++
 drivers/regex/mlx5/mlx5_regex.h          |  4 ++-
 drivers/regex/mlx5/mlx5_regex_control.c  | 16 ++++++++--
 drivers/regex/mlx5/mlx5_regex_fastpath.c | 55 +++++++++-----------------------
 4 files changed, 44 insertions(+), 44 deletions(-)

Comments

Thomas Monjalon Oct. 4, 2020, 7:38 p.m. UTC | #1
Hi Yuval,

> From: Yuval Avnery <yuvalav@mellanox.com>
> 
> Currently job data is being copied to pre-registered buffer.
> To avoid memcpy on the datapath, use dynamic memory registration.
> 
> This change will reduce latency when sending regex jobs. The first few
> jobs may have high latency due to registration, but assuming all
> following mbufs will arrive from the same mempool/hugepage, there will
> be no further memory registration.
> 
> Signed-off-by: Yuval Avnery <yuvalav@mellanox.com>
> Acked-by: Ori Kam <orika@mellanox.com>

There is a compilation issue:
	drivers/regex/mlx5/mlx5_regex.c:200:
	undefined reference to `mlx5_os_set_reg_mr_cb'
Yuval Avnery Oct. 5, 2020, 11:57 a.m. UTC | #2
Fixed in version 3

Patch
diff mbox series

diff --git a/drivers/regex/mlx5/mlx5_regex.c b/drivers/regex/mlx5/mlx5_regex.c
index 605ebcf..7ecff1e 100644
--- a/drivers/regex/mlx5/mlx5_regex.c
+++ b/drivers/regex/mlx5/mlx5_regex.c
@@ -109,6 +109,9 @@ 
 		pci_dev->addr.devid, pci_dev->addr.function);
 }
 
+void mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb,
+			   mlx5_dereg_mr_t *dereg_mr_cb);
+
 static int
 mlx5_regex_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		     struct rte_pci_device *pci_dev)
@@ -194,6 +197,16 @@ 
 	priv->regexdev->device = (struct rte_device *)pci_dev;
 	priv->regexdev->data->dev_private = priv;
 	priv->regexdev->state = RTE_REGEXDEV_READY;
+	mlx5_os_set_reg_mr_cb(&priv->mr_scache.reg_mr_cb,
+			      &priv->mr_scache.dereg_mr_cb);
+	ret = mlx5_mr_btree_init(&priv->mr_scache.cache,
+				 MLX5_MR_BTREE_CACHE_N * 2,
+				 rte_socket_id());
+	if (ret) {
+		DRV_LOG(ERR, "MR init tree failed.");
+	    rte_errno = ENOMEM;
+		goto error;
+	}
 	return 0;
 
 error:
diff --git a/drivers/regex/mlx5/mlx5_regex.h b/drivers/regex/mlx5/mlx5_regex.h
index e1cdf80..9a7353d 100644
--- a/drivers/regex/mlx5/mlx5_regex.h
+++ b/drivers/regex/mlx5/mlx5_regex.h
@@ -11,6 +11,7 @@ 
 #include <infiniband/mlx5dv.h>
 
 #include <mlx5_common.h>
+#include <mlx5_common_mr.h>
 
 #include "mlx5_rxp.h"
 
@@ -47,9 +48,9 @@  struct mlx5_regex_qp {
 	uint32_t free_sqs;
 	struct mlx5_regex_job *jobs;
 	struct ibv_mr *metadata;
-	struct ibv_mr *inputs;
 	struct ibv_mr *outputs;
 	size_t ci, pi;
+	struct mlx5_mr_ctrl mr_ctrl;
 };
 
 struct mlx5_regex_db {
@@ -78,6 +79,7 @@  struct mlx5_regex_priv {
 	struct mlx5dv_devx_uar *uar; /* UAR object. */
 	struct ibv_pd *pd;
 	struct mlx5_dbr_page_list dbrpgs; /* Door-bell pages. */
+	struct mlx5_mr_share_cache mr_scache; /* Global shared MR cache. */
 };
 
 /* mlx5_regex.c */
diff --git a/drivers/regex/mlx5/mlx5_regex_control.c b/drivers/regex/mlx5/mlx5_regex_control.c
index 187c3de..88b3d1a 100644
--- a/drivers/regex/mlx5/mlx5_regex_control.c
+++ b/drivers/regex/mlx5/mlx5_regex_control.c
@@ -10,6 +10,7 @@ 
 #include <rte_regexdev.h>
 #include <rte_regexdev_core.h>
 #include <rte_regexdev_driver.h>
+#include <rte_dev.h>
 
 #include <mlx5_common.h>
 #include <mlx5_glue.h>
@@ -350,7 +351,7 @@ 
 			     qp->nb_obj * sizeof(struct mlx5_regex_sq), 64);
 	if (!qp->sqs) {
 		DRV_LOG(ERR, "Can't allocate sq array memory.");
-		rte_errno  = ENOMEM;
+		rte_errno = ENOMEM;
 		return -rte_errno;
 	}
 	log_desc = rte_log2_u32(qp->nb_desc / qp->nb_obj);
@@ -367,16 +368,25 @@ 
 		}
 	}
 
+	ret = mlx5_mr_btree_init(&qp->mr_ctrl.cache_bh, MLX5_MR_BTREE_CACHE_N,
+				 rte_socket_id());
+	if (ret) {
+		DRV_LOG(ERR, "Error setting up mr btree");
+		goto err_btree;
+	}
+
 	ret = mlx5_regexdev_setup_fastpath(priv, qp_ind);
 	if (ret) {
-		DRV_LOG(ERR, "Fail to setup fastpath.");
+		DRV_LOG(ERR, "Error setting up fastpath");
 		goto err_fp;
 	}
 	return 0;
 
 err_fp:
+	mlx5_mr_btree_free(&qp->mr_ctrl.cache_bh);
+err_btree:
 	for (i = 0; i < qp->nb_obj; i++)
-		ret = regex_ctrl_destroy_sq(priv, qp, i);
+		regex_ctrl_destroy_sq(priv, qp, i);
 err_sq:
 	regex_ctrl_destroy_cq(priv, &qp->cq);
 err_cq:
diff --git a/drivers/regex/mlx5/mlx5_regex_fastpath.c b/drivers/regex/mlx5/mlx5_regex_fastpath.c
index 6fafcff..114061d 100644
--- a/drivers/regex/mlx5/mlx5_regex_fastpath.c
+++ b/drivers/regex/mlx5/mlx5_regex_fastpath.c
@@ -25,7 +25,6 @@ 
 
 #define MLX5_REGEX_MAX_WQE_INDEX 0xffff
 #define MLX5_REGEX_METADATA_SIZE 64
-#define MLX5_REGEX_MAX_INPUT (1 << 14)
 #define MLX5_REGEX_MAX_OUTPUT (1 << 11)
 #define MLX5_REGEX_WQE_CTRL_OFFSET 12
 #define MLX5_REGEX_WQE_METADATA_OFFSET 16
@@ -47,7 +46,6 @@ 
 
 struct mlx5_regex_job {
 	uint64_t user_id;
-	uint8_t *input;
 	volatile uint8_t *output;
 	volatile uint8_t *metadata;
 } __rte_cached_aligned;
@@ -100,16 +98,20 @@  struct mlx5_regex_job {
 }
 
 static inline void
-prep_one(struct mlx5_regex_sq *sq, struct rte_regex_ops *op,
+prep_one(struct mlx5_regex_priv *priv, struct mlx5_regex_qp *qp,
+	 struct mlx5_regex_sq *sq, struct rte_regex_ops *op,
 	 struct mlx5_regex_job *job)
 {
 	size_t wqe_offset = (sq->pi & (sq_size_get(sq) - 1)) * MLX5_SEND_WQE_BB;
+	uint32_t lkey;
+
+	lkey = mlx5_mr_addr2mr_bh(priv->pd, 0,
+				  &priv->mr_scache, &qp->mr_ctrl,
+				  rte_pktmbuf_mtod(op->mbuf, uintptr_t),
+				  !!(op->mbuf->ol_flags & EXT_ATTACHED_MBUF));
 	uint8_t *wqe = (uint8_t *)sq->wqe + wqe_offset;
 	int ds = 4; /*  ctrl + meta + input + output */
 
-	memcpy(job->input,
-		rte_pktmbuf_mtod(op->mbuf, void *),
-		rte_pktmbuf_data_len(op->mbuf));
 	set_wqe_ctrl_seg((struct mlx5_wqe_ctrl_seg *)wqe, sq->pi,
 			 MLX5_OPCODE_MMO, MLX5_OPC_MOD_MMO_REGEX, sq->obj->id,
 			 0, ds, 0, 0);
@@ -121,6 +123,9 @@  struct mlx5_regex_job {
 					     MLX5_REGEX_WQE_GATHER_OFFSET);
 	input_seg->byte_count =
 		rte_cpu_to_be_32(rte_pktmbuf_data_len(op->mbuf));
+	input_seg->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(op->mbuf,
+							    uintptr_t));
+	input_seg->lkey = lkey;
 	job->user_id = op->user_id;
 	sq->db_pi = sq->pi;
 	sq->pi = (sq->pi + 1) & MLX5_REGEX_MAX_WQE_INDEX;
@@ -167,7 +172,7 @@  struct mlx5_regex_job {
 		sq = &queue->sqs[sqid];
 		while (can_send(sq)) {
 			job_id = job_id_get(sqid, sq_size_get(sq), sq->pi);
-			prep_one(sq, ops[i], &queue->jobs[job_id]);
+			prep_one(priv, queue, sq, ops[i], &queue->jobs[job_id]);
 			i++;
 			if (unlikely(i == nb_ops)) {
 				send_doorbell(priv->uar, sq);
@@ -305,10 +310,6 @@  struct mlx5_regex_job {
 					 0, queue->metadata->lkey,
 					 (uintptr_t)job->metadata);
 			set_data_seg((struct mlx5_wqe_data_seg *)
-				     (wqe + MLX5_REGEX_WQE_GATHER_OFFSET),
-				     0, queue->inputs->lkey,
-				     (uintptr_t)job->input);
-			set_data_seg((struct mlx5_wqe_data_seg *)
 				     (wqe + MLX5_REGEX_WQE_SCATTER_OFFSET),
 				     MLX5_REGEX_MAX_OUTPUT,
 				     queue->outputs->lkey,
@@ -335,25 +336,10 @@  struct mlx5_regex_job {
 					 MLX5_REGEX_METADATA_SIZE*qp->nb_desc,
 					 IBV_ACCESS_LOCAL_WRITE);
 	if (!qp->metadata) {
+		DRV_LOG(ERR, "Failed to register metadata");
 		rte_free(ptr);
 		return -EINVAL;
 	}
-	ptr = rte_calloc(__func__, qp->nb_desc,
-			 MLX5_REGEX_MAX_INPUT,
-			 MLX5_REGEX_MAX_INPUT);
-
-	if (!ptr) {
-		err = -ENOMEM;
-		goto err_input;
-	}
-	qp->inputs = mlx5_glue->reg_mr(pd, ptr,
-				       MLX5_REGEX_MAX_INPUT*qp->nb_desc,
-				       IBV_ACCESS_LOCAL_WRITE);
-	if (!qp->inputs) {
-		rte_free(ptr);
-		err = -EINVAL;
-		goto err_input;
-	}
 
 	ptr = rte_calloc(__func__, qp->nb_desc,
 			 MLX5_REGEX_MAX_OUTPUT,
@@ -367,15 +353,13 @@  struct mlx5_regex_job {
 					IBV_ACCESS_LOCAL_WRITE);
 	if (!qp->outputs) {
 		rte_free(ptr);
+		DRV_LOG(ERR, "Failed to register output");
 		err = -EINVAL;
 		goto err_output;
 	}
 
 	/* distribute buffers to jobs */
 	for (i = 0; i < qp->nb_desc; i++) {
-		qp->jobs[i].input =
-			(uint8_t *)qp->inputs->addr +
-			(i % qp->nb_desc) * MLX5_REGEX_MAX_INPUT;
 		qp->jobs[i].output =
 			(uint8_t *)qp->outputs->addr +
 			(i % qp->nb_desc) * MLX5_REGEX_MAX_OUTPUT;
@@ -386,10 +370,6 @@  struct mlx5_regex_job {
 	return 0;
 
 err_output:
-	ptr = qp->inputs->addr;
-	rte_free(ptr);
-	mlx5_glue->dereg_mr(qp->inputs);
-err_input:
 	ptr = qp->metadata->addr;
 	rte_free(ptr);
 	mlx5_glue->dereg_mr(qp->metadata);
@@ -402,8 +382,7 @@  struct mlx5_regex_job {
 	struct mlx5_regex_qp *qp = &priv->qps[qp_id];
 	int err;
 
-	qp->jobs = rte_calloc(__func__, qp->nb_desc, sizeof(*qp->jobs),
-			      sizeof(*qp->jobs));
+	qp->jobs = rte_calloc(__func__, qp->nb_desc, sizeof(*qp->jobs), 64);
 	if (!qp->jobs)
 		return -ENOMEM;
 	err = setup_buffers(qp, priv->pd);
@@ -422,10 +401,6 @@  struct mlx5_regex_job {
 		mlx5_glue->dereg_mr(qp->metadata);
 		rte_free(qp->metadata->addr);
 	}
-	if (qp->inputs) {
-		mlx5_glue->dereg_mr(qp->inputs);
-		rte_free(qp->inputs->addr);
-	}
 	if (qp->outputs) {
 		mlx5_glue->dereg_mr(qp->outputs);
 		rte_free(qp->outputs->addr);