[v4,33/34] ml/cnxk: enable fast-path ops for TVM models

Message ID 20231017165951.27299-34-syalavarthi@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Jerin Jacob
Series Implementation of revised ml/cnxk driver

Checks

Context        Check    Description
ci/checkpatch  success  coding style OK

Commit Message

Srikanth Yalavarthi Oct. 17, 2023, 4:59 p.m. UTC
  From: Anup Prabhu <aprabhu@marvell.com>

Enable fast-path ops support for TVM models. For the Hybrid and
LLVM model sub-types, inference operations are executed through
TVMDP library function calls.

For the TVM MRVL model sub-type, which consists of a single MRVL
layer, the driver enqueues inference requests directly to the
hardware.

Signed-off-by: Anup Prabhu <aprabhu@marvell.com>
Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
---
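Note: the sketch below summarizes the dispatch that this patch
completes; it is illustrative only. The per-model enqueue_single,
result_update, set_error_code and set_poll_addr hooks installed at
load time are from this series, but the burst-loop body and the
ring handling shown here are assumptions, not the exact
cnxk_ml_enqueue_burst implementation.

  /* Illustrative sketch, not the actual driver code. */
  static uint16_t
  sketch_enqueue_burst(struct cnxk_ml_dev *cnxk_mldev, uint16_t qp_id,
                       struct rte_ml_op **ops, uint16_t nb_ops)
  {
      struct cnxk_ml_qp *qp = cnxk_mldev->mldev->data->queue_pairs[qp_id];
      uint16_t count = 0;

      while (count < nb_ops) {
          struct cnxk_ml_model *model =
              cnxk_mldev->mldev->data->models[ops[count]->model_id];

          /* TVM MRVL single-layer models take the cn10k hardware
           * path; Hybrid and LLVM models take the TVMDP software
           * path. Both sit behind the same per-model hook.
           */
          if (!model->enqueue_single(cnxk_mldev, ops[count],
                                     0 /* layer_id */, qp,
                                     qp->queue.head + count))
              break;

          count++;
      }

      return count;
  }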
 doc/guides/rel_notes/release_23_11.rst |   4 +
 drivers/ml/cnxk/cn10k_ml_ops.c         |   4 -
 drivers/ml/cnxk/cnxk_ml_io.h           |   6 ++
 drivers/ml/cnxk/cnxk_ml_ops.c          |   4 +
 drivers/ml/cnxk/cnxk_ml_ops.h          |   5 +
 drivers/ml/cnxk/mvtvm_ml_model.c       |  20 ++++
 drivers/ml/cnxk/mvtvm_ml_model.h       |   6 ++
 drivers/ml/cnxk/mvtvm_ml_ops.c         | 124 +++++++++++++++++++++++++
 drivers/ml/cnxk/mvtvm_ml_ops.h         |  43 +++++++++
 9 files changed, 212 insertions(+), 4 deletions(-)

Patch

diff --git a/doc/guides/rel_notes/release_23_11.rst b/doc/guides/rel_notes/release_23_11.rst
index 8701350b2e..ba4d162287 100644
--- a/doc/guides/rel_notes/release_23_11.rst
+++ b/doc/guides/rel_notes/release_23_11.rst
@@ -28,6 +28,10 @@  New Features
 
      Added support in mldev library for models with multiple inputs and outputs.
 
+   * **Added support for Marvell TVM models in ML CNXK driver.**
+
+     Added support for models compiled using TVM framework in ML CNXK driver.
+
 
 .. This section should contain new features added in this release.
    Sample format:
diff --git a/drivers/ml/cnxk/cn10k_ml_ops.c b/drivers/ml/cnxk/cn10k_ml_ops.c
index 01b0a44caa..b9d30278c6 100644
--- a/drivers/ml/cnxk/cn10k_ml_ops.c
+++ b/drivers/ml/cnxk/cn10k_ml_ops.c
@@ -371,10 +371,6 @@  cn10k_ml_dev_configure(struct cnxk_ml_dev *cnxk_mldev, const struct rte_ml_dev_c
 	else
 		cn10k_mldev->ml_jcmdq_enqueue = roc_ml_jcmdq_enqueue_lf;
 
-	cnxk_mldev->mldev->enqueue_burst = cnxk_ml_enqueue_burst;
-	cnxk_mldev->mldev->dequeue_burst = cnxk_ml_dequeue_burst;
-	cnxk_mldev->mldev->op_error_get = cn10k_ml_op_error_get;
-
 	return 0;
 }
 
diff --git a/drivers/ml/cnxk/cnxk_ml_io.h b/drivers/ml/cnxk/cnxk_ml_io.h
index 5de166c252..6d5d25a7c9 100644
--- a/drivers/ml/cnxk/cnxk_ml_io.h
+++ b/drivers/ml/cnxk/cnxk_ml_io.h
@@ -47,6 +47,12 @@  struct cnxk_ml_io {
 
 	/* Scale */
 	float scale;
+
+	/* Dequantized offset */
+	uint32_t offset_d;
+
+	/* Quantized offset */
+	uint32_t offset_q;
 };
 
 /* Model / Layer IO structure */
diff --git a/drivers/ml/cnxk/cnxk_ml_ops.c b/drivers/ml/cnxk/cnxk_ml_ops.c
index fd2c46ac1f..608e9fc4ca 100644
--- a/drivers/ml/cnxk/cnxk_ml_ops.c
+++ b/drivers/ml/cnxk/cnxk_ml_ops.c
@@ -632,6 +632,10 @@  cnxk_ml_dev_configure(struct rte_ml_dev *dev, const struct rte_ml_dev_config *co
 	cnxk_mldev->max_nb_layers =
 		cnxk_mldev->cn10k_mldev.fw.req->cn10k_req.jd.fw_load.cap.s.max_models;
 
+	cnxk_mldev->mldev->enqueue_burst = cnxk_ml_enqueue_burst;
+	cnxk_mldev->mldev->dequeue_burst = cnxk_ml_dequeue_burst;
+	cnxk_mldev->mldev->op_error_get = cn10k_ml_op_error_get;
+
 	/* Allocate and initialize index_map */
 	if (cnxk_mldev->index_map == NULL) {
 		cnxk_mldev->index_map =
diff --git a/drivers/ml/cnxk/cnxk_ml_ops.h b/drivers/ml/cnxk/cnxk_ml_ops.h
index ab32676b3e..7b49793a57 100644
--- a/drivers/ml/cnxk/cnxk_ml_ops.h
+++ b/drivers/ml/cnxk/cnxk_ml_ops.h
@@ -24,6 +24,11 @@  struct cnxk_ml_req {
 	union {
 		/* CN10K */
 		struct cn10k_ml_req cn10k_req;
+
+#ifdef RTE_MLDEV_CNXK_ENABLE_MVTVM
+		/* MVTVM */
+		struct mvtvm_ml_req mvtvm_req;
+#endif
 	};
 
 	/* Address of status field */
diff --git a/drivers/ml/cnxk/mvtvm_ml_model.c b/drivers/ml/cnxk/mvtvm_ml_model.c
index 4c12f584d5..1dfd0d176a 100644
--- a/drivers/ml/cnxk/mvtvm_ml_model.c
+++ b/drivers/ml/cnxk/mvtvm_ml_model.c
@@ -198,6 +198,16 @@  mvtvm_ml_model_io_info_set(struct cnxk_ml_model *model)
 		model->mvtvm.info.total_input_sz_d += model->mvtvm.info.input[i].sz_d;
 		model->mvtvm.info.total_input_sz_q += model->mvtvm.info.input[i].sz_q;
 
+		model->mvtvm.info.input[i].offset_d = model->mvtvm.info.total_input_sz_d;
+		model->mvtvm.info.input[i].offset_q = model->mvtvm.info.total_input_sz_q;
+
+		model->mvtvm.input_tensor[i].device = metadata->input[i].device;
+		model->mvtvm.input_tensor[i].ndim = metadata->input[i].ndim;
+		model->mvtvm.input_tensor[i].dtype = metadata->input[i].datatype;
+		model->mvtvm.input_tensor[i].shape = metadata->input[i].shape;
+		model->mvtvm.input_tensor[i].strides = NULL;
+		model->mvtvm.input_tensor[i].byte_offset = model->mvtvm.info.input[i].offset_q;
+
 		plt_ml_dbg("model_id = %u, input[%u] - sz_d = %u sz_q = %u", model->model_id, i,
 			   model->mvtvm.info.input[i].sz_d, model->mvtvm.info.input[i].sz_q);
 	}
@@ -231,6 +241,16 @@  mvtvm_ml_model_io_info_set(struct cnxk_ml_model *model)
 		model->mvtvm.info.total_output_sz_d += model->mvtvm.info.output[i].sz_d;
 		model->mvtvm.info.total_output_sz_q += model->mvtvm.info.output[i].sz_q;
 
+		model->mvtvm.info.output[i].offset_d = model->mvtvm.info.total_output_sz_d;
+		model->mvtvm.info.output[i].offset_q = model->mvtvm.info.total_output_sz_q;
+
+		model->mvtvm.output_tensor[i].device = metadata->output[i].device;
+		model->mvtvm.output_tensor[i].ndim = metadata->output[i].ndim;
+		model->mvtvm.output_tensor[i].dtype = metadata->output[i].datatype;
+		model->mvtvm.output_tensor[i].shape = metadata->output[i].shape;
+		model->mvtvm.output_tensor[i].strides = NULL;
+		model->mvtvm.output_tensor[i].byte_offset = model->mvtvm.info.output[i].offset_q;
+
 		plt_ml_dbg("model_id = %u, output[%u] - sz_d = %u sz_q = %u", model->model_id, i,
 			   model->mvtvm.info.output[i].sz_d, model->mvtvm.info.output[i].sz_q);
 	}
diff --git a/drivers/ml/cnxk/mvtvm_ml_model.h b/drivers/ml/cnxk/mvtvm_ml_model.h
index 66c3af18e1..7ffce38094 100644
--- a/drivers/ml/cnxk/mvtvm_ml_model.h
+++ b/drivers/ml/cnxk/mvtvm_ml_model.h
@@ -69,6 +69,12 @@  struct mvtvm_ml_model_data {
 
 	/* Stats for burst ops */
 	struct mvtvm_ml_model_xstats *burst_xstats;
+
+	/* Input Tensor */
+	DLTensor input_tensor[ML_CNXK_MODEL_MAX_INPUT_OUTPUT];
+
+	/* Output Tensor */
+	DLTensor output_tensor[ML_CNXK_MODEL_MAX_INPUT_OUTPUT];
 };
 
 enum cnxk_ml_model_type mvtvm_ml_model_type_get(struct rte_ml_model_params *params);
diff --git a/drivers/ml/cnxk/mvtvm_ml_ops.c b/drivers/ml/cnxk/mvtvm_ml_ops.c
index 776675843a..1e74b82a0a 100644
--- a/drivers/ml/cnxk/mvtvm_ml_ops.c
+++ b/drivers/ml/cnxk/mvtvm_ml_ops.c
@@ -19,6 +19,12 @@ 
 /* ML model macros */
 #define MVTVM_ML_MODEL_MEMZONE_NAME "ml_mvtvm_model_mz"
 
+__rte_hot static void
+mvtvm_ml_set_poll_addr(struct cnxk_ml_req *req)
+{
+	req->status = &req->mvtvm_req.status;
+}
+
 void
 mvtvm_ml_model_xstat_name_set(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model,
 			      uint16_t stat_id, uint16_t entry, char *suffix)
@@ -242,6 +248,7 @@  mvtvm_ml_model_load(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_model_params *
 		callback->tvmrt_free = cn10k_ml_free;
 		callback->tvmrt_quantize = mvtvm_ml_io_quantize;
 		callback->tvmrt_dequantize = mvtvm_ml_io_dequantize;
+		callback->tvmrt_inference = cn10k_ml_inference_sync;
 	} else {
 		callback = NULL;
 	}
@@ -285,6 +292,19 @@  mvtvm_ml_model_load(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_model_params *
 		model->mvtvm.burst_xstats[qp_id].dequeued_count = 0;
 	}
 
+	/* Set model specific fast path functions */
+	if (model->subtype == ML_CNXK_MODEL_SUBTYPE_TVM_MRVL) {
+		model->enqueue_single = cn10k_ml_enqueue_single;
+		model->result_update = cn10k_ml_result_update;
+		model->set_error_code = cn10k_ml_set_error_code;
+		model->set_poll_addr = cn10k_ml_set_poll_addr;
+	} else {
+		model->enqueue_single = mvtvm_ml_enqueue_single;
+		model->result_update = mvtvm_ml_result_update;
+		model->set_error_code = mvtvm_ml_set_error_code;
+		model->set_poll_addr = mvtvm_ml_set_poll_addr;
+	}
+
 	return 0;
 
 error:
@@ -495,3 +515,107 @@  mvtvm_ml_io_dequantize(void *device, uint16_t model_id, const char *layer_name,
 
 	return 0;
 }
+
+static int
+mvtvm_ml_model_run(struct cnxk_ml_model *model, struct rte_ml_op *op, struct cnxk_ml_req *req)
+{
+	uint8_t i;
+
+	rte_memcpy(req->mvtvm_req.input_tensor, model->mvtvm.input_tensor,
+		   model->mvtvm.metadata.model.num_input * sizeof(DLTensor));
+	for (i = 0; i < model->mvtvm.metadata.model.num_input; i++) {
+		req->mvtvm_req.input_tensor[i].data = op->input[i]->addr;
+		req->mvtvm_req.input_tensor[i].byte_offset = 0;
+	}
+
+	rte_memcpy(req->mvtvm_req.output_tensor, model->mvtvm.output_tensor,
+		   model->mvtvm.metadata.model.num_output * sizeof(DLTensor));
+	for (i = 0; i < model->mvtvm.metadata.model.num_output; i++) {
+		req->mvtvm_req.output_tensor[i].data = op->output[i]->addr;
+		req->mvtvm_req.output_tensor[i].byte_offset = 0;
+	}
+
+	tvmdp_model_run(model->model_id, model->mvtvm.metadata.model.num_input,
+			req->mvtvm_req.input_tensor, model->mvtvm.metadata.model.num_output,
+			req->mvtvm_req.output_tensor, &req->mvtvm_req.result,
+			&req->mvtvm_req.status);
+
+	plt_write64(ML_CNXK_POLL_JOB_FINISH, req->status);
+
+	return 0;
+}
+
+__rte_hot void
+mvtvm_ml_set_error_code(struct cnxk_ml_req *req, uint64_t etype, uint64_t stype)
+{
+	RTE_SET_USED(stype);
+
+	req->mvtvm_req.result.error_code = etype;
+}
+
+__rte_hot bool
+mvtvm_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op, uint16_t layer_id,
+			struct cnxk_ml_qp *qp, uint64_t head)
+{
+	struct cnxk_ml_model *model;
+	struct cnxk_ml_queue *queue;
+	struct cnxk_ml_req *req;
+
+	RTE_SET_USED(layer_id);
+
+	queue = &qp->queue;
+	req = &queue->reqs[head];
+	model = cnxk_mldev->mldev->data->models[op->model_id];
+
+	model->set_poll_addr(req);
+	memset(&req->mvtvm_req.result, 0, sizeof(struct mvtvm_ml_result));
+	req->mvtvm_req.result.error_code = 0x0;
+	req->mvtvm_req.result.user_ptr = op->user_ptr;
+
+	cnxk_ml_set_poll_ptr(req);
+	mvtvm_ml_model_run(model, op, req);
+	req->timeout = plt_tsc_cycles() + queue->wait_cycles;
+	req->op = op;
+
+	return true;
+}
+
+__rte_hot void
+mvtvm_ml_result_update(struct cnxk_ml_dev *cnxk_mldev, int qp_id, void *request)
+{
+	struct mvtvm_ml_model_xstats *xstats;
+	struct mvtvm_ml_result *result;
+	struct cnxk_ml_model *model;
+	struct cnxk_ml_req *req;
+	uint64_t tvm_rt_latency;
+	struct cnxk_ml_qp *qp;
+	struct rte_ml_op *op;
+
+	req = (struct cnxk_ml_req *)request;
+	result = &req->mvtvm_req.result;
+	op = req->op;
+	qp = cnxk_mldev->mldev->data->queue_pairs[qp_id];
+	op->impl_opaque = result->error_code;
+
+	if (likely(result->error_code == 0)) {
+		qp->stats.dequeued_count++;
+		op->status = RTE_ML_OP_STATUS_SUCCESS;
+
+		model = cnxk_mldev->mldev->data->models[op->model_id];
+		xstats = &model->mvtvm.burst_xstats[qp_id];
+
+		if (unlikely(xstats->dequeued_count == xstats->tvm_rt_reset_count)) {
+			xstats->tvm_rt_latency_min = UINT64_MAX;
+			xstats->tvm_rt_latency_max = 0;
+		}
+		tvm_rt_latency = result->stats.end_ns - result->stats.start_ns;
+		xstats->tvm_rt_latency = tvm_rt_latency;
+		xstats->tvm_rt_latency_tot += tvm_rt_latency;
+		xstats->tvm_rt_latency_min = RTE_MIN(xstats->tvm_rt_latency_min, tvm_rt_latency);
+		xstats->tvm_rt_latency_max = RTE_MAX(xstats->tvm_rt_latency_max, tvm_rt_latency);
+		xstats->dequeued_count++;
+	} else {
+		qp->stats.dequeue_err_count++;
+		op->status = RTE_ML_OP_STATUS_ERROR;
+	}
+}
diff --git a/drivers/ml/cnxk/mvtvm_ml_ops.h b/drivers/ml/cnxk/mvtvm_ml_ops.h
index 4cabe30a82..cb4b219743 100644
--- a/drivers/ml/cnxk/mvtvm_ml_ops.h
+++ b/drivers/ml/cnxk/mvtvm_ml_ops.h
@@ -16,6 +16,44 @@ 
 struct cnxk_ml_dev;
 struct cnxk_ml_model;
 struct cnxk_ml_layer;
+struct cnxk_ml_qp;
+struct cnxk_ml_req;
+
+/* Inference stats */
+struct mvtvm_ml_stats {
+	/* Start ns */
+	uint64_t start_ns;
+
+	/* End ns */
+	uint64_t end_ns;
+};
+
+/* Result structure */
+struct mvtvm_ml_result {
+	/* Job error code */
+	uint64_t error_code;
+
+	/* Inference stats */
+	struct mvtvm_ml_stats stats;
+
+	/* User context pointer */
+	void *user_ptr;
+};
+
+/* MVTVM specific request */
+struct mvtvm_ml_req {
+	/* Input tensors */
+	DLTensor input_tensor[ML_CNXK_MODEL_MAX_INPUT_OUTPUT];
+
+	/* Output tensors */
+	DLTensor output_tensor[ML_CNXK_MODEL_MAX_INPUT_OUTPUT];
+
+	/* Status field for poll mode requests */
+	volatile uint64_t status;
+
+	/* Result */
+	struct mvtvm_ml_result result;
+};
 
 int mvtvm_ml_dev_configure(struct cnxk_ml_dev *cnxk_mldev, const struct rte_ml_dev_config *conf);
 int mvtvm_ml_dev_close(struct cnxk_ml_dev *cnxk_mldev);
@@ -29,6 +67,11 @@  int mvtvm_ml_io_quantize(void *device, uint16_t model_id, const char *layer_name
 int mvtvm_ml_io_dequantize(void *device, uint16_t model_id, const char *layer_name, void *qbuffer,
 			   const DLTensor **deq_tensor);
 
+__rte_hot bool mvtvm_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
+				       uint16_t layer_id, struct cnxk_ml_qp *qp, uint64_t head);
+__rte_hot void mvtvm_ml_result_update(struct cnxk_ml_dev *cnxk_mldev, int qp_id, void *request);
+__rte_hot void mvtvm_ml_set_error_code(struct cnxk_ml_req *req, uint64_t etype, uint64_t stype);
+
 void mvtvm_ml_model_xstat_name_set(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model,
 				   uint16_t stat_id, uint16_t entry, char *suffix);
 uint64_t mvtvm_ml_model_xstat_get(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_model *model,
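
For context on the completion path added in mvtvm_ml_ops.c:
mvtvm_ml_enqueue_single() points req->status at the software status
word in the MVTVM request via the model's set_poll_addr hook, and
mvtvm_ml_model_run() writes ML_CNXK_POLL_JOB_FINISH there once
tvmdp_model_run() has been issued. A minimal dequeue-side sketch
follows; the function body and ring arithmetic are assumptions for
illustration, not the exact cnxk_ml_dequeue_burst code.

  /* Illustrative sketch, not the actual driver code. */
  static uint16_t
  sketch_dequeue_burst(struct cnxk_ml_dev *cnxk_mldev, uint16_t qp_id,
                       struct rte_ml_op **ops, uint16_t nb_ops)
  {
      struct cnxk_ml_qp *qp = cnxk_mldev->mldev->data->queue_pairs[qp_id];
      struct cnxk_ml_queue *queue = &qp->queue;
      uint16_t count = 0;

      while (count < nb_ops && queue->tail != queue->head) {
          struct cnxk_ml_req *req = &queue->reqs[queue->tail];

          /* Completion is detected by polling the status word set
           * up at enqueue time; TVM software jobs and cn10k
           * hardware jobs both report through req->status.
           */
          if (plt_read64(req->status) != ML_CNXK_POLL_JOB_FINISH)
              break;

          /* Copies the error code into op->impl_opaque and updates
           * queue-pair stats and per-model TVM runtime latency
           * xstats, as mvtvm_ml_result_update() above does.
           */
          cnxk_mldev->mldev->data->models[req->op->model_id]
              ->result_update(cnxk_mldev, qp_id, req);

          ops[count++] = req->op;
          queue->tail++; /* illustrative; the real ring wraps */
      }

      return count;
  }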