[v1,33/34] ml/cnxk: enable fast-path ops for TVM models

Message ID 20230830155927.3566-34-syalavarthi@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series Implementation of revised ml/cnxk driver

Checks

Context        Check    Description
ci/checkpatch  success  coding style OK

Commit Message

Srikanth Yalavarthi Aug. 30, 2023, 3:59 p.m. UTC
  From: Anup Prabhu <aprabhu@marvell.com>

Enable fast-path ops support for TVM models. Models of the
Hybrid and LLVM sub-types use TVMDP library function calls
to execute inference operations.

For the TVM MRVL model sub-type, which has a single MRVL
layer, inference requests are enqueued directly to the
hardware by the driver.
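
For reference, a minimal sketch (not the driver's verbatim code) of the
per-model dispatch this patch wires up. The callback names match those
assigned in mvtvm_ml_model_load(); the burst loop, layer_id value and
ring handling are simplified assumptions:

/* Sketch: generic burst enqueue dispatching to the per-model fast-path
 * callback chosen at load time. TVM MRVL models with a single MRVL
 * layer resolve to cn10k_ml_enqueue_single() and go straight to
 * hardware; Hybrid/LLVM models resolve to mvtvm_ml_enqueue_single()
 * and run through TVMDP. Ring wrap-around is omitted here.
 */
static uint16_t
sketch_enqueue_burst(struct cnxk_ml_dev *cnxk_mldev, uint16_t qp_id,
		     struct rte_ml_op **ops, uint16_t nb_ops)
{
	struct cnxk_ml_qp *qp = cnxk_mldev->mldev->data->queue_pairs[qp_id];
	struct cnxk_ml_model *model;
	uint16_t count = 0;

	while (count < nb_ops) {
		model = cnxk_mldev->mldev->data->models[ops[count]->model_id];
		if (!model->enqueue_single(cnxk_mldev, ops[count],
					   0 /* layer_id */, qp,
					   qp->queue.head))
			break;
		qp->queue.head++;
		count++;
	}

	return count;
}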

Signed-off-by: Anup Prabhu <aprabhu@marvell.com>
Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
---
 drivers/ml/cnxk/cn10k_ml_ops.c   |   4 -
 drivers/ml/cnxk/cnxk_ml_io.h     |   6 ++
 drivers/ml/cnxk/cnxk_ml_ops.c    |   4 +
 drivers/ml/cnxk/cnxk_ml_ops.h    |   9 +++
 drivers/ml/cnxk/mvtvm_ml_model.c |  20 +++++
 drivers/ml/cnxk/mvtvm_ml_model.h |   6 ++
 drivers/ml/cnxk/mvtvm_ml_ops.c   | 124 +++++++++++++++++++++++++++++++
 drivers/ml/cnxk/mvtvm_ml_ops.h   |  43 +++++++++++
 8 files changed, 212 insertions(+), 4 deletions(-)
  

Patch

diff --git a/drivers/ml/cnxk/cn10k_ml_ops.c b/drivers/ml/cnxk/cn10k_ml_ops.c
index 140f7a343f9..c1353fb0c81 100644
--- a/drivers/ml/cnxk/cn10k_ml_ops.c
+++ b/drivers/ml/cnxk/cn10k_ml_ops.c
@@ -287,10 +287,6 @@  cn10k_ml_dev_configure(struct cnxk_ml_dev *cnxk_mldev, const struct rte_ml_dev_c
 	else
 		cn10k_mldev->ml_jcmdq_enqueue = roc_ml_jcmdq_enqueue_lf;
 
-	cnxk_mldev->mldev->enqueue_burst = cnxk_ml_enqueue_burst;
-	cnxk_mldev->mldev->dequeue_burst = cnxk_ml_dequeue_burst;
-	cnxk_mldev->mldev->op_error_get = cn10k_ml_op_error_get;
-
 	return 0;
 }
 
diff --git a/drivers/ml/cnxk/cnxk_ml_io.h b/drivers/ml/cnxk/cnxk_ml_io.h
index 5de166c2520..6d5d25a7c9c 100644
--- a/drivers/ml/cnxk/cnxk_ml_io.h
+++ b/drivers/ml/cnxk/cnxk_ml_io.h
@@ -47,6 +47,12 @@  struct cnxk_ml_io {
 
 	/* Scale */
 	float scale;
+
+	/* Dequantized offset */
+	uint32_t offset_d;
+
+	/* Quantized offset */
+	uint32_t offset_q;
 };
 
 /* Model / Layer IO structure */
diff --git a/drivers/ml/cnxk/cnxk_ml_ops.c b/drivers/ml/cnxk/cnxk_ml_ops.c
index ff9ecd3c941..c8491646da9 100644
--- a/drivers/ml/cnxk/cnxk_ml_ops.c
+++ b/drivers/ml/cnxk/cnxk_ml_ops.c
@@ -758,6 +758,10 @@  cnxk_ml_dev_configure(struct rte_ml_dev *dev, const struct rte_ml_dev_config *co
 	cnxk_mldev->max_nb_layers =
 		cnxk_mldev->cn10k_mldev.fw.req->cn10k_req.jd.fw_load.cap.s.max_models;
 
+	cnxk_mldev->mldev->enqueue_burst = cnxk_ml_enqueue_burst;
+	cnxk_mldev->mldev->dequeue_burst = cnxk_ml_dequeue_burst;
+	cnxk_mldev->mldev->op_error_get = cn10k_ml_op_error_get;
+
 	/* Allocate and initialize index_map */
 	if (cnxk_mldev->index_map == NULL) {
 		cnxk_mldev->index_map =
diff --git a/drivers/ml/cnxk/cnxk_ml_ops.h b/drivers/ml/cnxk/cnxk_ml_ops.h
index 2575f4c6e10..62e2b17e35b 100644
--- a/drivers/ml/cnxk/cnxk_ml_ops.h
+++ b/drivers/ml/cnxk/cnxk_ml_ops.h
@@ -12,12 +12,21 @@ 
 
 #include "cn10k_ml_ops.h"
 
+#ifdef RTE_MLDEV_CNXK_ENABLE_MVTVM
+#include "mvtvm_ml_ops.h"
+#endif
+
 /* Request structure */
 struct cnxk_ml_req {
 	/* Device specific request */
 	union {
 		/* CN10K */
 		struct cn10k_ml_req cn10k_req;
+
+#ifdef RTE_MLDEV_CNXK_ENABLE_MVTVM
+		/* MVTVM */
+		struct mvtvm_ml_req mvtvm_req;
+#endif
 	};
 
 	/* Address of status field */
diff --git a/drivers/ml/cnxk/mvtvm_ml_model.c b/drivers/ml/cnxk/mvtvm_ml_model.c
index 24dc862d685..4ac053408e2 100644
--- a/drivers/ml/cnxk/mvtvm_ml_model.c
+++ b/drivers/ml/cnxk/mvtvm_ml_model.c
@@ -136,6 +136,16 @@  mvtvm_ml_model_io_info_update(struct cnxk_ml_model *model)
 		model->mvtvm.info.total_input_sz_d += model->mvtvm.info.input[i].sz_d;
 		model->mvtvm.info.total_input_sz_q += model->mvtvm.info.input[i].sz_q;
 
+		model->mvtvm.info.input[i].offset_d = model->mvtvm.info.total_input_sz_d;
+		model->mvtvm.info.input[i].offset_q = model->mvtvm.info.total_input_sz_q;
+
+		model->mvtvm.input_tensor[i].device = metadata->input[i].device;
+		model->mvtvm.input_tensor[i].ndim = metadata->input[i].ndim;
+		model->mvtvm.input_tensor[i].dtype = metadata->input[i].datatype;
+		model->mvtvm.input_tensor[i].shape = metadata->input[i].shape;
+		model->mvtvm.input_tensor[i].strides = NULL;
+		model->mvtvm.input_tensor[i].byte_offset = model->mvtvm.info.input[i].offset_q;
+
 		plt_ml_dbg("model_id = %u, input[%u] - sz_d = %u sz_q = %u", model->model_id, i,
 			   model->mvtvm.info.input[i].sz_d, model->mvtvm.info.input[i].sz_q);
 	}
@@ -169,6 +179,16 @@  mvtvm_ml_model_io_info_update(struct cnxk_ml_model *model)
 		model->mvtvm.info.total_output_sz_d += model->mvtvm.info.output[i].sz_d;
 		model->mvtvm.info.total_output_sz_q += model->mvtvm.info.output[i].sz_q;
 
+		model->mvtvm.info.output[i].offset_d = model->mvtvm.info.total_output_sz_d;
+		model->mvtvm.info.output[i].offset_q = model->mvtvm.info.total_output_sz_q;
+
+		model->mvtvm.output_tensor[i].device = metadata->output[i].device;
+		model->mvtvm.output_tensor[i].ndim = metadata->output[i].ndim;
+		model->mvtvm.output_tensor[i].dtype = metadata->output[i].datatype;
+		model->mvtvm.output_tensor[i].shape = metadata->output[i].shape;
+		model->mvtvm.output_tensor[i].strides = NULL;
+		model->mvtvm.output_tensor[i].byte_offset = model->mvtvm.info.output[i].offset_q;
+
 		plt_ml_dbg("model_id = %u, output[%u] - sz_d = %u sz_q = %u", model->model_id, i,
 			   model->mvtvm.info.output[i].sz_d, model->mvtvm.info.output[i].sz_q);
 	}
diff --git a/drivers/ml/cnxk/mvtvm_ml_model.h b/drivers/ml/cnxk/mvtvm_ml_model.h
index 57a6ce0bb1a..08e101bbe74 100644
--- a/drivers/ml/cnxk/mvtvm_ml_model.h
+++ b/drivers/ml/cnxk/mvtvm_ml_model.h
@@ -71,6 +71,12 @@  struct mvtvm_ml_model_data {
 
 	/* Stats for burst ops */
 	struct mvtvm_ml_model_xstats *burst_xstats;
+
+	/* Input Tensor */
+	DLTensor input_tensor[ML_CNXK_MODEL_MAX_INPUT_OUTPUT];
+
+	/* Output Tensor */
+	DLTensor output_tensor[ML_CNXK_MODEL_MAX_INPUT_OUTPUT];
 };
 
 int mvtvm_ml_model_blob_parse(struct rte_ml_model_params *params,
diff --git a/drivers/ml/cnxk/mvtvm_ml_ops.c b/drivers/ml/cnxk/mvtvm_ml_ops.c
index 0bee5884640..e8484b3bd92 100644
--- a/drivers/ml/cnxk/mvtvm_ml_ops.c
+++ b/drivers/ml/cnxk/mvtvm_ml_ops.c
@@ -23,6 +23,12 @@ 
 /* ML model macros */
 #define MVTVM_ML_MODEL_MEMZONE_NAME "ml_mvtvm_model_mz"
 
+__rte_hot static void
+mvtvm_ml_set_poll_addr(struct cnxk_ml_req *req)
+{
+	req->status = &req->mvtvm_req.status;
+}
+
 int
 mvtvm_ml_dev_configure(struct cnxk_ml_dev *cnxk_mldev, const struct rte_ml_dev_config *conf)
 {
@@ -174,6 +180,7 @@  mvtvm_ml_model_load(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_model_params *
 		callback->tvmrt_free = cn10k_ml_free;
 		callback->tvmrt_quantize = mvtvm_ml_io_quantize;
 		callback->tvmrt_dequantize = mvtvm_ml_io_dequantize;
+		callback->tvmrt_inference = cn10k_ml_inference_sync;
 	} else {
 		callback = NULL;
 	}
@@ -217,6 +224,19 @@  mvtvm_ml_model_load(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_model_params *
 		model->mvtvm.burst_xstats[qp_id].dequeued_count = 0;
 	}
 
+	/* Set model specific fast path functions */
+	if (model->subtype == ML_CNXK_MODEL_SUBTYPE_TVM_MRVL) {
+		model->enqueue_single = cn10k_ml_enqueue_single;
+		model->result_update = cn10k_ml_result_update;
+		model->set_error_code = cn10k_ml_set_error_code;
+		model->set_poll_addr = cn10k_ml_set_poll_addr;
+	} else {
+		model->enqueue_single = mvtvm_ml_enqueue_single;
+		model->result_update = mvtvm_ml_result_update;
+		model->set_error_code = mvtvm_ml_set_error_code;
+		model->set_poll_addr = mvtvm_ml_set_poll_addr;
+	}
+
 	return 0;
 
 error:
@@ -427,3 +447,107 @@  mvtvm_ml_io_dequantize(void *device, uint16_t model_id, const char *layer_name,
 
 	return 0;
 }
+
+static int
+mvtvm_ml_model_run(struct cnxk_ml_model *model, struct rte_ml_op *op, struct cnxk_ml_req *req)
+{
+	uint8_t i;
+
+	rte_memcpy(req->mvtvm_req.input_tensor, model->mvtvm.input_tensor,
+		   model->mvtvm.metadata.model.num_input * sizeof(DLTensor));
+	for (i = 0; i < model->mvtvm.metadata.model.num_input; i++) {
+		req->mvtvm_req.input_tensor[i].data = op->input[i]->addr;
+		req->mvtvm_req.input_tensor[i].byte_offset = 0;
+	}
+
+	rte_memcpy(req->mvtvm_req.output_tensor, model->mvtvm.output_tensor,
+		   model->mvtvm.metadata.model.num_output * sizeof(DLTensor));
+	for (i = 0; i < model->mvtvm.metadata.model.num_output; i++) {
+		req->mvtvm_req.output_tensor[i].data = op->output[i]->addr;
+		req->mvtvm_req.output_tensor[i].byte_offset = 0;
+	}
+
+	tvmdp_model_run(model->model_id, model->mvtvm.metadata.model.num_input,
+			req->mvtvm_req.input_tensor, model->mvtvm.metadata.model.num_output,
+			req->mvtvm_req.output_tensor, &req->mvtvm_req.result,
+			&req->mvtvm_req.status);
+
+	plt_write64(ML_CNXK_POLL_JOB_FINISH, req->status);
+
+	return 0;
+}
+
+__rte_hot void
+mvtvm_ml_set_error_code(struct cnxk_ml_req *req, uint64_t etype, uint64_t stype)
+{
+	RTE_SET_USED(stype);
+
+	req->mvtvm_req.result.error_code = etype;
+}
+
+__rte_hot bool
+mvtvm_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op, uint16_t layer_id,
+			struct cnxk_ml_qp *qp, uint64_t head)
+{
+	struct cnxk_ml_model *model;
+	struct cnxk_ml_queue *queue;
+	struct cnxk_ml_req *req;
+
+	RTE_SET_USED(layer_id);
+
+	queue = &qp->queue;
+	req = &queue->reqs[head];
+	model = cnxk_mldev->mldev->data->models[op->model_id];
+
+	model->set_poll_addr(req);
+	memset(&req->mvtvm_req.result, 0, sizeof(struct mvtvm_ml_result));
+	req->mvtvm_req.result.error_code = 0x0;
+	req->mvtvm_req.result.user_ptr = op->user_ptr;
+
+	cnxk_ml_set_poll_ptr(req);
+	mvtvm_ml_model_run(model, op, req);
+	req->timeout = plt_tsc_cycles() + queue->wait_cycles;
+	req->op = op;
+
+	return true;
+}
+
+__rte_hot void
+mvtvm_ml_result_update(struct cnxk_ml_dev *cnxk_mldev, int qp_id, void *request)
+{
+	struct mvtvm_ml_model_xstats *xstats;
+	struct mvtvm_ml_result *result;
+	struct cnxk_ml_model *model;
+	struct cnxk_ml_req *req;
+	uint64_t tvm_rt_latency;
+	struct cnxk_ml_qp *qp;
+	struct rte_ml_op *op;
+
+	req = (struct cnxk_ml_req *)request;
+	result = &req->mvtvm_req.result;
+	op = req->op;
+	qp = cnxk_mldev->mldev->data->queue_pairs[qp_id];
+	op->impl_opaque = result->error_code;
+
+	if (likely(result->error_code == 0)) {
+		qp->stats.dequeued_count++;
+		op->status = RTE_ML_OP_STATUS_SUCCESS;
+
+		model = cnxk_mldev->mldev->data->models[op->model_id];
+		xstats = &model->mvtvm.burst_xstats[qp_id];
+
+		if (unlikely(xstats->dequeued_count == xstats->tvm_rt_reset_count)) {
+			xstats->tvm_rt_latency_min = UINT64_MAX;
+			xstats->tvm_rt_latency_max = 0;
+		}
+		tvm_rt_latency = result->stats.end_ns - result->stats.start_ns;
+		xstats->tvm_rt_latency = tvm_rt_latency;
+		xstats->tvm_rt_latency_tot += tvm_rt_latency;
+		xstats->tvm_rt_latency_min = RTE_MIN(xstats->tvm_rt_latency_min, tvm_rt_latency);
+		xstats->tvm_rt_latency_max = RTE_MAX(xstats->tvm_rt_latency_max, tvm_rt_latency);
+		xstats->dequeued_count++;
+	} else {
+		qp->stats.dequeue_err_count++;
+		op->status = RTE_ML_OP_STATUS_ERROR;
+	}
+}
diff --git a/drivers/ml/cnxk/mvtvm_ml_ops.h b/drivers/ml/cnxk/mvtvm_ml_ops.h
index 3a1e97a7a08..dba055c22e7 100644
--- a/drivers/ml/cnxk/mvtvm_ml_ops.h
+++ b/drivers/ml/cnxk/mvtvm_ml_ops.h
@@ -11,6 +11,44 @@ 
 
 struct cnxk_ml_dev;
 struct cnxk_ml_model;
+struct cnxk_ml_qp;
+struct cnxk_ml_req;
+
+/* Inference stats */
+struct mvtvm_ml_stats {
+	/* Start ns */
+	uint64_t start_ns;
+
+	/* End ns */
+	uint64_t end_ns;
+};
+
+/* Result structure */
+struct mvtvm_ml_result {
+	/* Job error code */
+	uint64_t error_code;
+
+	/* Inference stats */
+	struct mvtvm_ml_stats stats;
+
+	/* User context pointer */
+	void *user_ptr;
+};
+
+/* MVTVM specific request */
+struct mvtvm_ml_req {
+	/* Input tensors */
+	DLTensor input_tensor[ML_CNXK_MODEL_MAX_INPUT_OUTPUT];
+
+	/* Output tensors */
+	DLTensor output_tensor[ML_CNXK_MODEL_MAX_INPUT_OUTPUT];
+
+	/* Status field for poll mode requests */
+	volatile uint64_t status;
+
+	/* Result */
+	struct mvtvm_ml_result result;
+};
 
 int mvtvm_ml_dev_configure(struct cnxk_ml_dev *cnxk_mldev, const struct rte_ml_dev_config *conf);
 int mvtvm_ml_dev_close(struct cnxk_ml_dev *cnxk_mldev);
@@ -24,4 +62,9 @@  int mvtvm_ml_io_quantize(void *device, uint16_t model_id, const char *layer_name
 int mvtvm_ml_io_dequantize(void *device, uint16_t model_id, const char *layer_name, void *qbuffer,
 			   const DLTensor **deq_tensor);
 
+__rte_hot bool mvtvm_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
+				       uint16_t layer_id, struct cnxk_ml_qp *qp, uint64_t head);
+__rte_hot void mvtvm_ml_result_update(struct cnxk_ml_dev *cnxk_mldev, int qp_id, void *request);
+__rte_hot void mvtvm_ml_set_error_code(struct cnxk_ml_req *req, uint64_t etype, uint64_t stype);
+
 #endif /* _MVTVM_ML_OPS_H_ */
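
For completeness, a hedged sketch of the poll-mode completion check
implied by the status protocol above: mvtvm_ml_set_poll_addr() points
req->status at the request's own status word, and mvtvm_ml_model_run()
writes ML_CNXK_POLL_JOB_FINISH to it after tvmdp_model_run() returns.
The helper name and return convention below are illustrative, not the
driver's actual dequeue code:

/* Sketch: dequeue-side poll of one request. On completion the caller
 * would invoke model->result_update() to set op->status and update
 * xstats, as mvtvm_ml_result_update() does above. -ETIME/-EAGAIN are
 * an assumed convention, not the driver's.
 */
static inline int
sketch_poll_request(struct cnxk_ml_req *req)
{
	if (plt_read64(req->status) == ML_CNXK_POLL_JOB_FINISH)
		return 0;	/* finished */
	if (plt_tsc_cycles() > req->timeout)
		return -ETIME;	/* gave up waiting */
	return -EAGAIN;		/* still running; poll again */
}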