[v1,1/1] ml/cnxk: updates to cn10k error handling

Message ID 20240731063803.9223-1-syalavarthi@marvell.com (mailing list archive)
State New
Delegated to: Jerin Jacob
Headers
Series [v1,1/1] ml/cnxk: updates to cn10k error handling |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS
ci/github-robot: build success github build: passed
ci/intel-Functional success Functional PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-marvell-Functional success Functional Testing PASS
ci/iol-abi-testing success Testing PASS
ci/iol-sample-apps-testing success Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-unit-arm64-testing success Testing PASS
ci/iol-unit-amd64-testing success Testing PASS
ci/iol-compile-arm64-testing success Testing PASS
ci/iol-compile-amd64-testing success Testing PASS

Commit Message

Srikanth Yalavarthi July 31, 2024, 6:38 a.m. UTC
Renamed cnxk error codes to cn10k error codes. Added
support for model-specific op_error_get routines.

Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
---
 drivers/ml/cnxk/cn10k_ml_dev.c  |  8 ++++++++
 drivers/ml/cnxk/cn10k_ml_dev.h  | 16 ++++++++++++++++
 drivers/ml/cnxk/cn10k_ml_ops.c  | 20 +++++++++++---------
 drivers/ml/cnxk/cn10k_ml_ops.h  |  2 +-
 drivers/ml/cnxk/cnxk_ml_dev.c   |  8 --------
 drivers/ml/cnxk/cnxk_ml_dev.h   | 18 +-----------------
 drivers/ml/cnxk/cnxk_ml_model.h |  3 +++
 drivers/ml/cnxk/cnxk_ml_ops.c   | 18 ++++++++++++++----
 drivers/ml/cnxk/cnxk_ml_ops.h   |  2 ++
 drivers/ml/cnxk/mvtvm_ml_ops.c  | 13 +++++++++++++
 drivers/ml/cnxk/mvtvm_ml_ops.h  |  2 ++
 11 files changed, 71 insertions(+), 39 deletions(-)
  

Patch

diff --git a/drivers/ml/cnxk/cn10k_ml_dev.c b/drivers/ml/cnxk/cn10k_ml_dev.c
index 41f3b7a95da..2e719919ce1 100644
--- a/drivers/ml/cnxk/cn10k_ml_dev.c
+++ b/drivers/ml/cnxk/cn10k_ml_dev.c
@@ -58,6 +58,14 @@  static const char *const valid_args[] = {CN10K_ML_FW_PATH,
 /* Supported OCM page sizes: 1KB, 2KB, 4KB, 8KB and 16KB */
 static const int valid_ocm_page_size[] = {1024, 2048, 4096, 8192, 16384};
 
+/* Error type database */
+struct cn10k_ml_error_db ml_etype_db[] = {
+	{ML_CN10K_ETYPE_NO_ERROR, "NO_ERROR"},	      {ML_CN10K_ETYPE_FW_NONFATAL, "FW_NON_FATAL"},
+	{ML_CN10K_ETYPE_HW_NONFATAL, "HW_NON_FATAL"}, {ML_CN10K_ETYPE_HW_FATAL, "HW_FATAL"},
+	{ML_CN10K_ETYPE_HW_WARNING, "HW_WARNING"},    {ML_CN10K_ETYPE_DRIVER, "DRIVER_ERROR"},
+	{ML_CN10K_ETYPE_UNKNOWN, "UNKNOWN_ERROR"},
+};
+
 static int
 parse_string_arg(const char *key __rte_unused, const char *value, void *extra_args)
 {
diff --git a/drivers/ml/cnxk/cn10k_ml_dev.h b/drivers/ml/cnxk/cn10k_ml_dev.h
index ddb8b67e06e..dadb3b571ba 100644
--- a/drivers/ml/cnxk/cn10k_ml_dev.h
+++ b/drivers/ml/cnxk/cn10k_ml_dev.h
@@ -46,6 +46,22 @@  struct cnxk_ml_dev;
 struct cnxk_ml_req;
 struct cnxk_ml_qp;
 
+/* Error types enumeration */
+enum cn10k_ml_error_etype {
+	/* 0x0 */ ML_CN10K_ETYPE_NO_ERROR = 0, /* No error */
+	/* 0x1 */ ML_CN10K_ETYPE_FW_NONFATAL,  /* Firmware non-fatal error */
+	/* 0x2 */ ML_CN10K_ETYPE_HW_NONFATAL,  /* Hardware non-fatal error */
+	/* 0x3 */ ML_CN10K_ETYPE_HW_FATAL,     /* Hardware fatal error */
+	/* 0x4 */ ML_CN10K_ETYPE_HW_WARNING,   /* Hardware warning */
+	/* 0x5 */ ML_CN10K_ETYPE_DRIVER,       /* Driver specific error */
+	/* 0x6 */ ML_CN10K_ETYPE_UNKNOWN,      /* Unknown error */
+};
+
+struct cn10k_ml_error_db {
+	uint64_t code;
+	char str[RTE_ML_STR_MAX];
+};
+
 /* Firmware non-fatal error sub-type */
 enum cn10k_ml_error_stype_fw_nf {
 	/* 0x0 */ ML_CN10K_FW_ERR_NOERR = 0,	       /* No error */
diff --git a/drivers/ml/cnxk/cn10k_ml_ops.c b/drivers/ml/cnxk/cn10k_ml_ops.c
index 834e55e88e9..b30af7c7a44 100644
--- a/drivers/ml/cnxk/cn10k_ml_ops.c
+++ b/drivers/ml/cnxk/cn10k_ml_ops.c
@@ -23,7 +23,7 @@ 
 #define ML_FLAGS_SSO_COMPL  BIT(1)
 
 /* Hardware non-fatal error subtype database */
-static struct cnxk_ml_error_db ml_stype_db_hw_nf[] = {
+static struct cn10k_ml_error_db ml_stype_db_hw_nf[] = {
 	{ML_CN10K_FW_ERR_NOERR, "NO ERROR"},
 	{ML_CN10K_FW_ERR_UNLOAD_ID_NOT_FOUND, "UNLOAD MODEL ID NOT FOUND"},
 	{ML_CN10K_FW_ERR_LOAD_LUT_OVERFLOW, "LOAD LUT OVERFLOW"},
@@ -38,7 +38,7 @@  static struct cnxk_ml_error_db ml_stype_db_hw_nf[] = {
 };
 
 /* Driver error subtype database */
-static struct cnxk_ml_error_db ml_stype_db_driver[] = {
+static struct cn10k_ml_error_db ml_stype_db_driver[] = {
 	{ML_CN10K_DRIVER_ERR_NOERR, "NO ERROR"},
 	{ML_CN10K_DRIVER_ERR_UNKNOWN, "UNKNOWN ERROR"},
 	{ML_CN10K_DRIVER_ERR_EXCEPTION, "FW EXCEPTION"},
@@ -784,6 +784,7 @@  cn10k_ml_model_load(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_model_params *
 	model->result_update = cn10k_ml_result_update;
 	model->set_error_code = cn10k_ml_set_error_code;
 	model->set_poll_addr = cn10k_ml_set_poll_addr;
+	model->op_error_get = cn10k_ml_op_error_get;
 
 	return 0;
 }
@@ -1257,7 +1258,7 @@  cn10k_ml_result_update(struct cnxk_ml_dev *cnxk_mldev, int qp_id, void *request)
 
 		/* Handle driver error */
 		error_code = (union cn10k_ml_error_code *)&result->error_code;
-		if (error_code->s.etype == ML_CNXK_ETYPE_DRIVER) {
+		if (error_code->s.etype == ML_CN10K_ETYPE_DRIVER) {
 			cn10k_mldev = &cnxk_mldev->cn10k_mldev;
 
 			/* Check for exception */
@@ -1310,7 +1311,7 @@  cn10k_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op, ui
 
 	memset(&req->cn10k_req.result, 0, sizeof(struct cn10k_ml_result));
 	error_code = (union cn10k_ml_error_code *)&req->cn10k_req.result.error_code;
-	error_code->s.etype = ML_CNXK_ETYPE_UNKNOWN;
+	error_code->s.etype = ML_CN10K_ETYPE_UNKNOWN;
 	req->cn10k_req.result.user_ptr = op->user_ptr;
 
 	cnxk_ml_set_poll_ptr(req);
@@ -1324,16 +1325,17 @@  cn10k_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op, ui
 }
 
 __rte_hot int
-cn10k_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op, struct rte_ml_op_error *error)
+cn10k_ml_op_error_get(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
+		      struct rte_ml_op_error *error)
 {
 	union cn10k_ml_error_code *error_code;
 
-	PLT_SET_USED(dev);
+	PLT_SET_USED(cnxk_mldev);
 
 	error_code = (union cn10k_ml_error_code *)&op->impl_opaque;
 
 	/* Copy sub error message */
-	if (error_code->s.etype == ML_CNXK_ETYPE_HW_NONFATAL) {
+	if (error_code->s.etype == ML_CN10K_ETYPE_HW_NONFATAL) {
 		if (error_code->s.stype < PLT_DIM(ml_stype_db_hw_nf))
 			snprintf(error->message, RTE_ML_STR_MAX, "%s : %s",
 				 ml_etype_db[error_code->s.etype].str,
@@ -1341,7 +1343,7 @@  cn10k_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op, struct rte_m
 		else
 			snprintf(error->message, RTE_ML_STR_MAX, "%s : UNKNOWN ERROR",
 				 ml_etype_db[error_code->s.etype].str);
-	} else if (error_code->s.etype == ML_CNXK_ETYPE_DRIVER) {
+	} else if (error_code->s.etype == ML_CN10K_ETYPE_DRIVER) {
 		snprintf(error->message, RTE_ML_STR_MAX, "%s : %s",
 			 ml_etype_db[error_code->s.etype].str,
 			 ml_stype_db_driver[error_code->s.stype].str);
@@ -1387,7 +1389,7 @@  cn10k_ml_inference_sync(void *device, uint16_t index, void *input, void *output,
 
 	memset(&req->cn10k_req.result, 0, sizeof(struct cn10k_ml_result));
 	error_code = (union cn10k_ml_error_code *)&req->cn10k_req.result.error_code;
-	error_code->s.etype = ML_CNXK_ETYPE_UNKNOWN;
+	error_code->s.etype = ML_CN10K_ETYPE_UNKNOWN;
 	req->cn10k_req.result.user_ptr = NULL;
 
 	cnxk_ml_set_poll_ptr(req);
diff --git a/drivers/ml/cnxk/cn10k_ml_ops.h b/drivers/ml/cnxk/cn10k_ml_ops.h
index eb3e1c139c7..0f352282014 100644
--- a/drivers/ml/cnxk/cn10k_ml_ops.h
+++ b/drivers/ml/cnxk/cn10k_ml_ops.h
@@ -312,7 +312,7 @@  int cn10k_ml_model_params_update(struct cnxk_ml_dev *cnxk_mldev, struct cnxk_ml_
 /* Fast-path ops */
 __rte_hot bool cn10k_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
 				       uint16_t layer_id, struct cnxk_ml_qp *qp, uint64_t head);
-__rte_hot int cn10k_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op,
+__rte_hot int cn10k_ml_op_error_get(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
 				    struct rte_ml_op_error *error);
 __rte_hot int cn10k_ml_inference_sync(void *device, uint16_t index, void *input, void *output,
 				      uint16_t nb_batches);
diff --git a/drivers/ml/cnxk/cnxk_ml_dev.c b/drivers/ml/cnxk/cnxk_ml_dev.c
index dc4512223ca..567f8ea7542 100644
--- a/drivers/ml/cnxk/cnxk_ml_dev.c
+++ b/drivers/ml/cnxk/cnxk_ml_dev.c
@@ -12,11 +12,3 @@  int cnxk_ml_dev_initialized;
 
 /* Dummy operations for ML device */
 struct rte_ml_dev_ops ml_dev_dummy_ops = {0};
-
-/* Error type database */
-struct cnxk_ml_error_db ml_etype_db[] = {
-	{ML_CNXK_ETYPE_NO_ERROR, "NO_ERROR"},	     {ML_CNXK_ETYPE_FW_NONFATAL, "FW_NON_FATAL"},
-	{ML_CNXK_ETYPE_HW_NONFATAL, "HW_NON_FATAL"}, {ML_CNXK_ETYPE_HW_FATAL, "HW_FATAL"},
-	{ML_CNXK_ETYPE_HW_WARNING, "HW_WARNING"},    {ML_CNXK_ETYPE_DRIVER, "DRIVER_ERROR"},
-	{ML_CNXK_ETYPE_UNKNOWN, "UNKNOWN_ERROR"},
-};
diff --git a/drivers/ml/cnxk/cnxk_ml_dev.h b/drivers/ml/cnxk/cnxk_ml_dev.h
index 491c4c4aea5..9e373e65715 100644
--- a/drivers/ml/cnxk/cnxk_ml_dev.h
+++ b/drivers/ml/cnxk/cnxk_ml_dev.h
@@ -22,22 +22,6 @@ 
 #define ML_CNXK_POLL_JOB_START	0
 #define ML_CNXK_POLL_JOB_FINISH 1
 
-/* Error types enumeration */
-enum cnxk_ml_error_etype {
-	/* 0x0 */ ML_CNXK_ETYPE_NO_ERROR = 0, /* No error */
-	/* 0x1 */ ML_CNXK_ETYPE_FW_NONFATAL,  /* Firmware non-fatal error */
-	/* 0x2 */ ML_CNXK_ETYPE_HW_NONFATAL,  /* Hardware non-fatal error */
-	/* 0x3 */ ML_CNXK_ETYPE_HW_FATAL,     /* Hardware fatal error */
-	/* 0x4 */ ML_CNXK_ETYPE_HW_WARNING,   /* Hardware warning */
-	/* 0x5 */ ML_CNXK_ETYPE_DRIVER,	      /* Driver specific error */
-	/* 0x6 */ ML_CNXK_ETYPE_UNKNOWN,      /* Unknown error */
-};
-
-struct cnxk_ml_error_db {
-	uint64_t code;
-	char str[RTE_ML_STR_MAX];
-};
-
 /* Device type */
 enum cnxk_ml_dev_type {
 	/* PCI based Marvell's ML HW accelerator device */
@@ -115,6 +99,6 @@  struct cnxk_ml_dev {
 	struct cnxk_ml_index_map *index_map;
 };
 
-extern struct cnxk_ml_error_db ml_etype_db[];
+extern struct cn10k_ml_error_db ml_etype_db[];
 
 #endif /* _CNXK_ML_DEV_H_ */
diff --git a/drivers/ml/cnxk/cnxk_ml_model.h b/drivers/ml/cnxk/cnxk_ml_model.h
index a2fced46a22..1cd5ca1906a 100644
--- a/drivers/ml/cnxk/cnxk_ml_model.h
+++ b/drivers/ml/cnxk/cnxk_ml_model.h
@@ -128,6 +128,8 @@  typedef bool (*enqueue_single_t)(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_o
 typedef void (*result_update_t)(struct cnxk_ml_dev *cnxk_mldev, int qp_id, void *request);
 typedef void (*set_error_code_t)(struct cnxk_ml_req *req, uint64_t etype, uint64_t stype);
 typedef void (*set_poll_addr_t)(struct cnxk_ml_req *req);
+typedef int (*op_error_get_t)(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
+			      struct rte_ml_op_error *error);
 
 /* Model Object */
 struct cnxk_ml_model {
@@ -184,6 +186,7 @@  struct cnxk_ml_model {
 	result_update_t result_update;
 	set_error_code_t set_error_code;
 	set_poll_addr_t set_poll_addr;
+	op_error_get_t op_error_get;
 };
 
 enum cnxk_ml_model_type cnxk_ml_model_get_type(struct rte_ml_model_params *params);
diff --git a/drivers/ml/cnxk/cnxk_ml_ops.c b/drivers/ml/cnxk/cnxk_ml_ops.c
index 971362b2420..6e0160f2656 100644
--- a/drivers/ml/cnxk/cnxk_ml_ops.c
+++ b/drivers/ml/cnxk/cnxk_ml_ops.c
@@ -647,9 +647,7 @@  cnxk_ml_dev_configure(struct rte_ml_dev *dev, const struct rte_ml_dev_config *co
 
 	cnxk_mldev->mldev->enqueue_burst = cnxk_ml_enqueue_burst;
 	cnxk_mldev->mldev->dequeue_burst = cnxk_ml_dequeue_burst;
-
-	if (cnxk_mldev->type == CNXK_ML_DEV_TYPE_PCI)
-		cnxk_mldev->mldev->op_error_get = cn10k_ml_op_error_get;
+	cnxk_mldev->mldev->op_error_get = cnxk_ml_op_error_get;
 
 	/* Allocate and initialize index_map */
 	if (cnxk_mldev->index_map == NULL) {
@@ -1636,7 +1634,7 @@  cnxk_ml_dequeue_burst(struct rte_ml_dev *dev, uint16_t qp_id, struct rte_ml_op *
 		if (plt_tsc_cycles() < req->timeout)
 			goto empty_or_active;
 		else /* Timeout, set indication of driver error */
-			model->set_error_code(req, ML_CNXK_ETYPE_DRIVER, 0);
+			model->set_error_code(req, ML_CN10K_ETYPE_DRIVER, 0);
 	}
 
 	model->result_update(cnxk_mldev, qp->id, req);
@@ -1654,6 +1652,18 @@  cnxk_ml_dequeue_burst(struct rte_ml_dev *dev, uint16_t qp_id, struct rte_ml_op *
 	return count;
 }
 
+__rte_hot int
+cnxk_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op, struct rte_ml_op_error *error)
+{
+	struct cnxk_ml_dev *cnxk_mldev;
+	struct cnxk_ml_model *model;
+
+	cnxk_mldev = dev->data->dev_private;
+	model = cnxk_mldev->mldev->data->models[op->model_id];
+
+	return model->op_error_get(cnxk_mldev, op, error);
+}
+
 struct rte_ml_dev_ops cnxk_ml_ops = {
 	/* Device control ops */
 	.dev_info_get = cnxk_ml_dev_info_get,
diff --git a/drivers/ml/cnxk/cnxk_ml_ops.h b/drivers/ml/cnxk/cnxk_ml_ops.h
index e348cc4e857..7a79fec412e 100644
--- a/drivers/ml/cnxk/cnxk_ml_ops.h
+++ b/drivers/ml/cnxk/cnxk_ml_ops.h
@@ -83,5 +83,7 @@  __rte_hot uint16_t cnxk_ml_dequeue_burst(struct rte_ml_dev *dev, uint16_t qp_id,
 					 struct rte_ml_op **ops, uint16_t nb_ops);
 __rte_hot void cnxk_ml_set_poll_ptr(struct cnxk_ml_req *req);
 __rte_hot uint64_t cnxk_ml_get_poll_ptr(struct cnxk_ml_req *req);
+__rte_hot int cnxk_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op,
+				   struct rte_ml_op_error *error);
 
 #endif /* _CNXK_ML_OPS_H_ */
diff --git a/drivers/ml/cnxk/mvtvm_ml_ops.c b/drivers/ml/cnxk/mvtvm_ml_ops.c
index e825c3fb23e..4c1cda3005b 100644
--- a/drivers/ml/cnxk/mvtvm_ml_ops.c
+++ b/drivers/ml/cnxk/mvtvm_ml_ops.c
@@ -329,11 +329,13 @@  mvtvm_ml_model_load(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_model_params *
 		model->result_update = cn10k_ml_result_update;
 		model->set_error_code = cn10k_ml_set_error_code;
 		model->set_poll_addr = cn10k_ml_set_poll_addr;
+		model->op_error_get = cn10k_ml_op_error_get;
 	} else {
 		model->enqueue_single = mvtvm_ml_enqueue_single;
 		model->result_update = mvtvm_ml_result_update;
 		model->set_error_code = mvtvm_ml_set_error_code;
 		model->set_poll_addr = mvtvm_ml_set_poll_addr;
+		model->op_error_get = mvtvm_ml_op_error_get;
 	}
 
 	return 0;
@@ -584,6 +586,17 @@  mvtvm_ml_set_error_code(struct cnxk_ml_req *req, uint64_t etype, uint64_t stype)
 	req->mvtvm_req.result.error_code = etype;
 }
 
+__rte_hot int
+mvtvm_ml_op_error_get(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
+		      struct rte_ml_op_error *error)
+{
+	RTE_SET_USED(cnxk_mldev);
+	RTE_SET_USED(op);
+	RTE_SET_USED(error);
+
+	return 0;
+}
+
 __rte_hot bool
 mvtvm_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op, uint16_t layer_id,
 			struct cnxk_ml_qp *qp, uint64_t head)
diff --git a/drivers/ml/cnxk/mvtvm_ml_ops.h b/drivers/ml/cnxk/mvtvm_ml_ops.h
index 0232c5ead5d..d8f2f361fb1 100644
--- a/drivers/ml/cnxk/mvtvm_ml_ops.h
+++ b/drivers/ml/cnxk/mvtvm_ml_ops.h
@@ -71,6 +71,8 @@  int mvtvm_ml_io_dequantize(void *device, uint16_t model_id, const char *layer_na
 
 __rte_hot bool mvtvm_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
 				       uint16_t layer_id, struct cnxk_ml_qp *qp, uint64_t head);
+__rte_hot int mvtvm_ml_op_error_get(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op,
+				    struct rte_ml_op_error *error);
 __rte_hot void mvtvm_ml_result_update(struct cnxk_ml_dev *cnxk_mldev, int qp_id, void *request);
 __rte_hot void mvtvm_ml_set_error_code(struct cnxk_ml_req *req, uint64_t etype, uint64_t stype);