[v5,17/34] ml/cnxk: move error handling to cnxk layer

Message ID 20231018064806.24145-18-syalavarthi@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Jerin Jacob
Headers
Series Implementation of revised ml/cnxk driver |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Srikanth Yalavarthi Oct. 18, 2023, 6:47 a.m. UTC
  Move the error type structures to the cnxk layer. The cn10k layer
now handles only the fw and hw error sub-types.

Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
---
 drivers/ml/cnxk/cn10k_ml_dev.h | 41 ++++++---------
 drivers/ml/cnxk/cn10k_ml_ops.c | 93 +++++++++++++---------------------
 drivers/ml/cnxk/cnxk_ml_dev.c  |  8 +++
 drivers/ml/cnxk/cnxk_ml_dev.h  | 18 +++++++
 drivers/ml/cnxk/cnxk_ml_ops.c  |  2 +-
 5 files changed, 78 insertions(+), 84 deletions(-)
  

Patch

diff --git a/drivers/ml/cnxk/cn10k_ml_dev.h b/drivers/ml/cnxk/cn10k_ml_dev.h
index 94a94d996f..2e7eb6c9ef 100644
--- a/drivers/ml/cnxk/cn10k_ml_dev.h
+++ b/drivers/ml/cnxk/cn10k_ml_dev.h
@@ -52,38 +52,27 @@  struct cnxk_ml_dev;
 struct cnxk_ml_req;
 struct cnxk_ml_qp;
 
-/* Error types enumeration */
-enum cn10k_ml_error_etype {
-	/* 0x0 */ ML_ETYPE_NO_ERROR = 0, /* No error */
-	/* 0x1 */ ML_ETYPE_FW_NONFATAL,	 /* Firmware non-fatal error */
-	/* 0x2 */ ML_ETYPE_HW_NONFATAL,	 /* Hardware non-fatal error */
-	/* 0x3 */ ML_ETYPE_HW_FATAL,	 /* Hardware fatal error */
-	/* 0x4 */ ML_ETYPE_HW_WARNING,	 /* Hardware warning */
-	/* 0x5 */ ML_ETYPE_DRIVER,	 /* Driver specific error */
-	/* 0x6 */ ML_ETYPE_UNKNOWN,	 /* Unknown error */
-};
-
 /* Firmware non-fatal error sub-type */
 enum cn10k_ml_error_stype_fw_nf {
-	/* 0x0 */ ML_FW_ERR_NOERR = 0,		 /* No error */
-	/* 0x1 */ ML_FW_ERR_UNLOAD_ID_NOT_FOUND, /* Model ID not found during load */
-	/* 0x2 */ ML_FW_ERR_LOAD_LUT_OVERFLOW,	 /* Lookup table overflow at load */
-	/* 0x3 */ ML_FW_ERR_ID_IN_USE,		 /* Model ID already in use */
-	/* 0x4 */ ML_FW_ERR_INVALID_TILEMASK,	 /* Invalid OCM tilemask */
-	/* 0x5 */ ML_FW_ERR_RUN_LUT_OVERFLOW,	 /* Lookup table overflow at run */
-	/* 0x6 */ ML_FW_ERR_RUN_ID_NOT_FOUND,	 /* Model ID not found during run */
-	/* 0x7 */ ML_FW_ERR_COMMAND_NOTSUP,	 /* Unsupported command */
-	/* 0x8 */ ML_FW_ERR_DDR_ADDR_RANGE,	 /* DDR address out of range */
-	/* 0x9 */ ML_FW_ERR_NUM_BATCHES_INVALID, /* Invalid number of batches */
-	/* 0xA */ ML_FW_ERR_INSSYNC_TIMEOUT,	 /* INS sync timeout */
+	/* 0x0 */ ML_CN10K_FW_ERR_NOERR = 0,	       /* No error */
+	/* 0x1 */ ML_CN10K_FW_ERR_UNLOAD_ID_NOT_FOUND, /* Model ID not found during unload */
+	/* 0x2 */ ML_CN10K_FW_ERR_LOAD_LUT_OVERFLOW,   /* Lookup table overflow at load */
+	/* 0x3 */ ML_CN10K_FW_ERR_ID_IN_USE,	       /* Model ID already in use */
+	/* 0x4 */ ML_CN10K_FW_ERR_INVALID_TILEMASK,    /* Invalid OCM tilemask */
+	/* 0x5 */ ML_CN10K_FW_ERR_RUN_LUT_OVERFLOW,    /* Lookup table overflow at run */
+	/* 0x6 */ ML_CN10K_FW_ERR_RUN_ID_NOT_FOUND,    /* Model ID not found during run */
+	/* 0x7 */ ML_CN10K_FW_ERR_COMMAND_NOTSUP,      /* Unsupported command */
+	/* 0x8 */ ML_CN10K_FW_ERR_DDR_ADDR_RANGE,      /* DDR address out of range */
+	/* 0x9 */ ML_CN10K_FW_ERR_NUM_BATCHES_INVALID, /* Invalid number of batches */
+	/* 0xA */ ML_CN10K_FW_ERR_INSSYNC_TIMEOUT,     /* INS sync timeout */
 };
 
 /* Driver error sub-type */
 enum cn10k_ml_error_stype_driver {
-	/* 0x0 */ ML_DRIVER_ERR_NOERR = 0, /* No error */
-	/* 0x1 */ ML_DRIVER_ERR_UNKNOWN,   /* Unable to determine error sub-type */
-	/* 0x2 */ ML_DRIVER_ERR_EXCEPTION, /* Firmware exception */
-	/* 0x3 */ ML_DRIVER_ERR_FW_ERROR,  /* Unknown firmware error */
+	/* 0x0 */ ML_CN10K_DRIVER_ERR_NOERR = 0, /* No error */
+	/* 0x1 */ ML_CN10K_DRIVER_ERR_UNKNOWN,	 /* Unable to determine error sub-type */
+	/* 0x2 */ ML_CN10K_DRIVER_ERR_EXCEPTION, /* Firmware exception */
+	/* 0x3 */ ML_CN10K_DRIVER_ERR_FW_ERROR,	 /* Unknown firmware error */
 };
 
 /* Error structure */
diff --git a/drivers/ml/cnxk/cn10k_ml_ops.c b/drivers/ml/cnxk/cn10k_ml_ops.c
index 8116c8dedb..65eaaf030d 100644
--- a/drivers/ml/cnxk/cn10k_ml_ops.c
+++ b/drivers/ml/cnxk/cn10k_ml_ops.c
@@ -22,47 +22,27 @@ 
 #define ML_FLAGS_POLL_COMPL BIT(0)
 #define ML_FLAGS_SSO_COMPL  BIT(1)
 
-/* Error message length */
-#define ERRMSG_LEN 32
-
-/* Error type database */
-static const struct cn10k_ml_etype_db {
-	enum cn10k_ml_error_etype etype;
-	char name[ERRMSG_LEN];
-} ml_etype_db[] = {
-	{ML_ETYPE_NO_ERROR, "NO_ERROR"},	{ML_ETYPE_FW_NONFATAL, "FW_NON_FATAL"},
-	{ML_ETYPE_HW_NONFATAL, "HW_NON_FATAL"}, {ML_ETYPE_HW_FATAL, "HW_FATAL"},
-	{ML_ETYPE_HW_WARNING, "HW_WARNING"},	{ML_ETYPE_DRIVER, "DRIVER_ERROR"},
-	{ML_ETYPE_UNKNOWN, "UNKNOWN_ERROR"},
-};
-
 /* Hardware non-fatal error subtype database */
-static const struct cn10k_ml_stype_db_hw_nf {
-	enum cn10k_ml_error_stype_fw_nf stype;
-	char msg[ERRMSG_LEN];
-} ml_stype_db_hw_nf[] = {
-	{ML_FW_ERR_NOERR, "NO ERROR"},
-	{ML_FW_ERR_UNLOAD_ID_NOT_FOUND, "UNLOAD MODEL ID NOT FOUND"},
-	{ML_FW_ERR_LOAD_LUT_OVERFLOW, "LOAD LUT OVERFLOW"},
-	{ML_FW_ERR_ID_IN_USE, "MODEL ID IN USE"},
-	{ML_FW_ERR_INVALID_TILEMASK, "INVALID TILEMASK"},
-	{ML_FW_ERR_RUN_LUT_OVERFLOW, "RUN LUT OVERFLOW"},
-	{ML_FW_ERR_RUN_ID_NOT_FOUND, "RUN MODEL ID NOT FOUND"},
-	{ML_FW_ERR_COMMAND_NOTSUP, "COMMAND NOT SUPPORTED"},
-	{ML_FW_ERR_DDR_ADDR_RANGE, "DDR ADDRESS OUT OF RANGE"},
-	{ML_FW_ERR_NUM_BATCHES_INVALID, "INVALID BATCHES"},
-	{ML_FW_ERR_INSSYNC_TIMEOUT, "INSSYNC TIMEOUT"},
+static struct cnxk_ml_error_db ml_stype_db_hw_nf[] = {
+	{ML_CN10K_FW_ERR_NOERR, "NO ERROR"},
+	{ML_CN10K_FW_ERR_UNLOAD_ID_NOT_FOUND, "UNLOAD MODEL ID NOT FOUND"},
+	{ML_CN10K_FW_ERR_LOAD_LUT_OVERFLOW, "LOAD LUT OVERFLOW"},
+	{ML_CN10K_FW_ERR_ID_IN_USE, "MODEL ID IN USE"},
+	{ML_CN10K_FW_ERR_INVALID_TILEMASK, "INVALID TILEMASK"},
+	{ML_CN10K_FW_ERR_RUN_LUT_OVERFLOW, "RUN LUT OVERFLOW"},
+	{ML_CN10K_FW_ERR_RUN_ID_NOT_FOUND, "RUN MODEL ID NOT FOUND"},
+	{ML_CN10K_FW_ERR_COMMAND_NOTSUP, "COMMAND NOT SUPPORTED"},
+	{ML_CN10K_FW_ERR_DDR_ADDR_RANGE, "DDR ADDRESS OUT OF RANGE"},
+	{ML_CN10K_FW_ERR_NUM_BATCHES_INVALID, "INVALID BATCHES"},
+	{ML_CN10K_FW_ERR_INSSYNC_TIMEOUT, "INSSYNC TIMEOUT"},
 };
 
 /* Driver error subtype database */
-static const struct cn10k_ml_stype_db_driver {
-	enum cn10k_ml_error_stype_driver stype;
-	char msg[ERRMSG_LEN];
-} ml_stype_db_driver[] = {
-	{ML_DRIVER_ERR_NOERR, "NO ERROR"},
-	{ML_DRIVER_ERR_UNKNOWN, "UNKNOWN ERROR"},
-	{ML_DRIVER_ERR_EXCEPTION, "FW EXCEPTION"},
-	{ML_DRIVER_ERR_FW_ERROR, "UNKNOWN FIRMWARE ERROR"},
+static struct cnxk_ml_error_db ml_stype_db_driver[] = {
+	{ML_CN10K_DRIVER_ERR_NOERR, "NO ERROR"},
+	{ML_CN10K_DRIVER_ERR_UNKNOWN, "UNKNOWN ERROR"},
+	{ML_CN10K_DRIVER_ERR_EXCEPTION, "FW EXCEPTION"},
+	{ML_CN10K_DRIVER_ERR_FW_ERROR, "UNKNOWN FIRMWARE ERROR"},
 };
 
 __rte_hot void
@@ -1241,19 +1221,19 @@  cn10k_ml_result_update(struct cnxk_ml_dev *cnxk_mldev, int qp_id, void *request)
 
 		/* Handle driver error */
 		error_code = (union cn10k_ml_error_code *)&result->error_code;
-		if (error_code->s.etype == ML_ETYPE_DRIVER) {
+		if (error_code->s.etype == ML_CNXK_ETYPE_DRIVER) {
 			cn10k_mldev = &cnxk_mldev->cn10k_mldev;
 
 			/* Check for exception */
 			if ((roc_ml_reg_read64(&cn10k_mldev->roc, ML_SCRATCH_EXCEPTION_SP_C0) !=
 			     0) ||
 			    (roc_ml_reg_read64(&cn10k_mldev->roc, ML_SCRATCH_EXCEPTION_SP_C1) != 0))
-				error_code->s.stype = ML_DRIVER_ERR_EXCEPTION;
+				error_code->s.stype = ML_CN10K_DRIVER_ERR_EXCEPTION;
 			else if ((roc_ml_reg_read64(&cn10k_mldev->roc, ML_CORE_INT_LO) != 0) ||
 				 (roc_ml_reg_read64(&cn10k_mldev->roc, ML_CORE_INT_HI) != 0))
-				error_code->s.stype = ML_DRIVER_ERR_FW_ERROR;
+				error_code->s.stype = ML_CN10K_DRIVER_ERR_FW_ERROR;
 			else
-				error_code->s.stype = ML_DRIVER_ERR_UNKNOWN;
+				error_code->s.stype = ML_CN10K_DRIVER_ERR_UNKNOWN;
 		}
 
 		op->impl_opaque = result->error_code;
@@ -1294,7 +1274,7 @@  cn10k_ml_enqueue_single(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_op *op, ui
 
 	memset(&req->cn10k_req.result, 0, sizeof(struct cn10k_ml_result));
 	error_code = (union cn10k_ml_error_code *)&req->cn10k_req.result.error_code;
-	error_code->s.etype = ML_ETYPE_UNKNOWN;
+	error_code->s.etype = ML_CNXK_ETYPE_UNKNOWN;
 	req->cn10k_req.result.user_ptr = op->user_ptr;
 
 	cnxk_ml_set_poll_ptr(req);
@@ -1311,30 +1291,29 @@  __rte_hot int
 cn10k_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op, struct rte_ml_op_error *error)
 {
 	union cn10k_ml_error_code *error_code;
-	char msg[RTE_ML_STR_MAX];
 
 	PLT_SET_USED(dev);
 
 	error_code = (union cn10k_ml_error_code *)&op->impl_opaque;
 
-	/* Copy error message */
-	plt_strlcpy(msg, ml_etype_db[error_code->s.etype].name, sizeof(msg));
-
 	/* Copy sub error message */
-	if (error_code->s.etype == ML_ETYPE_HW_NONFATAL) {
-		strcat(msg, " : ");
+	if (error_code->s.etype == ML_CNXK_ETYPE_HW_NONFATAL) {
 		if (error_code->s.stype < PLT_DIM(ml_stype_db_hw_nf))
-			strcat(msg, ml_stype_db_hw_nf[error_code->s.stype].msg);
+			snprintf(error->message, RTE_ML_STR_MAX, "%s : %s",
+				 ml_etype_db[error_code->s.etype].str,
+				 ml_stype_db_hw_nf[error_code->s.stype].str);
 		else
-			strcat(msg, "UNKNOWN ERROR");
-	}
-
-	if (error_code->s.etype == ML_ETYPE_DRIVER) {
-		strcat(msg, " : ");
-		strcat(msg, ml_stype_db_driver[error_code->s.stype].msg);
+			snprintf(error->message, RTE_ML_STR_MAX, "%s : UNKNOWN ERROR",
+				 ml_etype_db[error_code->s.etype].str);
+	} else if (error_code->s.etype == ML_CNXK_ETYPE_DRIVER) {
+		snprintf(error->message, RTE_ML_STR_MAX, "%s : %s",
+			 ml_etype_db[error_code->s.etype].str,
+			 ml_stype_db_driver[error_code->s.stype].str);
+	} else {
+		snprintf(error->message, RTE_ML_STR_MAX, "%s",
+			 ml_etype_db[error_code->s.etype].str);
 	}
 
-	plt_strlcpy(error->message, msg, sizeof(error->message));
 	error->errcode = error_code->u64;
 
 	return 0;
@@ -1372,7 +1351,7 @@  cn10k_ml_inference_sync(void *device, uint16_t index, void *input, void *output,
 
 	memset(&req->cn10k_req.result, 0, sizeof(struct cn10k_ml_result));
 	error_code = (union cn10k_ml_error_code *)&req->cn10k_req.result.error_code;
-	error_code->s.etype = ML_ETYPE_UNKNOWN;
+	error_code->s.etype = ML_CNXK_ETYPE_UNKNOWN;
 	req->cn10k_req.result.user_ptr = NULL;
 
 	cnxk_ml_set_poll_ptr(req);
diff --git a/drivers/ml/cnxk/cnxk_ml_dev.c b/drivers/ml/cnxk/cnxk_ml_dev.c
index 2a5c17c973..63d1c9e417 100644
--- a/drivers/ml/cnxk/cnxk_ml_dev.c
+++ b/drivers/ml/cnxk/cnxk_ml_dev.c
@@ -9,3 +9,11 @@ 
 
 /* Dummy operations for ML device */
 struct rte_ml_dev_ops ml_dev_dummy_ops = {0};
+
+/* Error type database */
+struct cnxk_ml_error_db ml_etype_db[] = {
+	{ML_CNXK_ETYPE_NO_ERROR, "NO_ERROR"},	     {ML_CNXK_ETYPE_FW_NONFATAL, "FW_NON_FATAL"},
+	{ML_CNXK_ETYPE_HW_NONFATAL, "HW_NON_FATAL"}, {ML_CNXK_ETYPE_HW_FATAL, "HW_FATAL"},
+	{ML_CNXK_ETYPE_HW_WARNING, "HW_WARNING"},    {ML_CNXK_ETYPE_DRIVER, "DRIVER_ERROR"},
+	{ML_CNXK_ETYPE_UNKNOWN, "UNKNOWN_ERROR"},
+};
diff --git a/drivers/ml/cnxk/cnxk_ml_dev.h b/drivers/ml/cnxk/cnxk_ml_dev.h
index 3ce9338f1f..382fca64be 100644
--- a/drivers/ml/cnxk/cnxk_ml_dev.h
+++ b/drivers/ml/cnxk/cnxk_ml_dev.h
@@ -18,6 +18,22 @@ 
 #define ML_CNXK_POLL_JOB_START	0
 #define ML_CNXK_POLL_JOB_FINISH 1
 
+/* Error types enumeration */
+enum cnxk_ml_error_etype {
+	/* 0x0 */ ML_CNXK_ETYPE_NO_ERROR = 0, /* No error */
+	/* 0x1 */ ML_CNXK_ETYPE_FW_NONFATAL,  /* Firmware non-fatal error */
+	/* 0x2 */ ML_CNXK_ETYPE_HW_NONFATAL,  /* Hardware non-fatal error */
+	/* 0x3 */ ML_CNXK_ETYPE_HW_FATAL,     /* Hardware fatal error */
+	/* 0x4 */ ML_CNXK_ETYPE_HW_WARNING,   /* Hardware warning */
+	/* 0x5 */ ML_CNXK_ETYPE_DRIVER,	      /* Driver specific error */
+	/* 0x6 */ ML_CNXK_ETYPE_UNKNOWN,      /* Unknown error */
+};
+
+struct cnxk_ml_error_db {
+	uint64_t code;
+	char str[RTE_ML_STR_MAX];
+};
+
 /* Device configuration state enum */
 enum cnxk_ml_dev_state {
 	/* Probed and not configured */
@@ -78,4 +94,6 @@  struct cnxk_ml_dev {
 	struct cnxk_ml_index_map *index_map;
 };
 
+extern struct cnxk_ml_error_db ml_etype_db[];
+
 #endif /* _CNXK_ML_DEV_H_ */
diff --git a/drivers/ml/cnxk/cnxk_ml_ops.c b/drivers/ml/cnxk/cnxk_ml_ops.c
index 6a44a69508..8339f8342b 100644
--- a/drivers/ml/cnxk/cnxk_ml_ops.c
+++ b/drivers/ml/cnxk/cnxk_ml_ops.c
@@ -1372,7 +1372,7 @@  cnxk_ml_dequeue_burst(struct rte_ml_dev *dev, uint16_t qp_id, struct rte_ml_op *
 		if (plt_tsc_cycles() < req->timeout)
 			goto empty_or_active;
 		else /* Timeout, set indication of driver error */
-			model->set_error_code(req, ML_ETYPE_DRIVER, 0);
+			model->set_error_code(req, ML_CNXK_ETYPE_DRIVER, 0);
 	}
 
 	model->result_update(cnxk_mldev, qp->id, req);