[v5,29/39] ml/cnxk: enable support for firmware error codes

Message ID 20230207160719.1307-30-syalavarthi@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series Implementation of ML CNXK driver |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Srikanth Yalavarthi Feb. 7, 2023, 4:07 p.m. UTC
  Enabled support for error handling. Added error types and subtypes
supported by ML firmware. Enabled support to get device specific
error code and message for a completed ML request.

Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
---
 drivers/ml/cnxk/cn10k_ml_dev.c |   4 +-
 drivers/ml/cnxk/cn10k_ml_dev.h |  50 +++++++++++++-
 drivers/ml/cnxk/cn10k_ml_ops.c | 117 ++++++++++++++++++++++++++++++---
 drivers/ml/cnxk/cn10k_ml_ops.h |   2 +
 4 files changed, 160 insertions(+), 13 deletions(-)
  

Patch

diff --git a/drivers/ml/cnxk/cn10k_ml_dev.c b/drivers/ml/cnxk/cn10k_ml_dev.c
index 837f006bf0..76ed853a3c 100644
--- a/drivers/ml/cnxk/cn10k_ml_dev.c
+++ b/drivers/ml/cnxk/cn10k_ml_dev.c
@@ -261,7 +261,7 @@  cn10k_ml_fw_load_asim(struct cn10k_ml_fw *fw)
 	} while (plt_tsc_cycles() < timeout_cycle);
 
 	/* Check firmware load status, clean-up and exit on failure. */
-	if ((!timeout) && (fw->req->result.error_code == 0)) {
+	if ((!timeout) && (fw->req->result.error_code.u64 == 0)) {
 		cn10k_ml_fw_print_info(fw);
 	} else {
 		/* Set ML to disable new jobs */
@@ -452,7 +452,7 @@  cn10k_ml_fw_load_cn10ka(struct cn10k_ml_fw *fw, void *buffer, uint64_t size)
 	} while (plt_tsc_cycles() < timeout_cycle);
 
 	/* Check firmware load status, clean-up and exit on failure. */
-	if ((!timeout) && (fw->req->result.error_code == 0)) {
+	if ((!timeout) && (fw->req->result.error_code.u64 == 0)) {
 		cn10k_ml_fw_print_info(fw);
 	} else {
 		/* Set ML to disable new jobs */
diff --git a/drivers/ml/cnxk/cn10k_ml_dev.h b/drivers/ml/cnxk/cn10k_ml_dev.h
index 8f6bc24370..604a200e26 100644
--- a/drivers/ml/cnxk/cn10k_ml_dev.h
+++ b/drivers/ml/cnxk/cn10k_ml_dev.h
@@ -64,6 +64,54 @@  enum cn10k_ml_dev_state {
 	ML_CN10K_DEV_STATE_CLOSED
 };
 
+/* Error types enumeration.
+ *
+ * Stored in the 4-bit etype field of union cn10k_ml_error_code. The value is
+ * also used as a direct index into ml_etype_db[] (cn10k_ml_ops.c), so the
+ * enumerator order must stay in sync with that table.
+ */
+enum cn10k_ml_error_etype {
+	/* 0x0 */ ML_ETYPE_NO_ERROR = 0, /* No error */
+	/* 0x1 */ ML_ETYPE_FW_NONFATAL,	 /* Firmware non-fatal error */
+	/* 0x2 */ ML_ETYPE_HW_NONFATAL,	 /* Hardware non-fatal error */
+	/* 0x3 */ ML_ETYPE_HW_FATAL,	 /* Hardware fatal error */
+	/* 0x4 */ ML_ETYPE_HW_WARNING,	 /* Hardware warning */
+	/* 0x5 */ ML_ETYPE_DRIVER,	 /* Driver specific error */
+	/* 0x6 */ ML_ETYPE_UNKNOWN,	 /* Unknown error */
+};
+
+/* Firmware non-fatal error sub-type.
+ *
+ * The value is used as a direct index into ml_stype_db_hw_nf[]
+ * (cn10k_ml_ops.c); keep both in the same order.
+ */
+enum cn10k_ml_error_stype_fw_nf {
+	/* 0x0 */ ML_FW_ERR_NOERR = 0,		 /* No error */
+	/* 0x1 */ ML_FW_ERR_UNLOAD_ID_NOT_FOUND, /* Model ID not found during unload */
+	/* 0x2 */ ML_FW_ERR_LOAD_LUT_OVERFLOW,	 /* Lookup table overflow at load */
+	/* 0x3 */ ML_FW_ERR_ID_IN_USE,		 /* Model ID already in use */
+	/* 0x4 */ ML_FW_ERR_INVALID_TILEMASK,	 /* Invalid OCM tilemask */
+	/* 0x5 */ ML_FW_ERR_RUN_LUT_OVERFLOW,	 /* Lookup table overflow at run */
+	/* 0x6 */ ML_FW_ERR_RUN_ID_NOT_FOUND,	 /* Model ID not found during run */
+	/* 0x7 */ ML_FW_ERR_COMMAND_NOTSUP,	 /* Unsupported command */
+	/* 0x8 */ ML_FW_ERR_DDR_ADDR_RANGE,	 /* DDR address out of range */
+	/* 0x9 */ ML_FW_ERR_NUM_BATCHES_INVALID, /* Invalid number of batches */
+	/* 0xA */ ML_FW_ERR_INSSYNC_TIMEOUT,	 /* INS sync timeout */
+};
+
+/* Driver error sub-type.
+ *
+ * Assigned by cn10k_ml_result_update() when the error type is ML_ETYPE_DRIVER,
+ * and used as a direct index into ml_stype_db_driver[] (cn10k_ml_ops.c); keep
+ * both in the same order.
+ */
+enum cn10k_ml_error_stype_driver {
+	/* 0x0 */ ML_DRIVER_ERR_NOERR = 0, /* No error */
+	/* 0x1 */ ML_DRIVER_ERR_UNKNOWN,   /* Unable to determine error sub-type */
+	/* 0x2 */ ML_DRIVER_ERR_EXCEPTION, /* Firmware exception */
+	/* 0x3 */ ML_DRIVER_ERR_FW_ERROR,  /* Unknown firmware error */
+};
+
+/* Error structure.
+ *
+ * 64-bit job error code, accessible either as the raw word (u64) or split into
+ * an error type and an error sub-type (s). A raw value of zero is treated as
+ * success by the driver.
+ */
+union cn10k_ml_error_code {
+	struct {
+		/* Error type, a value of enum cn10k_ml_error_etype */
+		uint64_t etype : 4;
+
+		/* Error sub-type, interpreted according to etype */
+		uint64_t stype : 60;
+	} s;
+
+	/* WORD 0: raw 64-bit view, copied into op->impl_opaque on completion */
+	uint64_t u64;
+};
+
 /* Firmware stats */
 struct cn10k_ml_fw_stats {
 	/* Firmware start cycle */
@@ -82,7 +130,7 @@  struct cn10k_ml_fw_stats {
 /* Result structure */
 struct cn10k_ml_result {
 	/* Job error code */
-	uint64_t error_code;
+	union cn10k_ml_error_code error_code;
 
 	/* Firmware stats */
 	struct cn10k_ml_fw_stats stats;
diff --git a/drivers/ml/cnxk/cn10k_ml_ops.c b/drivers/ml/cnxk/cn10k_ml_ops.c
index 87778c37bb..23a9ca4ff2 100644
--- a/drivers/ml/cnxk/cn10k_ml_ops.c
+++ b/drivers/ml/cnxk/cn10k_ml_ops.c
@@ -23,6 +23,49 @@ 
 #define ML_FLAGS_POLL_COMPL BIT(0)
 #define ML_FLAGS_SSO_COMPL  BIT(1)
 
+/* Error message length: size, including the terminating NUL, of every
+ * name/message string held in the error databases below.
+ */
+#define ERRMSG_LEN 32
+
+/* Error type database.
+ *
+ * Maps each enum cn10k_ml_error_etype value to its printable name. The table
+ * is indexed directly by the etype value in cn10k_ml_op_error_get(), so the
+ * entries must stay in enumerator order.
+ */
+static const struct cn10k_ml_etype_db {
+	enum cn10k_ml_error_etype etype;
+	char name[ERRMSG_LEN];
+} ml_etype_db[] = {
+	{ML_ETYPE_NO_ERROR, "NO_ERROR"},	{ML_ETYPE_FW_NONFATAL, "FW_NON_FATAL"},
+	{ML_ETYPE_HW_NONFATAL, "HW_NON_FATAL"}, {ML_ETYPE_HW_FATAL, "HW_FATAL"},
+	{ML_ETYPE_HW_WARNING, "HW_WARNING"},	{ML_ETYPE_DRIVER, "DRIVER_ERROR"},
+	{ML_ETYPE_UNKNOWN, "UNKNOWN_ERROR"},
+};
+
+/* Hardware non-fatal error subtype database.
+ *
+ * Indexed directly by the stype field when etype is ML_ETYPE_HW_NONFATAL (see
+ * cn10k_ml_op_error_get()), so the entries must stay in enumerator order.
+ *
+ * NOTE(review): the entries are keyed by enum cn10k_ml_error_stype_fw_nf, which
+ * is declared as the firmware non-fatal sub-type, while this table is named and
+ * looked up as hardware non-fatal - confirm which error type these sub-types
+ * belong to.
+ */
+static const struct cn10k_ml_stype_db_hw_nf {
+	enum cn10k_ml_error_stype_fw_nf stype;
+	char msg[ERRMSG_LEN];
+} ml_stype_db_hw_nf[] = {
+	{ML_FW_ERR_NOERR, "NO ERROR"},
+	{ML_FW_ERR_UNLOAD_ID_NOT_FOUND, "UNLOAD MODEL ID NOT FOUND"},
+	{ML_FW_ERR_LOAD_LUT_OVERFLOW, "LOAD LUT OVERFLOW"},
+	{ML_FW_ERR_ID_IN_USE, "MODEL ID IN USE"},
+	{ML_FW_ERR_INVALID_TILEMASK, "INVALID TILEMASK"},
+	{ML_FW_ERR_RUN_LUT_OVERFLOW, "RUN LUT OVERFLOW"},
+	{ML_FW_ERR_RUN_ID_NOT_FOUND, "RUN MODEL ID NOT FOUND"},
+	{ML_FW_ERR_COMMAND_NOTSUP, "COMMAND NOT SUPPORTED"},
+	{ML_FW_ERR_DDR_ADDR_RANGE, "DDR ADDRESS OUT OF RANGE"},
+	{ML_FW_ERR_NUM_BATCHES_INVALID, "INVALID BATCHES"},
+	{ML_FW_ERR_INSSYNC_TIMEOUT, "INSSYNC TIMEOUT"},
+};
+
+/* Driver error subtype database.
+ *
+ * Maps each enum cn10k_ml_error_stype_driver value to its message. Indexed
+ * directly by the stype field when etype is ML_ETYPE_DRIVER (see
+ * cn10k_ml_op_error_get()), so the entries must stay in enumerator order.
+ */
+static const struct cn10k_ml_stype_db_driver {
+	enum cn10k_ml_error_stype_driver stype;
+	char msg[ERRMSG_LEN];
+} ml_stype_db_driver[] = {
+	{ML_DRIVER_ERR_NOERR, "NO ERROR"},
+	{ML_DRIVER_ERR_UNKNOWN, "UNKNOWN ERROR"},
+	{ML_DRIVER_ERR_EXCEPTION, "FW EXCEPTION"},
+	{ML_DRIVER_ERR_FW_ERROR, "UNKNOWN FIRMWARE ERROR"},
+};
+
 static void
 print_line(FILE *fp, int len)
 {
@@ -474,6 +517,7 @@  cn10k_ml_dev_configure(struct rte_ml_dev *dev, const struct rte_ml_dev_config *c
 
 	dev->enqueue_burst = cn10k_ml_enqueue_burst;
 	dev->dequeue_burst = cn10k_ml_dequeue_burst;
+	dev->op_error_get = cn10k_ml_op_error_get;
 
 	mldev->nb_models_loaded = 0;
 	mldev->state = ML_CN10K_DEV_STATE_CONFIGURED;
@@ -758,7 +802,7 @@  cn10k_ml_dev_selftest(struct rte_ml_dev *dev)
 	if (timeout) {
 		ret = -ETIME;
 	} else {
-		if (req->result.error_code != 0)
+		if (req->result.error_code.u64 != 0)
 			ret = -1;
 	}
 
@@ -936,7 +980,7 @@  cn10k_ml_model_start(struct rte_ml_dev *dev, uint16_t model_id)
 	/* Prepare JD */
 	req = model->req;
 	cn10k_ml_prep_sp_job_descriptor(mldev, model, req, ML_CN10K_JOB_TYPE_MODEL_START);
-	req->result.error_code = 0x0;
+	req->result.error_code.u64 = 0x0;
 	req->result.user_ptr = NULL;
 
 	plt_write64(ML_CN10K_POLL_JOB_START, &req->status);
@@ -1017,7 +1061,7 @@  cn10k_ml_model_start(struct rte_ml_dev *dev, uint16_t model_id)
 
 	if (job_dequeued) {
 		if (plt_read64(&req->status) == ML_CN10K_POLL_JOB_FINISH) {
-			if (req->result.error_code == 0)
+			if (req->result.error_code.u64 == 0)
 				ret = 0;
 			else
 				ret = -1;
@@ -1079,7 +1123,7 @@  cn10k_ml_model_stop(struct rte_ml_dev *dev, uint16_t model_id)
 	/* Prepare JD */
 	req = model->req;
 	cn10k_ml_prep_sp_job_descriptor(mldev, model, req, ML_CN10K_JOB_TYPE_MODEL_STOP);
-	req->result.error_code = 0x0;
+	req->result.error_code.u64 = 0x0;
 	req->result.user_ptr = NULL;
 
 	plt_write64(ML_CN10K_POLL_JOB_START, &req->status);
@@ -1134,7 +1178,7 @@  cn10k_ml_model_stop(struct rte_ml_dev *dev, uint16_t model_id)
 
 	if (job_dequeued) {
 		if (plt_read64(&req->status) == ML_CN10K_POLL_JOB_FINISH) {
-			if (req->result.error_code == 0x0)
+			if (req->result.error_code.u64 == 0x0)
 				ret = 0;
 			else
 				ret = -1;
@@ -1426,12 +1470,30 @@  cn10k_ml_result_update(struct rte_ml_dev *dev, int qp_id, struct cn10k_ml_result
 	PLT_SET_USED(dev);
 	PLT_SET_USED(qp_id);
 
-	op->impl_opaque = result->error_code;
+	struct cn10k_ml_dev *mldev;
 
-	if (likely(result->error_code == 0))
+	if (likely(result->error_code.u64 == 0)) {
+		op->impl_opaque = result->error_code.u64;
 		op->status = RTE_ML_OP_STATUS_SUCCESS;
-	else
+	} else {
+		/* Handle driver error */
+		if (result->error_code.s.etype == ML_ETYPE_DRIVER) {
+			mldev = dev->data->dev_private;
+
+			/* Check for exception */
+			if ((roc_ml_reg_read64(&mldev->roc, ML_SCRATCH_EXCEPTION_SP_C0) != 0) ||
+			    (roc_ml_reg_read64(&mldev->roc, ML_SCRATCH_EXCEPTION_SP_C1) != 0))
+				result->error_code.s.stype = ML_DRIVER_ERR_EXCEPTION;
+			else if ((roc_ml_reg_read64(&mldev->roc, ML_CORE_INT_LO) != 0) ||
+				 (roc_ml_reg_read64(&mldev->roc, ML_CORE_INT_HI) != 0))
+				result->error_code.s.stype = ML_DRIVER_ERR_FW_ERROR;
+			else
+				result->error_code.s.stype = ML_DRIVER_ERR_UNKNOWN;
+		}
+
+		op->impl_opaque = result->error_code.u64;
 		op->status = RTE_ML_OP_STATUS_ERROR;
+	}
 
 	op->user_ptr = result->user_ptr;
 }
@@ -1468,6 +1530,7 @@  cn10k_ml_enqueue_burst(struct rte_ml_dev *dev, uint16_t qp_id, struct rte_ml_op
 	cn10k_ml_prep_fp_job_descriptor(dev, req, op);
 
 	memset(&req->result, 0, sizeof(struct cn10k_ml_result));
+	req->result.error_code.s.etype = ML_ETYPE_UNKNOWN;
 	req->result.user_ptr = op->user_ptr;
 
 	plt_write64(ML_CN10K_POLL_JOB_START, &req->status);
@@ -1515,8 +1578,12 @@  cn10k_ml_dequeue_burst(struct rte_ml_dev *dev, uint16_t qp_id, struct rte_ml_op
 dequeue_req:
 	req = &queue->reqs[tail];
 	status = plt_read64(&req->status);
-	if (unlikely(status != ML_CN10K_POLL_JOB_FINISH))
-		goto empty_or_active;
+	if (unlikely(status != ML_CN10K_POLL_JOB_FINISH)) {
+		if (plt_tsc_cycles() < req->timeout)
+			goto empty_or_active;
+		else /* Timeout, set indication of driver error */
+			req->result.error_code.s.etype = ML_ETYPE_DRIVER;
+	}
 
 	cn10k_ml_result_update(dev, qp_id, &req->result, req->op);
 	ops[count] = req->op;
@@ -1533,6 +1600,35 @@  cn10k_ml_dequeue_burst(struct rte_ml_dev *dev, uint16_t qp_id, struct rte_ml_op
 	return count;
 }
 
+/* Translate the error code of a completed ML op into a readable message.
+ *
+ * The 64-bit code saved in op->impl_opaque by cn10k_ml_result_update() is
+ * decoded as a union cn10k_ml_error_code. The message is the error type name,
+ * followed by " : <sub-type message>" for the HW non-fatal and driver types.
+ *
+ * etype is a 4-bit field and stype a 60-bit field, both wider than the lookup
+ * tables, and the value originates from device-written memory. Every index is
+ * therefore range-checked; out-of-range values are reported as unknown instead
+ * of reading past the end of a table.
+ *
+ * dev:   ML device (unused).
+ * op:    completed op whose impl_opaque holds the error code.
+ * error: output; both message and errcode are filled in.
+ *
+ * Returns 0 always.
+ */
+__rte_hot int
+cn10k_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op, struct rte_ml_op_error *error)
+{
+	union cn10k_ml_error_code *error_code;
+	const char *stype_str = NULL;
+	const char *etype_str;
+	uint64_t stype;
+
+	PLT_SET_USED(dev);
+
+	error_code = (union cn10k_ml_error_code *)&op->impl_opaque;
+	stype = error_code->s.stype;
+
+	/* Error type name; values beyond the table map to the UNKNOWN entry */
+	if (error_code->s.etype < sizeof(ml_etype_db) / sizeof(ml_etype_db[0]))
+		etype_str = ml_etype_db[error_code->s.etype].name;
+	else
+		etype_str = ml_etype_db[ML_ETYPE_UNKNOWN].name;
+
+	/* Sub-type message, only available for these two error types */
+	if (error_code->s.etype == ML_ETYPE_HW_NONFATAL) {
+		if (stype < sizeof(ml_stype_db_hw_nf) / sizeof(ml_stype_db_hw_nf[0]))
+			stype_str = ml_stype_db_hw_nf[stype].msg;
+		else
+			stype_str = "UNKNOWN ERROR";
+	} else if (error_code->s.etype == ML_ETYPE_DRIVER) {
+		if (stype < sizeof(ml_stype_db_driver) / sizeof(ml_stype_db_driver[0]))
+			stype_str = ml_stype_db_driver[stype].msg;
+		else
+			stype_str = ml_stype_db_driver[ML_DRIVER_ERR_UNKNOWN].msg;
+	}
+
+	/* snprintf() bounds the write and always NUL-terminates the message */
+	if (stype_str != NULL)
+		snprintf(error->message, sizeof(error->message), "%s : %s", etype_str, stype_str);
+	else
+		snprintf(error->message, sizeof(error->message), "%s", etype_str);
+
+	/* Report the raw device specific code alongside the message */
+	error->errcode = error_code->u64;
+
+	return 0;
+}
+
 __rte_hot int
 cn10k_ml_inference_sync(struct rte_ml_dev *dev, struct rte_ml_op *op)
 {
@@ -1549,6 +1645,7 @@  cn10k_ml_inference_sync(struct rte_ml_dev *dev, struct rte_ml_op *op)
 	cn10k_ml_prep_fp_job_descriptor(dev, req, op);
 
 	memset(&req->result, 0, sizeof(struct cn10k_ml_result));
+	req->result.error_code.s.etype = ML_ETYPE_UNKNOWN;
 	req->result.user_ptr = op->user_ptr;
 
 	plt_write64(ML_CN10K_POLL_JOB_START, &req->status);
diff --git a/drivers/ml/cnxk/cn10k_ml_ops.h b/drivers/ml/cnxk/cn10k_ml_ops.h
index 7c35bf7539..1784900cff 100644
--- a/drivers/ml/cnxk/cn10k_ml_ops.h
+++ b/drivers/ml/cnxk/cn10k_ml_ops.h
@@ -75,6 +75,8 @@  __rte_hot uint16_t cn10k_ml_enqueue_burst(struct rte_ml_dev *dev, uint16_t qp_id
 					  struct rte_ml_op **ops, uint16_t nb_ops);
 __rte_hot uint16_t cn10k_ml_dequeue_burst(struct rte_ml_dev *dev, uint16_t qp_id,
 					  struct rte_ml_op **ops, uint16_t nb_ops);
+__rte_hot int cn10k_ml_op_error_get(struct rte_ml_dev *dev, struct rte_ml_op *op,
+				    struct rte_ml_op_error *error);
 __rte_hot int cn10k_ml_inference_sync(struct rte_ml_dev *dev, struct rte_ml_op *op);
 
 #endif /* _CN10K_ML_OPS_H_ */