[v5,07/39] ml/cnxk: enable firmware load and device reset

Message ID 20230207160719.1307-8-syalavarthi@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series: Implementation of ML CNXK driver

Checks

Context         Check     Description
ci/checkpatch   success   coding style OK

Commit Message

Srikanth Yalavarthi Feb. 7, 2023, 4:06 p.m. UTC
  Enabled support to load ML firmware on the cn10ka ROC model. The
MLIP device is reset during the dev_close driver operation; the
device cannot be reconfigured after a call to close. Job execution
is disabled after firmware load and is enabled later, in the device
start state. Added an internal request structure for firmware load
and handshake jobs.
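
The core of the change is the boot handshake in step (12) of
cn10k_ml_fw_load_cn10ka(): the driver marks the request as in-flight,
hands the job descriptor to the device through the ML scratch
registers, and polls for completion within the command timeout. The
following is a condensed sketch of that handshake, built only from the
ROC/PLT helpers the patch itself calls; it is illustrative, not
additional driver code, and assumes the driver headers are included.

/* Condensed sketch of the boot handshake in step (12) below. */
static int
cn10k_ml_fw_handshake_sketch(struct cn10k_ml_dev *mldev, struct cn10k_ml_req *req)
{
	uint64_t timeout_cycle;

	/* Mark the request in-flight before the device can observe it. */
	plt_write64(ML_CN10K_POLL_JOB_START, &req->status);
	plt_wmb();

	/* Hand the job descriptor to the device via scratch registers. */
	roc_ml_scratch_enqueue(&mldev->roc, &req->jd);

	/* Poll until firmware reports completion or the 5 second
	 * ML_CN10K_CMD_TIMEOUT budget is exhausted.
	 */
	timeout_cycle = plt_tsc_cycles() + ML_CN10K_CMD_TIMEOUT * plt_tsc_hz();
	plt_rmb();
	do {
		if (roc_ml_scratch_is_done_bit_set(&mldev->roc) &&
		    plt_read64(&req->status) == ML_CN10K_POLL_JOB_FINISH)
			return 0;
	} while (plt_tsc_cycles() < timeout_cycle);

	return -ETIME;
}

On timeout, the patch additionally disables new job submission and
clears the scratch registers before returning, as the full listing
below shows.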

Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
---
 drivers/ml/cnxk/cn10k_ml_dev.c | 327 +++++++++++++++++++++++++++++++++
 drivers/ml/cnxk/cn10k_ml_dev.h | 156 ++++++++++++++++
 drivers/ml/cnxk/cn10k_ml_ops.c |  21 +++
 drivers/ml/cnxk/cn10k_ml_ops.h |  14 ++
 4 files changed, 518 insertions(+)
  

Patch

diff --git a/drivers/ml/cnxk/cn10k_ml_dev.c b/drivers/ml/cnxk/cn10k_ml_dev.c
index 117cac43aa..90fca45ddd 100644
--- a/drivers/ml/cnxk/cn10k_ml_dev.c
+++ b/drivers/ml/cnxk/cn10k_ml_dev.c
@@ -12,6 +12,8 @@ 
 
 #include <roc_api.h>
 
+#include <eal_firmware.h>
+
 #include "cn10k_ml_dev.h"
 #include "cn10k_ml_ops.h"
 
@@ -19,6 +21,15 @@ 
 
 #define CN10K_ML_FW_PATH_DEFAULT "/lib/firmware/mlip-fw.bin"
 
+/* ML firmware macros */
+#define FW_MEMZONE_NAME		 "ml_cn10k_fw_mz"
+#define FW_STACK_BUFFER_SIZE	 0x40000
+#define FW_DEBUG_BUFFER_SIZE	 (2 * 0x20000)
+#define FW_EXCEPTION_BUFFER_SIZE 0x400
+#define FW_LINKER_OFFSET	 0x80000
+#define FW_WAIT_CYCLES		 100
+#define FW_LOAD_FLAGS		 0x1
+
 static const char *const valid_args[] = {CN10K_ML_FW_PATH, NULL};
 
 /* Dummy operations for ML device */
@@ -175,6 +186,322 @@  cn10k_ml_pci_remove(struct rte_pci_device *pci_dev)
 	return rte_ml_dev_pmd_destroy(dev);
 }
 
+static void
+cn10k_ml_fw_print_info(struct cn10k_ml_fw *fw)
+{
+	plt_info("ML Firmware Version = %s", fw->req->jd.fw_load.version);
+
+	plt_ml_dbg("Firmware capabilities = 0x%016lx", fw->req->jd.fw_load.cap.u64);
+	plt_ml_dbg("Version = %s", fw->req->jd.fw_load.version);
+	plt_ml_dbg("core0_debug_ptr = 0x%016lx", fw->req->jd.fw_load.debug.core0_debug_ptr);
+	plt_ml_dbg("core1_debug_ptr = 0x%016lx", fw->req->jd.fw_load.debug.core1_debug_ptr);
+	plt_ml_dbg("debug_buffer_size = %u bytes", fw->req->jd.fw_load.debug.debug_buffer_size);
+	plt_ml_dbg("core0_exception_buffer = 0x%016lx",
+		   fw->req->jd.fw_load.debug.core0_exception_buffer);
+	plt_ml_dbg("core1_exception_buffer = 0x%016lx",
+		   fw->req->jd.fw_load.debug.core1_exception_buffer);
+	plt_ml_dbg("exception_state_size = %u bytes",
+		   fw->req->jd.fw_load.debug.exception_state_size);
+	plt_ml_dbg("flags = 0x%016lx", fw->req->jd.fw_load.flags);
+}
+
+uint64_t
+cn10k_ml_fw_flags_get(struct cn10k_ml_fw *fw)
+{
+	PLT_SET_USED(fw);
+
+	return FW_LOAD_FLAGS;
+}
+
+static int
+cn10k_ml_fw_load_cn10ka(struct cn10k_ml_fw *fw, void *buffer, uint64_t size)
+{
+	union ml_a35_0_rst_vector_base_s a35_0_rst_vector_base;
+	union ml_a35_0_rst_vector_base_s a35_1_rst_vector_base;
+	struct cn10k_ml_dev *mldev;
+	uint64_t timeout_cycle;
+	uint64_t reg_val64;
+	uint32_t reg_val32;
+	uint64_t offset;
+	bool timeout;
+	int ret = 0;
+	uint8_t i;
+
+	mldev = fw->mldev;
+
+	/* Reset HEAD and TAIL debug pointer registers */
+	roc_ml_reg_write64(&mldev->roc, 0, ML_SCRATCH_DBG_BUFFER_HEAD_C0);
+	roc_ml_reg_write64(&mldev->roc, 0, ML_SCRATCH_DBG_BUFFER_TAIL_C0);
+	roc_ml_reg_write64(&mldev->roc, 0, ML_SCRATCH_DBG_BUFFER_HEAD_C1);
+	roc_ml_reg_write64(&mldev->roc, 0, ML_SCRATCH_DBG_BUFFER_TAIL_C1);
+	roc_ml_reg_write64(&mldev->roc, 0, ML_SCRATCH_EXCEPTION_SP_C0);
+	roc_ml_reg_write64(&mldev->roc, 0, ML_SCRATCH_EXCEPTION_SP_C1);
+
+	/* (1) Write firmware images for ACC's two A35 cores to the ML region in LLC / DRAM. */
+	rte_memcpy(PLT_PTR_ADD(fw->data, FW_LINKER_OFFSET), buffer, size);
+
+	/* (2) Set ML(0)_MLR_BASE = Base IOVA of the ML region in LLC/DRAM. */
+	reg_val64 = PLT_PTR_SUB_U64_CAST(fw->data, rte_eal_get_baseaddr());
+	roc_ml_reg_write64(&mldev->roc, reg_val64, ML_MLR_BASE);
+	plt_ml_dbg("ML_MLR_BASE => 0x%016lx", roc_ml_reg_read64(&mldev->roc, ML_MLR_BASE));
+	roc_ml_reg_save(&mldev->roc, ML_MLR_BASE);
+
+	/* (3) Set ML(0)_AXI_BRIDGE_CTRL(1) = 0x184003 to remove back-pressure check on DMA AXI
+	 * bridge.
+	 */
+	reg_val64 = (ROC_ML_AXI_BRIDGE_CTRL_AXI_RESP_CTRL |
+		     ROC_ML_AXI_BRIDGE_CTRL_BRIDGE_CTRL_MODE | ROC_ML_AXI_BRIDGE_CTRL_NCB_WR_BLK |
+		     ROC_ML_AXI_BRIDGE_CTRL_FORCE_WRESP_OK | ROC_ML_AXI_BRIDGE_CTRL_FORCE_RRESP_OK);
+	roc_ml_reg_write64(&mldev->roc, reg_val64, ML_AXI_BRIDGE_CTRL(1));
+	plt_ml_dbg("ML_AXI_BRIDGE_CTRL(1) => 0x%016lx",
+		   roc_ml_reg_read64(&mldev->roc, ML_AXI_BRIDGE_CTRL(1)));
+
+	/* (4) Set ML(0)_ANB(0..2)_BACKP_DISABLE = 0x3 to remove back-pressure on the AXI to NCB
+	 * bridges.
+	 */
+	for (i = 0; i < ML_ANBX_NR; i++) {
+		reg_val64 = (ROC_ML_ANBX_BACKP_DISABLE_EXTMSTR_B_BACKP_DISABLE |
+			     ROC_ML_ANBX_BACKP_DISABLE_EXTMSTR_R_BACKP_DISABLE);
+		roc_ml_reg_write64(&mldev->roc, reg_val64, ML_ANBX_BACKP_DISABLE(i));
+		plt_ml_dbg("ML_ANBX_BACKP_DISABLE(%u) => 0x%016lx", i,
+			   roc_ml_reg_read64(&mldev->roc, ML_ANBX_BACKP_DISABLE(i)));
+	}
+
+	/* (5) Set ML(0)_ANB(0..2)_NCBI_P_OVR = 0x3000 and ML(0)_ANB(0..2)_NCBI_NP_OVR = 0x3000 to
+	 * signal all ML transactions as non-secure.
+	 */
+	for (i = 0; i < ML_ANBX_NR; i++) {
+		reg_val64 = (ML_ANBX_NCBI_P_OVR_ANB_NCBI_P_NS_OVR |
+			     ML_ANBX_NCBI_P_OVR_ANB_NCBI_P_NS_OVR_VLD);
+		roc_ml_reg_write64(&mldev->roc, reg_val64, ML_ANBX_NCBI_P_OVR(i));
+		plt_ml_dbg("ML_ANBX_NCBI_P_OVR(%u) => 0x%016lx", i,
+			   roc_ml_reg_read64(&mldev->roc, ML_ANBX_NCBI_P_OVR(i)));
+
+		reg_val64 |= (ML_ANBX_NCBI_NP_OVR_ANB_NCBI_NP_NS_OVR |
+			      ML_ANBX_NCBI_NP_OVR_ANB_NCBI_NP_NS_OVR_VLD);
+		roc_ml_reg_write64(&mldev->roc, reg_val64, ML_ANBX_NCBI_NP_OVR(i));
+		plt_ml_dbg("ML_ANBX_NCBI_NP_OVR(%u) => 0x%016lx", i,
+			   roc_ml_reg_read64(&mldev->roc, ML_ANBX_NCBI_NP_OVR(i)));
+	}
+
+	/* (6) Set ML(0)_CFG[MLIP_CLK_FORCE] = 1, to force turning on the MLIP clock. */
+	reg_val64 = roc_ml_reg_read64(&mldev->roc, ML_CFG);
+	reg_val64 |= ROC_ML_CFG_MLIP_CLK_FORCE;
+	roc_ml_reg_write64(&mldev->roc, reg_val64, ML_CFG);
+	plt_ml_dbg("ML_CFG => 0x%016lx", roc_ml_reg_read64(&mldev->roc, ML_CFG));
+
+	/* (7) Set ML(0)_JOB_MGR_CTRL[STALL_ON_IDLE] = 0, to make sure the boot request is accepted
+	 * when there is no job in the command queue.
+	 */
+	reg_val64 = roc_ml_reg_read64(&mldev->roc, ML_JOB_MGR_CTRL);
+	reg_val64 &= ~ROC_ML_JOB_MGR_CTRL_STALL_ON_IDLE;
+	roc_ml_reg_write64(&mldev->roc, reg_val64, ML_JOB_MGR_CTRL);
+	plt_ml_dbg("ML_JOB_MGR_CTRL => 0x%016lx", roc_ml_reg_read64(&mldev->roc, ML_JOB_MGR_CTRL));
+
+	/* (8) Set ML(0)_CFG[ENA] = 0 and ML(0)_CFG[MLIP_ENA] = 1 to bring MLIP out of reset while
+	 * keeping the job manager disabled.
+	 */
+	reg_val64 = roc_ml_reg_read64(&mldev->roc, ML_CFG);
+	reg_val64 |= ROC_ML_CFG_MLIP_ENA;
+	reg_val64 &= ~ROC_ML_CFG_ENA;
+	roc_ml_reg_write64(&mldev->roc, reg_val64, ML_CFG);
+	plt_ml_dbg("ML_CFG => 0x%016lx", roc_ml_reg_read64(&mldev->roc, ML_CFG));
+
+	/* (9) Wait at least 70 coprocessor clock cycles. */
+	plt_delay_us(FW_WAIT_CYCLES);
+
+	/* (10) Write ML outbound addresses pointing to the firmware images written in step 1 to the
+	 * following registers: ML(0)_A35_0_RST_VECTOR_BASE_W(0..1) for core 0,
+	 * ML(0)_A35_1_RST_VECTOR_BASE_W(0..1) for core 1. The value written to each register is the
+	 * AXI outbound address divided by 4. Read after write.
+	 */
+	offset = PLT_PTR_ADD_U64_CAST(
+		fw->data, FW_LINKER_OFFSET - roc_ml_reg_read64(&mldev->roc, ML_MLR_BASE));
+	a35_0_rst_vector_base.s.addr = (offset + ML_AXI_START_ADDR) / 4;
+	a35_1_rst_vector_base.s.addr = (offset + ML_AXI_START_ADDR) / 4;
+
+	roc_ml_reg_write32(&mldev->roc, a35_0_rst_vector_base.w.w0, ML_A35_0_RST_VECTOR_BASE_W(0));
+	reg_val32 = roc_ml_reg_read32(&mldev->roc, ML_A35_0_RST_VECTOR_BASE_W(0));
+	plt_ml_dbg("ML_A35_0_RST_VECTOR_BASE_W(0) => 0x%08x", reg_val32);
+
+	roc_ml_reg_write32(&mldev->roc, a35_0_rst_vector_base.w.w1, ML_A35_0_RST_VECTOR_BASE_W(1));
+	reg_val32 = roc_ml_reg_read32(&mldev->roc, ML_A35_0_RST_VECTOR_BASE_W(1));
+	plt_ml_dbg("ML_A35_0_RST_VECTOR_BASE_W(1) => 0x%08x", reg_val32);
+
+	roc_ml_reg_write32(&mldev->roc, a35_1_rst_vector_base.w.w0, ML_A35_1_RST_VECTOR_BASE_W(0));
+	reg_val32 = roc_ml_reg_read32(&mldev->roc, ML_A35_1_RST_VECTOR_BASE_W(0));
+	plt_ml_dbg("ML_A35_1_RST_VECTOR_BASE_W(0) => 0x%08x", reg_val32);
+
+	roc_ml_reg_write32(&mldev->roc, a35_1_rst_vector_base.w.w1, ML_A35_1_RST_VECTOR_BASE_W(1));
+	reg_val32 = roc_ml_reg_read32(&mldev->roc, ML_A35_1_RST_VECTOR_BASE_W(1));
+	plt_ml_dbg("ML_A35_1_RST_VECTOR_BASE_W(1) => 0x%08x", reg_val32);
+
+	/* (11) Clear MLIP's ML(0)_SW_RST_CTRL[ACC_RST]. This will bring the ACC cores and other
+	 * MLIP components out of reset. The cores will execute firmware from the ML region as
+	 * written in step 1.
+	 */
+	reg_val32 = roc_ml_reg_read32(&mldev->roc, ML_SW_RST_CTRL);
+	reg_val32 &= ~ROC_ML_SW_RST_CTRL_ACC_RST;
+	roc_ml_reg_write32(&mldev->roc, reg_val32, ML_SW_RST_CTRL);
+	reg_val32 = roc_ml_reg_read32(&mldev->roc, ML_SW_RST_CTRL);
+	plt_ml_dbg("ML_SW_RST_CTRL => 0x%08x", reg_val32);
+
+	/* (12) Wait for notification from firmware that ML is ready for job execution. */
+	fw->req->jd.hdr.jce.w1.u64 = PLT_U64_CAST(&fw->req->status);
+	fw->req->jd.hdr.job_type = ML_CN10K_JOB_TYPE_FIRMWARE_LOAD;
+	fw->req->jd.hdr.result = roc_ml_addr_ap2mlip(&mldev->roc, &fw->req->result);
+	fw->req->jd.fw_load.flags = cn10k_ml_fw_flags_get(fw);
+	plt_write64(ML_CN10K_POLL_JOB_START, &fw->req->status);
+	plt_wmb();
+
+	/* Enqueue FW load through scratch registers */
+	timeout = true;
+	timeout_cycle = plt_tsc_cycles() + ML_CN10K_CMD_TIMEOUT * plt_tsc_hz();
+	roc_ml_scratch_enqueue(&mldev->roc, &fw->req->jd);
+
+	plt_rmb();
+	do {
+		if (roc_ml_scratch_is_done_bit_set(&mldev->roc) &&
+		    (plt_read64(&fw->req->status) == ML_CN10K_POLL_JOB_FINISH)) {
+			timeout = false;
+			break;
+		}
+	} while (plt_tsc_cycles() < timeout_cycle);
+
+	/* Check firmware load status, clean-up and exit on failure. */
+	if ((!timeout) && (fw->req->result.error_code == 0)) {
+		cn10k_ml_fw_print_info(fw);
+	} else {
+		/* Set ML to disable new jobs */
+		reg_val64 = (ROC_ML_CFG_JD_SIZE | ROC_ML_CFG_MLIP_ENA);
+		roc_ml_reg_write64(&mldev->roc, reg_val64, ML_CFG);
+
+		/* Clear scratch registers */
+		roc_ml_reg_write64(&mldev->roc, 0, ML_SCRATCH_WORK_PTR);
+		roc_ml_reg_write64(&mldev->roc, 0, ML_SCRATCH_FW_CTRL);
+
+		if (timeout) {
+			plt_err("Firmware load timeout");
+			ret = -ETIME;
+		} else {
+			plt_err("Firmware load failed");
+			ret = -1;
+		}
+
+		return ret;
+	}
+
+	/* (13) Set ML(0)_JOB_MGR_CTRL[STALL_ON_IDLE] = 0x1; this is needed to shut down the MLIP
+	 * clock when there are no more jobs to process.
+	 */
+	reg_val64 = roc_ml_reg_read64(&mldev->roc, ML_JOB_MGR_CTRL);
+	reg_val64 |= ROC_ML_JOB_MGR_CTRL_STALL_ON_IDLE;
+	roc_ml_reg_write64(&mldev->roc, reg_val64, ML_JOB_MGR_CTRL);
+	plt_ml_dbg("ML_JOB_MGR_CTRL => 0x%016lx", roc_ml_reg_read64(&mldev->roc, ML_JOB_MGR_CTRL));
+
+	/* (14) Set ML(0)_CFG[MLIP_CLK_FORCE] = 0; the MLIP clock will be turned on/off based on job
+	 * activities.
+	 */
+	reg_val64 = roc_ml_reg_read64(&mldev->roc, ML_CFG);
+	reg_val64 &= ~ROC_ML_CFG_MLIP_CLK_FORCE;
+	roc_ml_reg_write64(&mldev->roc, reg_val64, ML_CFG);
+	plt_ml_dbg("ML_CFG => 0x%016lx", roc_ml_reg_read64(&mldev->roc, ML_CFG));
+
+	/* (15) Set ML(0)_CFG[ENA] to enable ML job execution. */
+	reg_val64 = roc_ml_reg_read64(&mldev->roc, ML_CFG);
+	reg_val64 |= ROC_ML_CFG_ENA;
+	roc_ml_reg_write64(&mldev->roc, reg_val64, ML_CFG);
+	plt_ml_dbg("ML_CFG => 0x%016lx", roc_ml_reg_read64(&mldev->roc, ML_CFG));
+
+	/* Reset scratch registers */
+	roc_ml_reg_write64(&mldev->roc, 0, ML_SCRATCH_FW_CTRL);
+	roc_ml_reg_write64(&mldev->roc, 0, ML_SCRATCH_WORK_PTR);
+
+	/* Disable job execution, to be enabled in start */
+	reg_val64 = roc_ml_reg_read64(&mldev->roc, ML_CFG);
+	reg_val64 &= ~ROC_ML_CFG_ENA;
+	roc_ml_reg_write64(&mldev->roc, reg_val64, ML_CFG);
+	plt_ml_dbg("ML_CFG => 0x%016lx", roc_ml_reg_read64(&mldev->roc, ML_CFG));
+
+	/* Additional fixes: Set RO bit to fix O2D DMA bandwidth issue on cn10ka */
+	for (i = 0; i < ML_ANBX_NR; i++) {
+		reg_val64 = roc_ml_reg_read64(&mldev->roc, ML_ANBX_NCBI_P_OVR(i));
+		reg_val64 |= (ML_ANBX_NCBI_P_OVR_ANB_NCBI_P_RO_OVR |
+			      ML_ANBX_NCBI_P_OVR_ANB_NCBI_P_RO_OVR_VLD);
+		roc_ml_reg_write64(&mldev->roc, reg_val64, ML_ANBX_NCBI_P_OVR(i));
+	}
+
+	return ret;
+}
+
+int
+cn10k_ml_fw_load(struct cn10k_ml_dev *mldev)
+{
+	const struct plt_memzone *mz;
+	struct cn10k_ml_fw *fw;
+	void *fw_buffer = NULL;
+	uint64_t mz_size = 0;
+	uint64_t fw_size = 0;
+	int ret = 0;
+
+	fw = &mldev->fw;
+	fw->mldev = mldev;
+
+	/* Read firmware image to a buffer */
+	ret = rte_firmware_read(fw->path, &fw_buffer, &fw_size);
+	if (ret < 0) {
+		plt_err("Can't read firmware data: %s\n", fw->path);
+		return ret;
+	}
+
+	/* Reserve memzone for firmware load completion and data */
+	mz_size = sizeof(struct cn10k_ml_req) + fw_size + FW_STACK_BUFFER_SIZE +
+		  FW_DEBUG_BUFFER_SIZE + FW_EXCEPTION_BUFFER_SIZE;
+	mz = plt_memzone_reserve_aligned(FW_MEMZONE_NAME, mz_size, 0, ML_CN10K_ALIGN_SIZE);
+	if (mz == NULL) {
+		plt_err("plt_memzone_reserve failed : %s", FW_MEMZONE_NAME);
+		if (fw_buffer != NULL)
+			free(fw_buffer);
+		return -ENOMEM;
+	}
+	fw->req = mz->addr;
+
+	/* Reset firmware load completion structure */
+	memset(&fw->req->jd, 0, sizeof(struct cn10k_ml_jd));
+	memset(&fw->req->jd.fw_load.version[0], '\0', MLDEV_FIRMWARE_VERSION_LENGTH);
+
+	/* Reset device, if in active state */
+	if (roc_ml_mlip_is_enabled(&mldev->roc))
+		roc_ml_mlip_reset(&mldev->roc, true);
+
+	/* Load firmware */
+	fw->data = PLT_PTR_ADD(mz->addr, sizeof(struct cn10k_ml_req));
+	ret = cn10k_ml_fw_load_cn10ka(fw, fw_buffer, fw_size);
+	if (fw_buffer != NULL)
+		free(fw_buffer);
+	if (ret < 0)
+		cn10k_ml_fw_unload(mldev);
+
+	return ret;
+}
+
+void
+cn10k_ml_fw_unload(struct cn10k_ml_dev *mldev)
+{
+	const struct plt_memzone *mz;
+	uint64_t reg_val;
+
+	/* Disable and reset device */
+	reg_val = roc_ml_reg_read64(&mldev->roc, ML_CFG);
+	reg_val &= ~ROC_ML_CFG_MLIP_ENA;
+	roc_ml_reg_write64(&mldev->roc, reg_val, ML_CFG);
+	roc_ml_mlip_reset(&mldev->roc, true);
+
+	mz = plt_memzone_lookup(FW_MEMZONE_NAME);
+	if (mz != NULL)
+		plt_memzone_free(mz);
+}
+
 static struct rte_pci_id pci_id_ml_table[] = {
 	{RTE_PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVID_CN10K_ML_PF)},
 	/* sentinel */
diff --git a/drivers/ml/cnxk/cn10k_ml_dev.h b/drivers/ml/cnxk/cn10k_ml_dev.h
index 5333566cff..00d23eb3ca 100644
--- a/drivers/ml/cnxk/cn10k_ml_dev.h
+++ b/drivers/ml/cnxk/cn10k_ml_dev.h
@@ -10,6 +10,9 @@ 
 /* Marvell OCTEON CN10K ML PMD device name */
 #define MLDEV_NAME_CN10K_PMD ml_cn10k
 
+/* Firmware version string length */
+#define MLDEV_FIRMWARE_VERSION_LENGTH 32
+
 /* Device alignment size */
 #define ML_CN10K_ALIGN_SIZE 128
 
@@ -28,6 +31,19 @@ 
 /* ML command timeout in seconds */
 #define ML_CN10K_CMD_TIMEOUT 5
 
+/* Poll mode job state */
+#define ML_CN10K_POLL_JOB_START	 0
+#define ML_CN10K_POLL_JOB_FINISH 1
+
+/* Job types */
+enum cn10k_ml_job_type {
+	ML_CN10K_JOB_TYPE_MODEL_RUN = 0,
+	ML_CN10K_JOB_TYPE_MODEL_STOP,
+	ML_CN10K_JOB_TYPE_MODEL_START,
+	ML_CN10K_JOB_TYPE_FIRMWARE_LOAD,
+	ML_CN10K_JOB_TYPE_FIRMWARE_SELFTEST,
+};
+
 /* Device configuration state enum */
 enum cn10k_ml_dev_state {
 	/* Probed and not configured */
@@ -43,6 +59,136 @@  enum cn10k_ml_dev_state {
 	ML_CN10K_DEV_STATE_CLOSED
 };
 
+/* Firmware stats */
+struct cn10k_ml_fw_stats {
+	/* Firmware start cycle */
+	uint64_t fw_start;
+
+	/* Firmware end cycle */
+	uint64_t fw_end;
+
+	/* Hardware start cycle */
+	uint64_t hw_start;
+
+	/* Hardware end cycle */
+	uint64_t hw_end;
+};
+
+/* Result structure */
+struct cn10k_ml_result {
+	/* Job error code */
+	uint64_t error_code;
+
+	/* Firmware stats */
+	struct cn10k_ml_fw_stats stats;
+
+	/* User context pointer */
+	void *user_ptr;
+};
+
+/* Firmware capability structure */
+union cn10k_ml_fw_cap {
+	uint64_t u64;
+
+	struct {
+		/* CMPC completion support */
+		uint64_t cmpc_completions : 1;
+
+		/* Poll mode completion support */
+		uint64_t poll_completions : 1;
+
+		/* SSO completion support */
+		uint64_t sso_completions : 1;
+
+		/* Support for model side loading */
+		uint64_t side_load_model : 1;
+
+		/* Batch execution */
+		uint64_t batch_run : 1;
+
+		/* Max number of models to be loaded in parallel */
+		uint64_t max_models : 8;
+
+		/* Firmware statistics */
+		uint64_t fw_stats : 1;
+
+		/* Hardware statistics */
+		uint64_t hw_stats : 1;
+
+		/* Max number of batches */
+		uint64_t max_num_batches : 16;
+
+		uint64_t rsvd : 33;
+	} s;
+};
+
+/* Firmware debug info structure */
+struct cn10k_ml_fw_debug {
+	/* ACC core 0 debug buffer */
+	uint64_t core0_debug_ptr;
+
+	/* ACC core 1 debug buffer */
+	uint64_t core1_debug_ptr;
+
+	/* ACC core 0 exception state buffer */
+	uint64_t core0_exception_buffer;
+
+	/* ACC core 1 exception state buffer */
+	uint64_t core1_exception_buffer;
+
+	/* Debug buffer size per core */
+	uint32_t debug_buffer_size;
+
+	/* Exception state dump size */
+	uint32_t exception_state_size;
+};
+
+/* Job descriptor header (32 bytes) */
+struct cn10k_ml_jd_header {
+	/* Job completion structure */
+	struct ml_jce_s jce;
+
+	/* Model ID */
+	uint64_t model_id : 8;
+
+	/* Job type */
+	uint64_t job_type : 8;
+
+	/* Flags for fast-path jobs */
+	uint64_t fp_flags : 16;
+
+	/* Flags for slow-path jobs */
+	uint64_t sp_flags : 16;
+	uint64_t rsvd : 16;
+
+	/* Job result pointer */
+	uint64_t *result;
+};
+
+/* Job descriptor structure */
+struct cn10k_ml_jd {
+	/* Job descriptor header (32 bytes) */
+	struct cn10k_ml_jd_header hdr;
+
+	union {
+		struct cn10k_ml_jd_section_fw_load {
+			/* Firmware capability structure (8 bytes) */
+			union cn10k_ml_fw_cap cap;
+
+			/* Firmware version (32 bytes) */
+			uint8_t version[MLDEV_FIRMWARE_VERSION_LENGTH];
+
+			/* Debug capability structure (40 bytes) */
+			struct cn10k_ml_fw_debug debug;
+
+			/* Flags to control error handling */
+			uint64_t flags;
+
+			uint8_t rsvd[8];
+		} fw_load;
+	};
+};
+
 /* ML firmware structure */
 struct cn10k_ml_fw {
 	/* Device reference */
@@ -50,6 +196,12 @@  struct cn10k_ml_fw {
 
 	/* Firmware file path */
 	const char *path;
+
+	/* Data buffer */
+	uint8_t *data;
+
+	/* Firmware load / handshake request structure */
+	struct cn10k_ml_req *req;
 };
 
 /* Device private data */
@@ -64,4 +216,8 @@  struct cn10k_ml_dev {
 	struct cn10k_ml_fw fw;
 };
 
+uint64_t cn10k_ml_fw_flags_get(struct cn10k_ml_fw *fw);
+int cn10k_ml_fw_load(struct cn10k_ml_dev *mldev);
+void cn10k_ml_fw_unload(struct cn10k_ml_dev *mldev);
+
 #endif /* _CN10K_ML_DEV_H_ */
diff --git a/drivers/ml/cnxk/cn10k_ml_ops.c b/drivers/ml/cnxk/cn10k_ml_ops.c
index 3a78d8c816..3df1254dca 100644
--- a/drivers/ml/cnxk/cn10k_ml_ops.c
+++ b/drivers/ml/cnxk/cn10k_ml_ops.c
@@ -30,6 +30,7 @@  cn10k_ml_dev_configure(struct rte_ml_dev *dev, const struct rte_ml_dev_config *c
 {
 	struct rte_ml_dev_info dev_info;
 	struct cn10k_ml_dev *mldev;
+	int ret;
 
 	if (dev == NULL || conf == NULL)
 		return -EINVAL;
@@ -51,6 +52,11 @@  cn10k_ml_dev_configure(struct rte_ml_dev *dev, const struct rte_ml_dev_config *c
 	if (mldev->state == ML_CN10K_DEV_STATE_PROBED) {
 		plt_ml_dbg("Configuring ML device, nb_queue_pairs = %u, nb_models = %u",
 			   conf->nb_queue_pairs, conf->nb_models);
+
+		/* Load firmware */
+		ret = cn10k_ml_fw_load(mldev);
+		if (ret != 0)
+			return ret;
 	} else if (mldev->state == ML_CN10K_DEV_STATE_CONFIGURED) {
 		plt_ml_dbg("Re-configuring ML device, nb_queue_pairs = %u, nb_models = %u",
 			   conf->nb_queue_pairs, conf->nb_models);
@@ -77,6 +83,21 @@  cn10k_ml_dev_close(struct rte_ml_dev *dev)
 
 	mldev = dev->data->dev_private;
 
+	/* Unload firmware */
+	cn10k_ml_fw_unload(mldev);
+
+	/* Clear scratch registers */
+	roc_ml_reg_write64(&mldev->roc, 0, ML_SCRATCH_WORK_PTR);
+	roc_ml_reg_write64(&mldev->roc, 0, ML_SCRATCH_FW_CTRL);
+	roc_ml_reg_write64(&mldev->roc, 0, ML_SCRATCH_DBG_BUFFER_HEAD_C0);
+	roc_ml_reg_write64(&mldev->roc, 0, ML_SCRATCH_DBG_BUFFER_TAIL_C0);
+	roc_ml_reg_write64(&mldev->roc, 0, ML_SCRATCH_DBG_BUFFER_HEAD_C1);
+	roc_ml_reg_write64(&mldev->roc, 0, ML_SCRATCH_DBG_BUFFER_TAIL_C1);
+
+	/* Reset ML_MLR_BASE */
+	roc_ml_reg_write64(&mldev->roc, 0, ML_MLR_BASE);
+	plt_ml_dbg("ML_MLR_BASE = 0x%016lx", roc_ml_reg_read64(&mldev->roc, ML_MLR_BASE));
+
 	mldev->state = ML_CN10K_DEV_STATE_CLOSED;
 
 	/* Remove PCI device */
diff --git a/drivers/ml/cnxk/cn10k_ml_ops.h b/drivers/ml/cnxk/cn10k_ml_ops.h
index b14221d02c..fe18730aca 100644
--- a/drivers/ml/cnxk/cn10k_ml_ops.h
+++ b/drivers/ml/cnxk/cn10k_ml_ops.h
@@ -5,6 +5,20 @@ 
 #ifndef _CN10K_ML_OPS_H_
 #define _CN10K_ML_OPS_H_
 
+#include "cn10k_ml_dev.h"
+
+/* ML request */
+struct cn10k_ml_req {
+	/* Job descriptor */
+	struct cn10k_ml_jd jd;
+
+	/* Job result */
+	struct cn10k_ml_result result;
+
+	/* Status field for poll mode requests */
+	volatile uint64_t status;
+} __rte_aligned(ROC_ALIGN);
+
 /* Device ops */
 extern struct rte_ml_dev_ops cn10k_ml_ops;
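
From the application side, the new firmware handling is driven by the
mldev state machine: configuring a probed device triggers
cn10k_ml_fw_load(), and closing it triggers cn10k_ml_fw_unload(),
which holds the MLIP in reset. Below is a minimal usage sketch,
assuming the public rte_mldev API this series targets; the config
field names are taken from rte_mldev.h of the same release, and error
handling is trimmed for brevity.

#include <rte_lcore.h>
#include <rte_mldev.h>

static int
ml_dev_bringup_sketch(int16_t dev_id)
{
	struct rte_ml_dev_config conf = {
		.socket_id = rte_socket_id(),
		.nb_models = 1,
		.nb_queue_pairs = 1,
	};
	int ret;

	/* PROBED -> CONFIGURED: the firmware image is read from the
	 * configured path, the MLIP is reset and the firmware is
	 * booted. Job execution stays disabled until device start.
	 */
	ret = rte_ml_dev_configure(dev_id, &conf);
	if (ret != 0)
		return ret;

	/* ... start the device, load models, run inference jobs ... */

	/* -> CLOSED: firmware is unloaded, scratch registers are
	 * cleared and the MLIP is held in reset. The device cannot be
	 * reconfigured after this call.
	 */
	return rte_ml_dev_close(dev_id);
}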