Add JD structures for load, unload and run jobs. Initialize
job command and allocate memory for request structures for slow
path jobs.
Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
---
drivers/ml/cnxk/cn10k_ml_dev.h | 99 ++++++++++++++++++++++++++++++++
drivers/ml/cnxk/cn10k_ml_model.h | 4 ++
drivers/ml/cnxk/cn10k_ml_ops.c | 19 +++++-
drivers/ml/cnxk/cn10k_ml_ops.h | 4 ++
4 files changed, 125 insertions(+), 1 deletion(-)
@@ -188,6 +188,105 @@ struct cn10k_ml_jd {
uint8_t rsvd[8];
} fw_load;
+
+ struct cn10k_ml_jd_section_model_start {
+ /* Source model start address in DDR relative to ML_MLR_BASE */
+ uint64_t model_src_ddr_addr;
+
+ /* Destination model start address in DDR relative to ML_MLR_BASE */
+ uint64_t model_dst_ddr_addr;
+
+ /* Offset to model init section in the model */
+ uint64_t model_init_offset : 32;
+
+ /* Size of init section in the model */
+ uint64_t model_init_size : 32;
+
+ /* Offset to model main section in the model */
+ uint64_t model_main_offset : 32;
+
+ /* Size of main section in the model */
+ uint64_t model_main_size : 32;
+
+ /* Offset to model finish section in the model */
+ uint64_t model_finish_offset : 32;
+
+ /* Size of finish section in the model */
+ uint64_t model_finish_size : 32;
+
+ /* Offset to WB in model bin */
+ uint64_t model_wb_offset : 32;
+
+ /* Number of model layers */
+ uint64_t num_layers : 8;
+
+ /* Number of gather entries, 0 means linear input mode (= no gather) */
+ uint64_t num_gather_entries : 8;
+
+ /* Number of scatter entries, 0 means linear output mode (= no scatter) */
+ uint64_t num_scatter_entries : 8;
+
+ /* Tile mask to load model */
+ uint64_t tilemask : 8;
+
+ /* Batch size of model */
+ uint64_t batch_size : 32;
+
+ /* OCM WB base address */
+ uint64_t ocm_wb_base_address : 32;
+
+ /* OCM WB range start */
+ uint64_t ocm_wb_range_start : 32;
+
+ /* OCM WB range end */
+ uint64_t ocm_wb_range_end : 32;
+
+ /* DDR WB base address */
+ uint64_t ddr_wb_base_address;
+
+ /* DDR WB range start */
+ uint64_t ddr_wb_range_start : 32;
+
+ /* DDR WB range end */
+ uint64_t ddr_wb_range_end : 32;
+
+ union {
+ /* Points to gather list if num_gather_entries > 0 */
+ void *gather_list;
+ struct {
+ /* Linear input mode */
+ uint64_t ddr_range_start : 32;
+ uint64_t ddr_range_end : 32;
+ } s;
+ } input;
+
+ union {
+ /* Points to scatter list if num_scatter_entries > 0 */
+ void *scatter_list;
+ struct {
+ /* Linear output mode */
+ uint64_t ddr_range_start : 32;
+ uint64_t ddr_range_end : 32;
+ } s;
+ } output;
+ } model_start;
+
+ struct cn10k_ml_jd_section_model_stop {
+ uint8_t rsvd[96];
+ } model_stop;
+
+ struct cn10k_ml_jd_section_model_run {
+ /* Address of the input for the run relative to ML_MLR_BASE */
+ uint64_t input_ddr_addr;
+
+ /* Address of the output for the run relative to ML_MLR_BASE */
+ uint64_t output_ddr_addr;
+
+ /* Number of batches to run in variable batch processing */
+ uint16_t num_batches;
+
+ uint8_t rsvd[78];
+ } model_run;
};
};
@@ -11,6 +11,7 @@
#include "cn10k_ml_dev.h"
#include "cn10k_ml_ocm.h"
+#include "cn10k_ml_ops.h"
/* Model state */
enum cn10k_ml_model_state {
@@ -426,6 +427,9 @@ struct cn10k_ml_model {
/* State */
enum cn10k_ml_model_state state;
+
+ /* Slow-path operations request pointer */
+ struct cn10k_ml_req *req;
};
int cn10k_ml_model_metadata_check(uint8_t *buffer, uint64_t size);
@@ -12,6 +12,10 @@
/* ML model macros */
#define CN10K_ML_MODEL_MEMZONE_NAME "ml_cn10k_model_mz"
+/* ML Job descriptor flags */
+#define ML_FLAGS_POLL_COMPL BIT(0)
+#define ML_FLAGS_SSO_COMPL BIT(1)
+
static void
qp_memzone_name_get(char *name, int size, int dev_id, int qp_id)
{
@@ -65,6 +69,7 @@ cn10k_ml_qp_create(const struct rte_ml_dev *dev, uint16_t qp_id, uint32_t nb_des
struct cn10k_ml_qp *qp;
uint32_t len;
uint8_t *va;
+ uint64_t i;
/* Allocate queue pair */
qp = rte_zmalloc_socket("cn10k_ml_pmd_queue_pair", sizeof(struct cn10k_ml_qp), ROC_ALIGN,
@@ -95,6 +100,12 @@ cn10k_ml_qp_create(const struct rte_ml_dev *dev, uint16_t qp_id, uint32_t nb_des
qp->queue.wait_cycles = ML_CN10K_CMD_TIMEOUT * plt_tsc_hz();
qp->nb_desc = nb_desc;
+ /* Initialize job command */
+ for (i = 0; i < qp->nb_desc; i++) {
+ memset(&qp->queue.reqs[i].jd, 0, sizeof(struct cn10k_ml_jd));
+ qp->queue.reqs[i].jcmd.w1.s.jobptr = PLT_U64_CAST(&qp->queue.reqs[i].jd);
+ }
+
return qp;
qp_free:
@@ -468,7 +479,8 @@ cn10k_ml_model_load(struct rte_ml_dev *dev, struct rte_ml_model_params *params,
metadata->finish_model.file_size + metadata->weights_bias.file_size;
model_data_size = PLT_ALIGN_CEIL(model_data_size, ML_CN10K_ALIGN_SIZE);
mz_size = PLT_ALIGN_CEIL(sizeof(struct cn10k_ml_model), ML_CN10K_ALIGN_SIZE) +
- 2 * model_data_size;
+ 2 * model_data_size +
+ PLT_ALIGN_CEIL(sizeof(struct cn10k_ml_req), ML_CN10K_ALIGN_SIZE);
/* Allocate memzone for model object and model data */
snprintf(str, RTE_MEMZONE_NAMESIZE, "%s_%u", CN10K_ML_MODEL_MEMZONE_NAME, idx);
@@ -507,6 +519,11 @@ cn10k_ml_model_load(struct rte_ml_dev *dev, struct rte_ml_model_params *params,
model->model_mem_map.wb_pages = wb_pages;
model->model_mem_map.scratch_pages = scratch_pages;
+ /* Set slow-path request address and state */
+ model->req = PLT_PTR_ADD(
+ mz->addr, PLT_ALIGN_CEIL(sizeof(struct cn10k_ml_model), ML_CN10K_ALIGN_SIZE) +
+ 2 * model_data_size);
+
plt_spinlock_init(&model->lock);
model->state = ML_CN10K_MODEL_STATE_LOADED;
dev->data->models[idx] = model;
@@ -6,6 +6,7 @@
#define _CN10K_ML_OPS_H_
#include <rte_mldev.h>
+#include <rte_mldev_pmd.h>
#include <roc_api.h>
@@ -21,6 +22,9 @@ struct cn10k_ml_req {
/* Status field for poll mode requests */
volatile uint64_t status;
+
+ /* Job command */
+ struct ml_job_cmd_s jcmd;
} __rte_aligned(ROC_ALIGN);
/* Request queue */