[v2,30/37] ml/cnxk: add support to handle extended dev stats

Message ID 20221208201806.21893-31-syalavarthi@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series: Implementation of ML CNXK driver

Checks

ci/checkpatch: success (coding style OK)

Commit Message

Srikanth Yalavarthi Dec. 8, 2022, 8:17 p.m. UTC
  Added support to handle ML device extended stats. Support
is enabled to get xstats names, get xstats values and reset
xstats. Supported xstats include average, minimum and maximum
hardware and firmware latency.

Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
---
 drivers/ml/cnxk/cn10k_ml_dev.h   |   3 +
 drivers/ml/cnxk/cn10k_ml_model.h |  57 +++++
 drivers/ml/cnxk/cn10k_ml_ops.c   | 356 ++++++++++++++++++++++++++++++-
 3 files changed, 415 insertions(+), 1 deletion(-)
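
As a usage illustration (not part of this patch), below is a minimal
application-side sketch of the xstats flow this driver implements. It assumes
the rte_mldev library API this series targets, i.e. rte_ml_dev_xstats_names_get(),
rte_ml_dev_xstats_get() and rte_ml_dev_xstats_reset() taking a device id plus
array/count arguments mirroring the driver ops in the diff, and the
struct rte_ml_dev_xstats_map id/name members used there; exact library
signatures may differ in the final mldev spec.

/* Sketch assuming the rte_mldev xstats API of this series; not part of the patch. */
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

#include <rte_mldev.h>

/* Dump and reset per-model latency xstats of an ML device. */
static void
dump_model_xstats(int16_t dev_id)
{
	struct rte_ml_dev_xstats_map *map;
	uint64_t *values;
	uint16_t *ids;
	int nb, i;

	/* With a NULL map the driver returns the number of xstats available. */
	nb = rte_ml_dev_xstats_names_get(dev_id, NULL, 0);
	if (nb <= 0)
		return;

	map = calloc(nb, sizeof(*map));
	ids = calloc(nb, sizeof(*ids));
	values = calloc(nb, sizeof(*values));
	if (map == NULL || ids == NULL || values == NULL)
		goto done;

	/* Names are per model, e.g. "<model-name>-Avg-HW-Latency-cycles". */
	nb = rte_ml_dev_xstats_names_get(dev_id, map, nb);
	for (i = 0; i < nb; i++)
		ids[i] = map[i].id;

	if (rte_ml_dev_xstats_get(dev_id, ids, values, nb) > 0) {
		for (i = 0; i < nb; i++)
			printf("%s: %" PRIu64 "\n", map[i].name, values[i]);
	}

	/* A NULL id list resets the latency totals/min/max for all models. */
	rte_ml_dev_xstats_reset(dev_id, NULL, 0);

done:
	free(map);
	free(ids);
	free(values);
}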
  

Patch

diff --git a/drivers/ml/cnxk/cn10k_ml_dev.h b/drivers/ml/cnxk/cn10k_ml_dev.h
index f292078920..fadca2a9f5 100644
--- a/drivers/ml/cnxk/cn10k_ml_dev.h
+++ b/drivers/ml/cnxk/cn10k_ml_dev.h
@@ -372,6 +372,9 @@  struct cn10k_ml_dev {
 
 	/* Number of models loaded */
 	uint16_t nb_models_loaded;
+
+	/* xstats status */
+	bool xstats_enabled;
 };
 
 uint64_t cn10k_ml_fw_flags_get(struct cn10k_ml_fw *fw);
diff --git a/drivers/ml/cnxk/cn10k_ml_model.h b/drivers/ml/cnxk/cn10k_ml_model.h
index 2372ac9b72..9d8068a173 100644
--- a/drivers/ml/cnxk/cn10k_ml_model.h
+++ b/drivers/ml/cnxk/cn10k_ml_model.h
@@ -402,6 +402,57 @@  struct cn10k_ml_model_addr {
 	uint32_t total_output_sz_d;
 };
 
+/* Extended stats types enum */
+enum cn10k_ml_model_xstats_type {
+	/* Average hardware latency */
+	avg_hw_latency = 0,
+
+	/* Minimum hardware latency */
+	min_hw_latency,
+
+	/* Maximum hardware latency */
+	max_hw_latency,
+
+	/* Average firmware latency */
+	avg_fw_latency,
+
+	/* Minimum firmware latency */
+	min_fw_latency,
+
+	/* Maximum firmware latency */
+	max_fw_latency,
+};
+
+/* Model fast-path stats */
+struct cn10k_ml_model_stats {
+	/* Total hardware latency, sum of all inferences */
+	uint64_t hw_latency_tot;
+
+	/* Minimum hardware latency */
+	uint64_t hw_latency_min;
+
+	/* Maximum hardware latency */
+	uint64_t hw_latency_max;
+
+	/* Total firmware latency, sum of all inferences */
+	uint64_t fw_latency_tot;
+
+	/* Minimum firmware latency */
+	uint64_t fw_latency_min;
+
+	/* Maximum firmware latency */
+	uint64_t fw_latency_max;
+
+	/* Total jobs dequeued */
+	uint64_t dequeued_count;
+
+	/* Hardware stats reset index */
+	uint64_t hw_reset_count;
+
+	/* Firmware stats reset index */
+	uint64_t fw_reset_count;
+};
+
 /* ML Model Object */
 struct cn10k_ml_model {
 	/* Device reference */
@@ -441,6 +492,12 @@  struct cn10k_ml_model {
 
 	/* Model slow-path operations request pointer */
 	struct cn10k_ml_req *req;
+
+	/* Model stats for burst ops */
+	struct cn10k_ml_model_stats *burst_stats;
+
+	/* Model stats for sync ops */
+	struct cn10k_ml_model_stats *sync_stats;
 };
 
 int cn10k_ml_model_metadata_check(uint8_t *buffer, uint64_t size);
diff --git a/drivers/ml/cnxk/cn10k_ml_ops.c b/drivers/ml/cnxk/cn10k_ml_ops.c
index 732d0a63ba..eeea98a4d5 100644
--- a/drivers/ml/cnxk/cn10k_ml_ops.c
+++ b/drivers/ml/cnxk/cn10k_ml_ops.c
@@ -354,6 +354,134 @@  cn10k_ml_prep_fp_job_descriptor(struct rte_ml_dev *dev, struct cn10k_ml_req *req
 	req->jd.model_run.num_batches = op->nb_batches;
 }
 
+#define ML_AVG_FOREACH_QP(dev, model, qp_id, str, value, count)                                    \
+	do {                                                                                       \
+		value = 0;                                                                         \
+		for (qp_id = 0; qp_id < dev->data->nb_queue_pairs; qp_id++) {                      \
+			value += model->burst_stats[qp_id].str##_latency_tot;                      \
+			count += model->burst_stats[qp_id].dequeued_count -                        \
+				 model->burst_stats[qp_id].str##_reset_count;                      \
+		}                                                                                  \
+		value = value / count;                                                             \
+	} while (0)
+
+#define ML_MIN_FOREACH_QP(dev, model, qp_id, str, value, count)                                    \
+	do {                                                                                       \
+		value = UINT64_MAX;                                                                \
+		for (qp_id = 0; qp_id < dev->data->nb_queue_pairs; qp_id++) {                      \
+			value = PLT_MIN(value, model->burst_stats[qp_id].str##_latency_min);       \
+			count += model->burst_stats[qp_id].dequeued_count -                        \
+				 model->burst_stats[qp_id].str##_reset_count;                      \
+		}                                                                                  \
+		if (count == 0)                                                                    \
+			value = 0;                                                                 \
+	} while (0)
+
+#define ML_MAX_FOREACH_QP(dev, model, qp_id, str, value, count)                                    \
+	do {                                                                                       \
+		value = 0;                                                                         \
+		for (qp_id = 0; qp_id < dev->data->nb_queue_pairs; qp_id++) {                      \
+			value = PLT_MAX(value, model->burst_stats[qp_id].str##_latency_max);       \
+			count += model->burst_stats[qp_id].dequeued_count -                        \
+				 model->burst_stats[qp_id].str##_reset_count;                      \
+		}                                                                                  \
+		if (count == 0)                                                                    \
+			value = 0;                                                                 \
+	} while (0)
+
+static uint64_t
+cn10k_ml_model_xstat_get(struct rte_ml_dev *dev, uint16_t model_id,
+			 enum cn10k_ml_model_xstats_type type)
+{
+	struct cn10k_ml_model *model;
+	uint64_t count = 0;
+	uint64_t value;
+	uint32_t qp_id;
+
+	model = dev->data->models[model_id];
+	if (model == NULL)
+		return 0;
+
+	switch (type) {
+	case avg_hw_latency:
+		ML_AVG_FOREACH_QP(dev, model, qp_id, hw, value, count);
+		break;
+	case min_hw_latency:
+		ML_MIN_FOREACH_QP(dev, model, qp_id, hw, value, count);
+		break;
+	case max_hw_latency:
+		ML_MAX_FOREACH_QP(dev, model, qp_id, hw, value, count);
+		break;
+	case avg_fw_latency:
+		ML_AVG_FOREACH_QP(dev, model, qp_id, fw, value, count);
+		break;
+	case min_fw_latency:
+		ML_MIN_FOREACH_QP(dev, model, qp_id, fw, value, count);
+		break;
+	case max_fw_latency:
+		ML_MAX_FOREACH_QP(dev, model, qp_id, fw, value, count);
+		break;
+	default:
+		value = 0;
+	}
+
+	return value;
+}
+
+#define ML_AVG_RESET_FOREACH_QP(dev, model, qp_id, str)                                            \
+	do {                                                                                       \
+		for (qp_id = 0; qp_id < dev->data->nb_queue_pairs; qp_id++) {                      \
+			model->burst_stats[qp_id].str##_latency_tot = 0;                           \
+			model->burst_stats[qp_id].str##_reset_count =                              \
+				model->burst_stats[qp_id].dequeued_count;                          \
+		}                                                                                  \
+	} while (0)
+
+#define ML_MIN_RESET_FOREACH_QP(dev, model, qp_id, str)                                            \
+	do {                                                                                       \
+		for (qp_id = 0; qp_id < dev->data->nb_queue_pairs; qp_id++)                        \
+			model->burst_stats[qp_id].str##_latency_min = UINT64_MAX;                  \
+	} while (0)
+
+#define ML_MAX_RESET_FOREACH_QP(dev, model, qp_id, str)                                            \
+	do {                                                                                       \
+		for (qp_id = 0; qp_id < dev->data->nb_queue_pairs; qp_id++)                        \
+			model->burst_stats[qp_id].str##_latency_max = 0;                           \
+	} while (0)
+
+static void
+cn10k_ml_model_xstat_reset(struct rte_ml_dev *dev, uint16_t model_id,
+			   enum cn10k_ml_model_xstats_type type)
+{
+	struct cn10k_ml_model *model;
+	uint32_t qp_id;
+
+	model = dev->data->models[model_id];
+
+	switch (type) {
+	case avg_hw_latency:
+		ML_AVG_RESET_FOREACH_QP(dev, model, qp_id, hw);
+		break;
+	case min_hw_latency:
+		ML_MIN_RESET_FOREACH_QP(dev, model, qp_id, hw);
+		break;
+	case max_hw_latency:
+		ML_MAX_RESET_FOREACH_QP(dev, model, qp_id, hw);
+		break;
+	case avg_fw_latency:
+		ML_AVG_RESET_FOREACH_QP(dev, model, qp_id, fw);
+		break;
+	case min_fw_latency:
+		ML_MIN_RESET_FOREACH_QP(dev, model, qp_id, fw);
+		break;
+	case max_fw_latency:
+		ML_MAX_RESET_FOREACH_QP(dev, model, qp_id, fw);
+		break;
+	default:
+		return;
+	}
+}
+
 static int
 cn10k_ml_dev_info_get(struct rte_ml_dev *dev, struct rte_ml_dev_info *dev_info)
 {
@@ -519,6 +647,13 @@  cn10k_ml_dev_configure(struct rte_ml_dev *dev, const struct rte_ml_dev_config *c
 
 	rte_spinlock_init(&ocm->lock);
 
+	/* Check firmware stats */
+	if ((mldev->fw.req->jd.fw_load.cap.s.hw_stats) &&
+	    (mldev->fw.req->jd.fw_load.cap.s.fw_stats))
+		mldev->xstats_enabled = true;
+	else
+		mldev->xstats_enabled = false;
+
 	dev->enqueue_burst = cn10k_ml_enqueue_burst;
 	dev->dequeue_burst = cn10k_ml_dequeue_burst;
 	dev->op_error_get = cn10k_ml_op_error_get;
@@ -714,6 +849,170 @@  cn10k_ml_dev_stats_reset(struct rte_ml_dev *dev)
 	}
 }
 
+/* Model xstats names */
+struct rte_ml_dev_xstats_map cn10k_ml_model_xstats_table[] = {
+	{avg_hw_latency, "Avg-HW-Latency"}, {min_hw_latency, "Min-HW-Latency"},
+	{max_hw_latency, "Max-HW-Latency"}, {avg_fw_latency, "Avg-FW-Latency"},
+	{min_fw_latency, "Min-FW-Latency"}, {max_fw_latency, "Max-FW-Latency"},
+};
+
+static int
+cn10k_ml_dev_xstats_names_get(struct rte_ml_dev *dev, struct rte_ml_dev_xstats_map *xstats_map,
+			      uint32_t size)
+{
+	struct rte_ml_dev_info dev_info;
+	struct cn10k_ml_model *model;
+	struct cn10k_ml_dev *mldev;
+	uint32_t model_id;
+	uint32_t count;
+	uint32_t type;
+	uint32_t id;
+
+	mldev = dev->data->dev_private;
+	if (!mldev->xstats_enabled)
+		return 0;
+
+	if (xstats_map == NULL)
+		return PLT_DIM(cn10k_ml_model_xstats_table) * mldev->nb_models_loaded;
+
+	/* Model xstats names */
+	count = 0;
+	cn10k_ml_dev_info_get(dev, &dev_info);
+
+	for (id = 0; id < PLT_DIM(cn10k_ml_model_xstats_table) * dev_info.max_models; id++) {
+		model_id = id / PLT_DIM(cn10k_ml_model_xstats_table);
+		model = dev->data->models[model_id];
+
+		if (model == NULL)
+			continue;
+
+		xstats_map[count].id = id;
+		type = id % PLT_DIM(cn10k_ml_model_xstats_table);
+
+		snprintf(xstats_map[count].name, RTE_ML_STR_MAX, "%s-%s-cycles",
+			 model->metadata.model.name, cn10k_ml_model_xstats_table[type].name);
+
+		count++;
+		if (count == size)
+			break;
+	}
+
+	return count;
+}
+
+static int
+cn10k_ml_dev_xstats_by_name_get(struct rte_ml_dev *dev, const char *name, uint16_t *stat_id,
+				uint64_t *value)
+{
+	struct rte_ml_dev_xstats_map *xstats_map;
+	struct rte_ml_dev_info dev_info;
+	struct cn10k_ml_dev *mldev;
+	uint32_t num_xstats;
+	uint32_t model_id;
+	uint32_t type;
+	uint32_t id;
+
+	mldev = dev->data->dev_private;
+	if (!mldev->xstats_enabled)
+		return 0;
+
+	num_xstats = PLT_DIM(cn10k_ml_model_xstats_table) * mldev->nb_models_loaded;
+	xstats_map = rte_zmalloc("cn10k_ml_xstats_map",
+				 sizeof(struct rte_ml_dev_xstats_map) * num_xstats, 0);
+	cn10k_ml_dev_xstats_names_get(dev, xstats_map, num_xstats);
+
+	cn10k_ml_dev_info_get(dev, &dev_info);
+	for (id = 0; id < PLT_DIM(cn10k_ml_model_xstats_table) * dev_info.max_models; id++) {
+		if (strncmp(name, xstats_map[id].name, strlen(name)) == 0) {
+			*stat_id = id;
+			rte_free(xstats_map);
+			break;
+		}
+	}
+
+	if (id == PLT_DIM(cn10k_ml_model_xstats_table) * dev_info.max_models)
+		return -EINVAL;
+
+	model_id = id / PLT_DIM(cn10k_ml_model_xstats_table);
+	type = id % PLT_DIM(cn10k_ml_model_xstats_table);
+	*value = cn10k_ml_model_xstat_get(dev, model_id, type);
+
+	return 0;
+}
+
+static int
+cn10k_ml_dev_xstats_get(struct rte_ml_dev *dev, const uint16_t *stat_ids, uint64_t *values,
+			uint16_t nb_ids)
+{
+	struct cn10k_ml_model *model;
+	struct cn10k_ml_dev *mldev;
+	uint32_t model_id;
+	uint32_t count;
+	uint32_t type;
+	uint32_t i;
+
+	mldev = dev->data->dev_private;
+	if (!mldev->xstats_enabled)
+		return 0;
+
+	count = 0;
+	for (i = 0; i < nb_ids; i++) {
+		model_id = stat_ids[i] / PLT_DIM(cn10k_ml_model_xstats_table);
+		model = dev->data->models[model_id];
+
+		if (model == NULL)
+			continue;
+
+		type = stat_ids[i] % PLT_DIM(cn10k_ml_model_xstats_table);
+		values[i] = cn10k_ml_model_xstat_get(dev, model_id, type);
+		count++;
+	}
+
+	return count;
+}
+
+static int
+cn10k_ml_dev_xstats_reset(struct rte_ml_dev *dev, const uint16_t *stat_ids, uint16_t nb_ids)
+{
+	struct rte_ml_dev_info dev_info;
+	struct cn10k_ml_model *model;
+	struct cn10k_ml_dev *mldev;
+	uint32_t model_id;
+	uint32_t type;
+	uint32_t i;
+
+	mldev = dev->data->dev_private;
+	if (!mldev->xstats_enabled)
+		return 0;
+
+	cn10k_ml_dev_info_get(dev, &dev_info);
+	if (stat_ids == NULL) {
+		for (i = 0; i < PLT_DIM(cn10k_ml_model_xstats_table) * dev_info.max_models; i++) {
+			model_id = i / PLT_DIM(cn10k_ml_model_xstats_table);
+			model = dev->data->models[model_id];
+
+			if (model == NULL)
+				continue;
+
+			type = i % PLT_DIM(cn10k_ml_model_xstats_table);
+			cn10k_ml_model_xstat_reset(dev, model_id, type);
+		}
+	} else {
+		for (i = 0; i < nb_ids; i++) {
+			model_id = stat_ids[i] / PLT_DIM(cn10k_ml_model_xstats_table);
+			model = dev->data->models[model_id];
+
+			if (model == NULL)
+				continue;
+
+			type = stat_ids[i] % PLT_DIM(cn10k_ml_model_xstats_table);
+			cn10k_ml_model_xstat_reset(dev, model_id, type);
+		}
+	}
+
+	return 0;
+}
+
 static int
 cn10k_ml_dev_dump(struct rte_ml_dev *dev, FILE *fp)
 {
@@ -856,6 +1155,7 @@  cn10k_ml_model_load(struct rte_ml_dev *dev, struct rte_ml_model_params *params,
 
 	char str[RTE_MEMZONE_NAMESIZE];
 	const struct plt_memzone *mz;
+	size_t model_stats_size;
 	size_t model_data_size;
 	size_t model_info_size;
 	uint8_t *base_dma_addr;
@@ -864,6 +1164,7 @@  cn10k_ml_model_load(struct rte_ml_dev *dev, struct rte_ml_model_params *params,
 	uint64_t mz_size;
 	uint16_t idx;
 	bool found;
+	int qp_id;
 	int ret;
 
 	ret = cn10k_ml_model_metadata_check(params->addr, params->size);
@@ -900,10 +1201,12 @@  cn10k_ml_model_load(struct rte_ml_dev *dev, struct rte_ml_model_params *params,
 			  metadata->model.num_input * sizeof(struct rte_ml_io_info) +
 			  metadata->model.num_output * sizeof(struct rte_ml_io_info);
 	model_info_size = PLT_ALIGN_CEIL(model_info_size, ML_CN10K_ALIGN_SIZE);
+	model_stats_size = (dev->data->nb_queue_pairs + 1) * sizeof(struct cn10k_ml_model_stats);
 
 	mz_size = PLT_ALIGN_CEIL(sizeof(struct cn10k_ml_model), ML_CN10K_ALIGN_SIZE) +
 		  2 * model_data_size + model_info_size +
-		  PLT_ALIGN_CEIL(sizeof(struct cn10k_ml_req), ML_CN10K_ALIGN_SIZE);
+		  PLT_ALIGN_CEIL(sizeof(struct cn10k_ml_req), ML_CN10K_ALIGN_SIZE) +
+		  model_stats_size;
 
 	/* Allocate memzone for model object and model data */
 	snprintf(str, RTE_MEMZONE_NAMESIZE, "%s_%u", CN10K_ML_MODEL_MEMZONE_NAME, idx);
@@ -952,6 +1255,24 @@  cn10k_ml_model_load(struct rte_ml_dev *dev, struct rte_ml_model_params *params,
 	/* Set model info */
 	cn10k_ml_model_info_set(dev, model);
 
+	/* Reset burst and sync stats */
+	model->burst_stats = PLT_PTR_ADD(
+		model->req, PLT_ALIGN_CEIL(sizeof(struct cn10k_ml_req), ML_CN10K_ALIGN_SIZE));
+	for (qp_id = 0; qp_id < dev->data->nb_queue_pairs + 1; qp_id++) {
+		model->burst_stats[qp_id].hw_latency_tot = 0;
+		model->burst_stats[qp_id].hw_latency_min = UINT64_MAX;
+		model->burst_stats[qp_id].hw_latency_max = 0;
+		model->burst_stats[qp_id].fw_latency_tot = 0;
+		model->burst_stats[qp_id].fw_latency_min = UINT64_MAX;
+		model->burst_stats[qp_id].fw_latency_max = 0;
+		model->burst_stats[qp_id].hw_reset_count = 0;
+		model->burst_stats[qp_id].fw_reset_count = 0;
+		model->burst_stats[qp_id].dequeued_count = 0;
+	}
+	model->sync_stats =
+		PLT_PTR_ADD(model->burst_stats,
+			    dev->data->nb_queue_pairs * sizeof(struct cn10k_ml_model_stats));
+
 	plt_spinlock_init(&model->lock);
 	model->state = ML_CN10K_MODEL_STATE_LOADED;
 	dev->data->models[idx] = model;
@@ -1506,15 +1827,44 @@  static __rte_always_inline void
 cn10k_ml_result_update(struct rte_ml_dev *dev, int qp_id, struct cn10k_ml_result *result,
 		       struct rte_ml_op *op)
 {
+	struct cn10k_ml_model_stats *stats;
+	struct cn10k_ml_model *model;
 	struct cn10k_ml_dev *mldev;
 	struct cn10k_ml_qp *qp;
+	uint64_t hw_latency;
+	uint64_t fw_latency;
 
 	if (likely(result->error_code.u64 == 0)) {
+		model = dev->data->models[op->model_id];
 		if (likely(qp_id >= 0)) {
 			qp = dev->data->queue_pairs[qp_id];
 			qp->stats.dequeued_count++;
+			stats = &model->burst_stats[qp_id];
+		} else {
+			stats = model->sync_stats;
+		}
+
+		if (unlikely(stats->dequeued_count == stats->hw_reset_count)) {
+			stats->hw_latency_min = UINT64_MAX;
+			stats->hw_latency_max = 0;
 		}
 
+		if (unlikely(stats->dequeued_count == stats->fw_reset_count)) {
+			stats->fw_latency_min = UINT64_MAX;
+			stats->fw_latency_max = 0;
+		}
+
+		hw_latency = result->stats.hw_end - result->stats.hw_start;
+		fw_latency = result->stats.fw_end - result->stats.fw_start - hw_latency;
+
+		stats->hw_latency_tot += hw_latency;
+		stats->hw_latency_min = PLT_MIN(stats->hw_latency_min, hw_latency);
+		stats->hw_latency_max = PLT_MAX(stats->hw_latency_max, hw_latency);
+		stats->fw_latency_tot += fw_latency;
+		stats->fw_latency_min = PLT_MIN(stats->fw_latency_min, fw_latency);
+		stats->fw_latency_max = PLT_MAX(stats->fw_latency_max, fw_latency);
+		stats->dequeued_count++;
+
 		op->impl_opaque = result->error_code.u64;
 		op->status = RTE_ML_OP_STATUS_SUCCESS;
 	} else {
@@ -1748,6 +2098,10 @@  struct rte_ml_dev_ops cn10k_ml_ops = {
 	/* Stats ops */
 	.dev_stats_get = cn10k_ml_dev_stats_get,
 	.dev_stats_reset = cn10k_ml_dev_stats_reset,
+	.dev_xstats_names_get = cn10k_ml_dev_xstats_names_get,
+	.dev_xstats_by_name_get = cn10k_ml_dev_xstats_by_name_get,
+	.dev_xstats_get = cn10k_ml_dev_xstats_get,
+	.dev_xstats_reset = cn10k_ml_dev_xstats_reset,
 
 	/* Model ops */
 	.model_load = cn10k_ml_model_load,
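
For reference, the per-model xstat ids produced by cn10k_ml_dev_xstats_names_get()
above are encoded as model_id * 6 + type, with the six types taken from
cn10k_ml_model_xstats_table. The helper below is an illustrative sketch of that
arithmetic (the function name is hypothetical, not part of the patch); it mirrors
the id / PLT_DIM(...) and id % PLT_DIM(...) decode used throughout the driver.

/* Illustrative sketch, not part of the patch: decode a stat id back to
 * (model_id, xstat type) using the same divide/modulo the driver applies,
 * with PLT_DIM(cn10k_ml_model_xstats_table) == 6.
 */
#define NB_MODEL_XSTATS 6 /* avg/min/max HW latency + avg/min/max FW latency */

static inline void
decode_model_xstat_id(uint16_t stat_id, uint16_t *model_id, uint16_t *type)
{
	*model_id = stat_id / NB_MODEL_XSTATS;
	*type = stat_id % NB_MODEL_XSTATS;
}

For example, stat id 13 decodes to model 2 and type 1, i.e. that model's
Min-HW-Latency counter in the table above.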