[v1,29/34] ml/cnxk: enable reporting model runtime as xstats

Message ID 20230830155927.3566-30-syalavarthi@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series: Implementation of revised ml/cnxk driver

Checks

Context        Check    Description
ci/checkpatch  success  coding style OK

Commit Message

Srikanth Yalavarthi Aug. 30, 2023, 3:59 p.m. UTC
  Added model xstats entries to compute runtime latency.
Allocated internal resources for TVM model xstats.
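
For reference, a minimal sketch of how an application could read the new
model-scope runtime stats exposed by this patch. It assumes the mode-based
rte_ml_dev_xstats_names_get()/rte_ml_dev_xstats_get() prototypes that the
driver ops below are written against; dump_model_rt_latency() and the fixed
buffer sizes are illustrative only and not part of this patch.

#include <inttypes.h>
#include <stdio.h>

#include <rte_common.h>
#include <rte_mldev.h>

static int
dump_model_rt_latency(int16_t dev_id, int32_t model_id)
{
        struct rte_ml_dev_xstats_map map[64];
        uint64_t values[64];
        uint16_t ids[64];
        int nb, i;

        /* Fetch the id/name map for the model-scope counters. */
        nb = rte_ml_dev_xstats_names_get(dev_id, RTE_ML_DEV_XSTATS_MODEL, model_id,
                                         map, RTE_DIM(map));
        if (nb < 0)
                return nb;
        if (nb > (int)RTE_DIM(map))
                nb = RTE_DIM(map);

        for (i = 0; i < nb; i++)
                ids[i] = map[i].id;

        /* Read the counter values for the ids collected above. */
        nb = rte_ml_dev_xstats_get(dev_id, RTE_ML_DEV_XSTATS_MODEL, model_id,
                                   ids, values, nb);
        if (nb < 0)
                return nb;

        /* Names end in "-cycles" or "-ns", depending on whether the SCLK
         * frequency was available when the xstat names were updated. */
        for (i = 0; i < nb; i++)
                printf("%s: %" PRIu64 "\n", map[i].name, values[i]);

        return 0;
}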

Signed-off-by: Srikanth Yalavarthi <syalavarthi@marvell.com>
---
 drivers/ml/cnxk/cnxk_ml_ops.c    | 182 ++++++++++++++++++++++++++++---
 drivers/ml/cnxk/cnxk_ml_ops.h    |   1 +
 drivers/ml/cnxk/cnxk_ml_xstats.h |   7 ++
 drivers/ml/cnxk/mvtvm_ml_model.h |  24 ++++
 drivers/ml/cnxk/mvtvm_ml_ops.c   |  24 +++-
 5 files changed, 223 insertions(+), 15 deletions(-)
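
For the counters added in mvtvm_ml_model.h below, an illustrative sketch of
how a dequeue path could fold one measured TVM runtime latency into the per
queue-pair stats. The fast-path update itself is not part of this patch;
mvtvm_xstats_update() and its latency argument are hypothetical, and the
driver's own code uses its platform PLT_MIN/PLT_MAX helpers instead of the
generic RTE_MIN/RTE_MAX used here.

#include <rte_common.h>

#include "mvtvm_ml_model.h"

static inline void
mvtvm_xstats_update(struct mvtvm_ml_model_xstats *xs, uint64_t latency)
{
        /* Latest, cumulative, minimum and maximum runtime latency
         * observed on this queue pair. */
        xs->tvm_rt_latency = latency;
        xs->tvm_rt_latency_tot += latency;
        xs->tvm_rt_latency_min = RTE_MIN(xs->tvm_rt_latency_min, latency);
        xs->tvm_rt_latency_max = RTE_MAX(xs->tvm_rt_latency_max, latency);

        /* Avg-RT-Latency is later derived as
         * tvm_rt_latency_tot / (dequeued_count - tvm_rt_reset_count). */
        xs->dequeued_count++;
}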
  

Patch

diff --git a/drivers/ml/cnxk/cnxk_ml_ops.c b/drivers/ml/cnxk/cnxk_ml_ops.c
index f933a2b846f..ff9ecd3c941 100644
--- a/drivers/ml/cnxk/cnxk_ml_ops.c
+++ b/drivers/ml/cnxk/cnxk_ml_ops.c
@@ -146,7 +146,8 @@  cnxk_ml_xstats_init(struct cnxk_ml_dev *cnxk_mldev)
 
 	/* Allocate memory for xstats entries. Don't allocate during reconfigure */
 	nb_stats = RTE_DIM(device_xstats) +
-		   RTE_DIM(layer_xstats) * ML_CNXK_MAX_MODELS * ML_CNXK_MODEL_MAX_LAYERS;
+		   RTE_DIM(layer_xstats) * ML_CNXK_MAX_MODELS * ML_CNXK_MODEL_MAX_LAYERS +
+		   RTE_DIM(model_xstats) * ML_CNXK_MAX_MODELS;
 	if (cnxk_mldev->xstats.entries == NULL)
 		cnxk_mldev->xstats.entries = rte_zmalloc(
 			"cnxk_ml_xstats", sizeof(struct cnxk_ml_xstats_entry) * nb_stats,
@@ -177,6 +178,25 @@  cnxk_ml_xstats_init(struct cnxk_ml_dev *cnxk_mldev)
 	for (model = 0; model < ML_CNXK_MAX_MODELS; model++) {
 		cnxk_mldev->xstats.offset_for_model[model] = stat_id;
 
+		for (i = 0; i < RTE_DIM(model_xstats); i++) {
+			cnxk_mldev->xstats.entries[stat_id].map.id = stat_id;
+			cnxk_mldev->xstats.entries[stat_id].mode = RTE_ML_DEV_XSTATS_MODEL;
+			cnxk_mldev->xstats.entries[stat_id].group = CNXK_ML_XSTATS_GROUP_MODEL;
+			cnxk_mldev->xstats.entries[stat_id].type = model_xstats[i].type;
+			cnxk_mldev->xstats.entries[stat_id].fn_id = CNXK_ML_XSTATS_FN_MODEL;
+			cnxk_mldev->xstats.entries[stat_id].obj_idx = model;
+			cnxk_mldev->xstats.entries[stat_id].layer_id = -1;
+			cnxk_mldev->xstats.entries[stat_id].reset_allowed =
+				model_xstats[i].reset_allowed;
+
+			/* Name of xstat is updated during model load */
+			snprintf(cnxk_mldev->xstats.entries[stat_id].map.name,
+				 sizeof(cnxk_mldev->xstats.entries[stat_id].map.name),
+				 "Model-%u-%s", model, model_xstats[i].name);
+
+			stat_id++;
+		}
+
 		for (layer = 0; layer < ML_CNXK_MODEL_MAX_LAYERS; layer++) {
 			cnxk_mldev->xstats.offset_for_layer[model][layer] = stat_id;
 
@@ -203,7 +223,8 @@  cnxk_ml_xstats_init(struct cnxk_ml_dev *cnxk_mldev)
 			cnxk_mldev->xstats.count_per_layer[model][layer] = RTE_DIM(layer_xstats);
 		}
 
-		cnxk_mldev->xstats.count_per_model[model] = RTE_DIM(layer_xstats);
+		cnxk_mldev->xstats.count_per_model[model] =
+			RTE_DIM(layer_xstats) + ML_CNXK_MODEL_MAX_LAYERS * RTE_DIM(model_xstats);
 	}
 
 	cnxk_mldev->xstats.count_mode_model = stat_id - cnxk_mldev->xstats.count_mode_device;
@@ -212,6 +233,42 @@  cnxk_ml_xstats_init(struct cnxk_ml_dev *cnxk_mldev)
 	return 0;
 }
 
+void
+cnxk_ml_xstats_model_name_update(struct cnxk_ml_dev *cnxk_mldev, uint16_t model_id)
+{
+	struct cnxk_ml_model *model;
+	uint16_t rclk_freq;
+	uint16_t sclk_freq;
+	uint16_t stat_id;
+	char suffix[8];
+	uint16_t i;
+
+	model = cnxk_mldev->mldev->data->models[model_id];
+	stat_id = cnxk_mldev->xstats.offset_for_model[model_id];
+
+	roc_clk_freq_get(&rclk_freq, &sclk_freq);
+	if (sclk_freq == 0)
+		strcpy(suffix, "cycles");
+	else
+		strcpy(suffix, "ns");
+
+	/* Update xstat name based on layer name and sclk availability */
+	for (i = 0; i < RTE_DIM(model_xstats); i++) {
+		if (model->type == ML_CNXK_MODEL_TYPE_GLOW)
+			snprintf(cnxk_mldev->xstats.entries[stat_id].map.name,
+				 sizeof(cnxk_mldev->xstats.entries[stat_id].map.name), "%s-%s-%s",
+				 model->glow.metadata.model.name, model_xstats[i].name, suffix);
+#ifdef RTE_MLDEV_CNXK_ENABLE_MVTVM
+		else
+			snprintf(cnxk_mldev->xstats.entries[stat_id].map.name,
+				 sizeof(cnxk_mldev->xstats.entries[stat_id].map.name), "%s-%s-%s",
+				 model->mvtvm.metadata.model.name, model_xstats[i].name, suffix);
+#endif
+
+		stat_id++;
+	}
+}
+
 static void
 cnxk_ml_xstats_uninit(struct cnxk_ml_dev *cnxk_mldev)
 {
@@ -249,6 +306,9 @@  cnxk_ml_dev_xstat_get(struct cnxk_ml_dev *cnxk_mldev, uint16_t obj_idx __rte_unu
 			count += layer->glow.burst_xstats[qp_id].dequeued_count -                  \
 				 layer->glow.burst_xstats[qp_id].str##_reset_count;                \
 		}                                                                                  \
+		value += layer->glow.sync_xstats->str##_latency_tot;                               \
+		count += layer->glow.sync_xstats->dequeued_count -                                 \
+			 layer->glow.sync_xstats->str##_reset_count;                               \
 		if (count != 0)                                                                    \
 			value = value / count;                                                     \
 	} while (0)
@@ -261,6 +321,9 @@  cnxk_ml_dev_xstat_get(struct cnxk_ml_dev *cnxk_mldev, uint16_t obj_idx __rte_unu
 			count += layer->glow.burst_xstats[qp_id].dequeued_count -                  \
 				 layer->glow.burst_xstats[qp_id].str##_reset_count;                \
 		}                                                                                  \
+		value = PLT_MIN(value, layer->glow.sync_xstats->str##_latency_min);                \
+		count += layer->glow.sync_xstats->dequeued_count -                                 \
+			 layer->glow.sync_xstats->str##_reset_count;                               \
 		if (count == 0)                                                                    \
 			value = 0;                                                                 \
 	} while (0)
@@ -273,10 +336,53 @@  cnxk_ml_dev_xstat_get(struct cnxk_ml_dev *cnxk_mldev, uint16_t obj_idx __rte_unu
 			count += layer->glow.burst_xstats[qp_id].dequeued_count -                  \
 				 layer->glow.burst_xstats[qp_id].str##_reset_count;                \
 		}                                                                                  \
+		value = PLT_MAX(value, layer->glow.sync_xstats->str##_latency_max);                \
+		count += layer->glow.sync_xstats->dequeued_count -                                 \
+			 layer->glow.sync_xstats->str##_reset_count;                               \
 		if (count == 0)                                                                    \
 			value = 0;                                                                 \
 	} while (0)
 
+#ifdef RTE_MLDEV_CNXK_ENABLE_MVTVM
+#define ML_AVG_FOREACH_QP_MVTVM(cnxk_mldev, model, qp_id, value, count)                            \
+	do {                                                                                       \
+		value = 0;                                                                         \
+		for (qp_id = 0; qp_id < cnxk_mldev->mldev->data->nb_queue_pairs; qp_id++) {        \
+			value += model->mvtvm.burst_xstats[qp_id].tvm_rt_latency_tot;              \
+			count += model->mvtvm.burst_xstats[qp_id].dequeued_count -                 \
+				 model->mvtvm.burst_xstats[qp_id].tvm_rt_reset_count;              \
+		}                                                                                  \
+		if (count != 0)                                                                    \
+			value = value / count;                                                     \
+	} while (0)
+
+#define ML_MIN_FOREACH_QP_MVTVM(cnxk_mldev, model, qp_id, value, count)                            \
+	do {                                                                                       \
+		value = UINT64_MAX;                                                                \
+		for (qp_id = 0; qp_id < cnxk_mldev->mldev->data->nb_queue_pairs; qp_id++) {        \
+			value = PLT_MIN(value,                                                     \
+					model->mvtvm.burst_xstats[qp_id].tvm_rt_latency_min);      \
+			count += model->mvtvm.burst_xstats[qp_id].dequeued_count -                 \
+				 model->mvtvm.burst_xstats[qp_id].tvm_rt_reset_count;              \
+		}                                                                                  \
+		if (count == 0)                                                                    \
+			value = 0;                                                                 \
+	} while (0)
+
+#define ML_MAX_FOREACH_QP_MVTVM(cnxk_mldev, model, qp_id, value, count)                            \
+	do {                                                                                       \
+		value = 0;                                                                         \
+		for (qp_id = 0; qp_id < cnxk_mldev->mldev->data->nb_queue_pairs; qp_id++) {        \
+			value = PLT_MAX(value,                                                     \
+					model->mvtvm.burst_xstats[qp_id].tvm_rt_latency_max);      \
+			count += model->mvtvm.burst_xstats[qp_id].dequeued_count -                 \
+				 model->mvtvm.burst_xstats[qp_id].tvm_rt_reset_count;              \
+		}                                                                                  \
+		if (count == 0)                                                                    \
+			value = 0;                                                                 \
+	} while (0)
+#endif
+
 static uint64_t
 cnxk_ml_model_xstat_get(struct cnxk_ml_dev *cnxk_mldev, uint16_t obj_idx, int32_t layer_id,
 			enum cnxk_ml_xstats_type type)
@@ -317,6 +423,17 @@  cnxk_ml_model_xstat_get(struct cnxk_ml_dev *cnxk_mldev, uint16_t obj_idx, int32_
 	case max_fw_latency:
 		ML_MAX_FOREACH_QP(cnxk_mldev, layer, qp_id, fw, value, count);
 		break;
+#ifdef RTE_MLDEV_CNXK_ENABLE_MVTVM
+	case avg_rt_latency:
+		ML_AVG_FOREACH_QP_MVTVM(cnxk_mldev, model, qp_id, value, count);
+		break;
+	case min_rt_latency:
+		ML_MIN_FOREACH_QP_MVTVM(cnxk_mldev, model, qp_id, value, count);
+		break;
+	case max_rt_latency:
+		ML_MAX_FOREACH_QP_MVTVM(cnxk_mldev, model, qp_id, value, count);
+		break;
+#endif
 	default:
 		value = 0;
 	}
@@ -907,8 +1024,9 @@  cnxk_ml_dev_xstats_names_get(struct rte_ml_dev *dev, enum rte_ml_dev_xstats_mode
 {
 	struct cnxk_ml_xstats_entry *xs;
 	struct cnxk_ml_dev *cnxk_mldev;
+	struct cnxk_ml_model *model;
 	uint32_t xstats_mode_count;
-	uint16_t layer_id = 0;
+	uint16_t layer_id;
 	uint32_t idx = 0;
 	uint32_t i;
 
@@ -925,7 +1043,17 @@  cnxk_ml_dev_xstats_names_get(struct rte_ml_dev *dev, enum rte_ml_dev_xstats_mode
 	case RTE_ML_DEV_XSTATS_MODEL:
 		if (model_id >= ML_CNXK_MAX_MODELS)
 			break;
-		xstats_mode_count = cnxk_mldev->xstats.count_per_layer[model_id][layer_id];
+
+		model = cnxk_mldev->mldev->data->models[model_id];
+		for (layer_id = 0; layer_id < model->nb_layers; layer_id++) {
+			if (model->layer[layer_id].type == ML_CNXK_LAYER_TYPE_MRVL)
+				xstats_mode_count +=
+					cnxk_mldev->xstats.count_per_layer[model_id][layer_id];
+		}
+
+		if ((model->type == ML_CNXK_MODEL_TYPE_TVM) &&
+		    (model->subtype != ML_CNXK_MODEL_SUBTYPE_TVM_MRVL))
+			xstats_mode_count += RTE_DIM(model_xstats);
 		break;
 	default:
 		return -EINVAL;
@@ -939,9 +1067,20 @@  cnxk_ml_dev_xstats_names_get(struct rte_ml_dev *dev, enum rte_ml_dev_xstats_mode
 		if (xs->mode != mode)
 			continue;
 
-		if (mode == RTE_ML_DEV_XSTATS_MODEL &&
-		    (model_id != xs->obj_idx || layer_id != xs->layer_id))
-			continue;
+		if (mode == RTE_ML_DEV_XSTATS_MODEL) {
+			if (model_id != xs->obj_idx)
+				continue;
+
+			model = cnxk_mldev->mldev->data->models[model_id];
+			if ((model->type == ML_CNXK_MODEL_TYPE_GLOW ||
+			     model->subtype == ML_CNXK_MODEL_SUBTYPE_TVM_MRVL) &&
+			    xs->group == CNXK_ML_XSTATS_GROUP_MODEL)
+				continue;
+
+			if (model->type == ML_CNXK_MODEL_TYPE_TVM &&
+			    model->layer[xs->layer_id].type == ML_CNXK_LAYER_TYPE_LLVM)
+				continue;
+		}
 
 		strncpy(xstats_map[idx].name, xs->map.name, RTE_ML_STR_MAX);
 		xstats_map[idx].id = xs->map.id;
@@ -1002,9 +1141,10 @@  cnxk_ml_dev_xstats_get(struct rte_ml_dev *dev, enum rte_ml_dev_xstats_mode mode,
 {
 	struct cnxk_ml_xstats_entry *xs;
 	struct cnxk_ml_dev *cnxk_mldev;
+	struct cnxk_ml_model *model;
 	uint32_t xstats_mode_count;
-	uint16_t layer_id = 0;
 	cnxk_ml_xstats_fn fn;
+	uint16_t layer_id;
 	uint64_t val;
 	uint32_t idx;
 	uint32_t i;
@@ -1022,7 +1162,14 @@  cnxk_ml_dev_xstats_get(struct rte_ml_dev *dev, enum rte_ml_dev_xstats_mode mode,
 	case RTE_ML_DEV_XSTATS_MODEL:
 		if (model_id >= ML_CNXK_MAX_MODELS)
 			return -EINVAL;
-		xstats_mode_count = cnxk_mldev->xstats.count_per_layer[model_id][layer_id];
+
+		model = cnxk_mldev->mldev->data->models[model_id];
+		for (layer_id = 0; layer_id < model->nb_layers; layer_id++)
+			xstats_mode_count += cnxk_mldev->xstats.count_per_layer[model_id][layer_id];
+
+		if ((model->type == ML_CNXK_MODEL_TYPE_TVM) &&
+		    (model->subtype != ML_CNXK_MODEL_SUBTYPE_TVM_MRVL))
+			xstats_mode_count += RTE_DIM(model_xstats);
 		break;
 	default:
 		return -EINVAL;
@@ -1034,11 +1181,18 @@  cnxk_ml_dev_xstats_get(struct rte_ml_dev *dev, enum rte_ml_dev_xstats_mode mode,
 		if (stat_ids[i] > cnxk_mldev->xstats.count || xs->mode != mode)
 			continue;
 
-		if (mode == RTE_ML_DEV_XSTATS_MODEL &&
-		    (model_id != xs->obj_idx || layer_id != xs->layer_id)) {
-			plt_err("Invalid stats_id[%d] = %d for model_id = %d\n", i, stat_ids[i],
-				model_id);
-			return -EINVAL;
+		if (mode == RTE_ML_DEV_XSTATS_MODEL) {
+			if (model_id != xs->obj_idx)
+				continue;
+
+			model = cnxk_mldev->mldev->data->models[xs->obj_idx];
+			if ((model->type == ML_CNXK_MODEL_TYPE_GLOW ||
+			     model->subtype == ML_CNXK_MODEL_SUBTYPE_TVM_MRVL) &&
+			    xs->group == CNXK_ML_XSTATS_GROUP_MODEL)
+				continue;
+
+			if (xs->layer_id == -1 && xs->group == CNXK_ML_XSTATS_GROUP_LAYER)
+				continue;
 		}
 
 		switch (xs->fn_id) {
diff --git a/drivers/ml/cnxk/cnxk_ml_ops.h b/drivers/ml/cnxk/cnxk_ml_ops.h
index d0c126f34b7..2575f4c6e10 100644
--- a/drivers/ml/cnxk/cnxk_ml_ops.h
+++ b/drivers/ml/cnxk/cnxk_ml_ops.h
@@ -64,6 +64,7 @@  extern struct rte_ml_dev_ops cnxk_ml_ops;
 
 int cnxk_ml_model_unload(struct rte_ml_dev *dev, uint16_t model_id);
 int cnxk_ml_model_stop(struct rte_ml_dev *dev, uint16_t model_id);
+void cnxk_ml_xstats_model_name_update(struct cnxk_ml_dev *cnxk_mldev, uint16_t model_id);
 
 __rte_hot uint16_t cnxk_ml_enqueue_burst(struct rte_ml_dev *dev, uint16_t qp_id,
 					 struct rte_ml_op **ops, uint16_t nb_ops);
diff --git a/drivers/ml/cnxk/cnxk_ml_xstats.h b/drivers/ml/cnxk/cnxk_ml_xstats.h
index 5e02bb876ca..a2c9adfe4ab 100644
--- a/drivers/ml/cnxk/cnxk_ml_xstats.h
+++ b/drivers/ml/cnxk/cnxk_ml_xstats.h
@@ -142,4 +142,11 @@  static const struct cnxk_ml_xstat_info layer_xstats[] = {
 	{"Min-FW-Latency", min_fw_latency, 1}, {"Max-FW-Latency", max_fw_latency, 1},
 };
 
+/* Model xstats */
+static const struct cnxk_ml_xstat_info model_xstats[] = {
+	{"Avg-RT-Latency", avg_rt_latency, 1},
+	{"Min-RT-Latency", min_rt_latency, 1},
+	{"Max-RT-Latency", max_rt_latency, 1},
+};
+
 #endif /* _CNXK_ML_XSTATS_H_ */
diff --git a/drivers/ml/cnxk/mvtvm_ml_model.h b/drivers/ml/cnxk/mvtvm_ml_model.h
index fa7735cfaa0..d71df36f5a5 100644
--- a/drivers/ml/cnxk/mvtvm_ml_model.h
+++ b/drivers/ml/cnxk/mvtvm_ml_model.h
@@ -33,6 +33,27 @@  struct mvtvm_ml_model_object {
 	int64_t size;
 };
 
+/* Model fast-path stats */
+struct mvtvm_ml_model_xstats {
+	/* Total TVM runtime latency, sum of all inferences */
+	uint64_t tvm_rt_latency_tot;
+
+	/* TVM runtime latency */
+	uint64_t tvm_rt_latency;
+
+	/* Minimum TVM runtime latency */
+	uint64_t tvm_rt_latency_min;
+
+	/* Maximum TVM runtime latency */
+	uint64_t tvm_rt_latency_max;
+
+	/* Total jobs dequeued */
+	uint64_t dequeued_count;
+
+	/* Hardware stats reset index */
+	uint64_t tvm_rt_reset_count;
+};
+
 struct mvtvm_ml_model_data {
 	/* Model metadata */
 	struct tvmdp_model_metadata metadata;
@@ -45,6 +66,9 @@  struct mvtvm_ml_model_data {
 
 	/* Model I/O info */
 	struct cnxk_ml_io_info info;
+
+	/* Stats for burst ops */
+	struct mvtvm_ml_model_xstats *burst_xstats;
 };
 
 int mvtvm_ml_model_blob_parse(struct rte_ml_model_params *params,
diff --git a/drivers/ml/cnxk/mvtvm_ml_ops.c b/drivers/ml/cnxk/mvtvm_ml_ops.c
index 3fae25f6d2d..c251579668c 100644
--- a/drivers/ml/cnxk/mvtvm_ml_ops.c
+++ b/drivers/ml/cnxk/mvtvm_ml_ops.c
@@ -16,6 +16,7 @@ 
 
 #include "cnxk_ml_dev.h"
 #include "cnxk_ml_model.h"
+#include "cnxk_ml_ops.h"
 
 /* ML model macros */
 #define MVTVM_ML_MODEL_MEMZONE_NAME "ml_mvtvm_model_mz"
@@ -59,6 +60,7 @@  mvtvm_ml_model_load(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_model_params *
 	char str[RTE_MEMZONE_NAMESIZE];
 	const struct plt_memzone *mz;
 	size_t model_object_size = 0;
+	size_t model_xstats_size = 0;
 	uint16_t nb_mrvl_layers;
 	uint16_t nb_llvm_layers;
 	uint8_t layer_id = 0;
@@ -74,7 +76,11 @@  mvtvm_ml_model_load(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_model_params *
 	model_object_size = RTE_ALIGN_CEIL(object[0].size, RTE_CACHE_LINE_MIN_SIZE) +
 			    RTE_ALIGN_CEIL(object[1].size, RTE_CACHE_LINE_MIN_SIZE) +
 			    RTE_ALIGN_CEIL(object[2].size, RTE_CACHE_LINE_MIN_SIZE);
-	mz_size += model_object_size;
+
+	model_xstats_size =
+		cnxk_mldev->mldev->data->nb_queue_pairs * sizeof(struct mvtvm_ml_model_xstats);
+
+	mz_size += model_object_size + model_xstats_size;
 
 	/* Allocate memzone for model object */
 	snprintf(str, RTE_MEMZONE_NAMESIZE, "%s_%u", MVTVM_ML_MODEL_MEMZONE_NAME, model->model_id);
@@ -187,6 +193,22 @@  mvtvm_ml_model_load(struct cnxk_ml_dev *cnxk_mldev, struct rte_ml_model_params *
 	/* Set model info */
 	mvtvm_ml_model_info_set(cnxk_mldev, model);
 
+	/* Update model xstats name */
+	cnxk_ml_xstats_model_name_update(cnxk_mldev, model->model_id);
+
+	model->mvtvm.burst_xstats = RTE_PTR_ADD(
+		model->mvtvm.object.params.addr,
+		RTE_ALIGN_CEIL(model->mvtvm.object.params.size, RTE_CACHE_LINE_MIN_SIZE));
+
+	for (int qp_id = 0; qp_id < cnxk_mldev->mldev->data->nb_queue_pairs; qp_id++) {
+		model->mvtvm.burst_xstats[qp_id].tvm_rt_latency_tot = 0;
+		model->mvtvm.burst_xstats[qp_id].tvm_rt_latency = 0;
+		model->mvtvm.burst_xstats[qp_id].tvm_rt_latency_min = UINT64_MAX;
+		model->mvtvm.burst_xstats[qp_id].tvm_rt_latency_max = 0;
+		model->mvtvm.burst_xstats[qp_id].tvm_rt_reset_count = 0;
+		model->mvtvm.burst_xstats[qp_id].dequeued_count = 0;
+	}
+
 	return 0;
 
 error: