net/mlx5: add non-template flow metadata split

Message ID 20240613125146.547992-1-bingz@nvidia.com (mailing list archive)
State Superseded
Delegated to: Raslan Darawsheh
Headers
Series net/mlx5: add non-template flow metadata split |

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/Intel-compilation fail Compilation issues
ci/intel-Testing success Testing PASS
ci/intel-Functional success Functional PASS
ci/iol-abi-testing warning Testing issues
ci/iol-sample-apps-testing warning Testing issues
ci/iol-compile-amd64-testing fail Testing issues
ci/iol-unit-arm64-testing fail Testing issues
ci/iol-unit-amd64-testing fail Testing issues
ci/iol-compile-arm64-testing fail Testing issues

Commit Message

Bing Zhao June 13, 2024, 12:51 p.m. UTC
  The method will not create any new flow rule implicitly during the split
stage, but will only generate the needed items, actions and attributes
based on the detection.

All the rules will still be created in the flow_hw_list_create().

In the meantime, once the mark action is specified in the FDB rule,
a new rule will be created in the NIC Rx to:
  1. match the mark value on REG_C_x coming from the FDB and set it into
     the Rx flow tag field.
  2. copy the metadata in REG_C_x' into REG_B.

If there is no mark, the default rule, which only copies the metadata,
will be hit if there is a Queue or RSS action in the NIC Rx rule.

Regarding the NIC Tx, only the metadata is relevant and it will be
copied in NIC Tx from REG_A into some REG_C_x. The current HWS
implementation already supports this in the default copy rule or the
default SQ miss rule in the NIC Tx root table.

Signed-off-by: Bing Zhao <bingz@nvidia.com>
Acked-by: Dariusz Sosnowski <dsosnowski@nvidia.com>
---
Depends-on: patch-141122 ("net/mlx5: fix mlx5 device start failure")
---
 drivers/net/mlx5/linux/mlx5_os.c  |  19 +-
 drivers/net/mlx5/meson.build      |   1 +
 drivers/net/mlx5/mlx5.c           |   5 +-
 drivers/net/mlx5/mlx5_flow.c      |  16 +
 drivers/net/mlx5/mlx5_flow.h      |  51 ++-
 drivers/net/mlx5/mlx5_flow_hw.c   | 121 ++++++-
 drivers/net/mlx5/mlx5_nta_split.c | 568 ++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_trigger.c   |   7 +-
 8 files changed, 758 insertions(+), 30 deletions(-)
 create mode 100644 drivers/net/mlx5/mlx5_nta_split.c
  

Patch

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 50f4810bff..b71f27203c 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -503,8 +503,25 @@  mlx5_alloc_shared_dr(struct rte_eth_dev *eth_dev)
 					       mlx5_flex_parser_clone_free_cb);
 	if (!sh->flex_parsers_dv)
 		goto error;
-	if (priv->sh->config.dv_flow_en == 2)
+	if (priv->sh->config.dv_flow_en == 2) {
+		if (sh->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
+		    sh->dv_regc0_mask) {
+			/* Reuse DV callback functions. */
+			sh->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME,
+							    MLX5_FLOW_MREG_HTABLE_SZ,
+							    false, true, eth_dev,
+							    flow_nta_mreg_create_cb,
+							    flow_dv_mreg_match_cb,
+							    flow_nta_mreg_remove_cb,
+							    flow_dv_mreg_clone_cb,
+							    flow_dv_mreg_clone_free_cb);
+			if (!sh->mreg_cp_tbl) {
+				err = ENOMEM;
+				goto error;
+			}
+		}
 		return 0;
+	}
 	/* Init port id action list. */
 	snprintf(s, sizeof(s), "%s_port_id_action_list", sh->ibdev_name);
 	sh->port_id_action_list = mlx5_list_create(s, sh, true,
diff --git a/drivers/net/mlx5/meson.build b/drivers/net/mlx5/meson.build
index b279ddf47c..eb5eb2cce7 100644
--- a/drivers/net/mlx5/meson.build
+++ b/drivers/net/mlx5/meson.build
@@ -52,6 +52,7 @@  if is_linux
             'mlx5_hws_cnt.c',
             'mlx5_flow_quota.c',
             'mlx5_flow_verbs.c',
+            'mlx5_nta_split.c',
     )
 endif
 
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index e482f7f0e5..654146badf 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -2335,8 +2335,7 @@  mlx5_dev_close(struct rte_eth_dev *dev)
 	 * If default mreg copy action is removed at the stop stage,
 	 * the search will return none and nothing will be done anymore.
 	 */
-	if (priv->sh->config.dv_flow_en != 2)
-		mlx5_flow_stop_default(dev);
+	mlx5_flow_stop_default(dev);
 	mlx5_traffic_disable(dev);
 	/*
 	 * If all the flows are already flushed in the device stop stage,
@@ -3297,7 +3296,7 @@  mlx5_set_metadata_mask(struct rte_eth_dev *dev)
 		break;
 	case MLX5_XMETA_MODE_META32_HWS:
 		meta = UINT32_MAX;
-		mark = MLX5_FLOW_MARK_MASK;
+		mark = (reg_c0 >> rte_bsf32(reg_c0)) & MLX5_FLOW_MARK_MASK;
 		break;
 	default:
 		meta = 0;
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 9ccbbecc50..eb730354e7 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -8125,6 +8125,17 @@  mlx5_flow_list_flush(struct rte_eth_dev *dev, enum mlx5_flow_type type,
 void
 mlx5_flow_stop_default(struct rte_eth_dev *dev)
 {
+#ifdef HAVE_MLX5_HWS_SUPPORT
+	struct mlx5_priv *priv = dev->data->dev_private;
+
+	if (priv->sh->config.dv_flow_en == 2) {
+		mlx5_flow_nta_del_default_copy_action(dev);
+		if (!rte_atomic_load_explicit(&priv->hws_mark_refcnt,
+					      rte_memory_order_relaxed))
+			flow_hw_rxq_flag_set(dev, false);
+		return;
+	}
+#endif
 	flow_mreg_del_default_copy_action(dev);
 	flow_rxq_flags_clear(dev);
 }
@@ -8169,7 +8180,12 @@  int
 mlx5_flow_start_default(struct rte_eth_dev *dev)
 {
 	struct rte_flow_error error;
+#ifdef HAVE_MLX5_HWS_SUPPORT
+	struct mlx5_priv *priv = dev->data->dev_private;
 
+	if (priv->sh->config.dv_flow_en == 2)
+		return mlx5_flow_nta_add_default_copy_action(dev, &error);
+#endif
 	/* Make sure default copy action (reg_c[0] -> reg_b) is created. */
 	return flow_mreg_add_default_copy_action(dev, &error);
 }
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index 92e2ecedb3..22df9a18bb 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -727,8 +727,11 @@  struct mlx5_flow_mreg_copy_resource {
 	LIST_ENTRY(mlx5_flow_mreg_copy_resource) next;
 	/* List entry for device flows. */
 	uint32_t idx;
-	uint32_t rix_flow; /* Built flow for copy. */
 	uint32_t mark_id;
+	union {
+		uint32_t rix_flow; /* Built flow for copy. */
+		uintptr_t hw_flow;
+	};
 };
 
 /* Table tunnel parameter. */
@@ -1334,6 +1337,7 @@  struct rte_flow_nt2hws {
 	SLIST_ENTRY(rte_flow_hw) next;
 	/** Encap/decap index. */
 	uint32_t rix_encap_decap;
+	uint32_t rix_mreg_copy;
 	uint8_t chaned_flow;
 };
 
@@ -1972,6 +1976,19 @@  struct mlx5_flow_split_info {
 	uint64_t prefix_layers; /**< Prefix subflow layers. */
 };
 
+struct mlx5_flow_hw_partial_resource {
+	const struct rte_flow_attr *attr;
+	const struct rte_flow_item *items;
+	const struct rte_flow_action *actions;
+};
+
+struct mlx5_flow_hw_split_resource {
+	struct mlx5_flow_hw_partial_resource prefix;
+	struct mlx5_flow_hw_partial_resource suffix;
+	void *buf_start; /* start address of continuous buffer. */
+	uint32_t flow_idx; /* This memory pool index to the flow. */
+};
+
 struct mlx5_hl_data {
 	uint8_t dw_offset;
 	uint32_t dw_mask;
@@ -3619,5 +3636,37 @@  extern const struct rte_flow_action_raw_decap empty_decap;
 extern const struct rte_flow_item_ipv6 nic_ipv6_mask;
 extern const struct rte_flow_item_tcp nic_tcp_mask;
 
+/* mlx5_nta_split.c */
+int
+mlx5_flow_nta_split_metadata(struct rte_eth_dev *dev,
+			     const struct rte_flow_attr *attr,
+			     const struct rte_flow_action actions[],
+			     const struct rte_flow_action *qrss,
+			     uint64_t action_flags,
+			     int actions_n,
+			     bool external,
+			     struct mlx5_flow_hw_split_resource *res,
+			     struct rte_flow_error *error);
+void
+mlx5_flow_nta_split_resource_free(struct rte_eth_dev *dev,
+				  struct mlx5_flow_hw_split_resource *res);
+struct mlx5_list_entry *
+flow_nta_mreg_create_cb(void *tool_ctx, void *cb_ctx);
+void
+flow_nta_mreg_remove_cb(void *tool_ctx, struct mlx5_list_entry *entry);
+void
+mlx5_flow_nta_del_copy_action(struct rte_eth_dev *dev, uint32_t idx);
+void
+mlx5_flow_nta_del_default_copy_action(struct rte_eth_dev *dev);
+int
+mlx5_flow_nta_add_default_copy_action(struct rte_eth_dev *dev,
+				      struct rte_flow_error *error);
+int
+mlx5_flow_nta_update_copy_table(struct rte_eth_dev *dev,
+				uint32_t *idx,
+				const struct rte_flow_action *mark,
+				uint64_t action_flags,
+				struct rte_flow_error *error);
+
 #endif
 #endif /* RTE_PMD_MLX5_FLOW_H_ */
diff --git a/drivers/net/mlx5/mlx5_flow_hw.c b/drivers/net/mlx5/mlx5_flow_hw.c
index eb89dcf454..d741572a9a 100644
--- a/drivers/net/mlx5/mlx5_flow_hw.c
+++ b/drivers/net/mlx5/mlx5_flow_hw.c
@@ -590,11 +590,20 @@  flow_hw_matching_item_flags_get(const struct rte_flow_item items[])
 
 static uint64_t
 flow_hw_action_flags_get(const struct rte_flow_action actions[],
+			 const struct rte_flow_action **qrss,
+			 const struct rte_flow_action **mark,
+			 int *encap_idx,
+			 int *act_cnt,
 			 struct rte_flow_error *error)
 {
 	uint64_t action_flags = 0;
 	const struct rte_flow_action *action;
+	const struct rte_flow_action_raw_encap *raw_encap;
+	int raw_decap_idx = -1;
+	int action_idx;
 
+	*encap_idx = -1;
+	action_idx = 0;
 	for (action = actions; action->type != RTE_FLOW_ACTION_TYPE_END; action++) {
 		int type = (int)action->type;
 		switch (type) {
@@ -617,8 +626,12 @@  flow_hw_action_flags_get(const struct rte_flow_action actions[],
 		case RTE_FLOW_ACTION_TYPE_DROP:
 			action_flags |= MLX5_FLOW_ACTION_DROP;
 			break;
+		case RTE_FLOW_ACTION_TYPE_FLAG:
+			action_flags |= MLX5_FLOW_ACTION_FLAG;
+			break;
 		case RTE_FLOW_ACTION_TYPE_MARK:
 			action_flags |= MLX5_FLOW_ACTION_MARK;
+			*mark = action;
 			break;
 		case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN:
 			action_flags |= MLX5_FLOW_ACTION_OF_PUSH_VLAN;
@@ -631,17 +644,24 @@  flow_hw_action_flags_get(const struct rte_flow_action actions[],
 			break;
 		case RTE_FLOW_ACTION_TYPE_QUEUE:
 			action_flags |= MLX5_FLOW_ACTION_QUEUE;
+			*qrss = action;
 			break;
 		case RTE_FLOW_ACTION_TYPE_RSS:
 rss:
 			action_flags |= MLX5_FLOW_ACTION_RSS;
+			*qrss = action;
 			break;
 		case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP:
 		case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP:
 			action_flags |= MLX5_FLOW_ACTION_ENCAP;
+			*encap_idx = action_idx;
 			break;
 		case RTE_FLOW_ACTION_TYPE_RAW_ENCAP:
 			action_flags |= MLX5_FLOW_ACTION_ENCAP;
+			raw_encap = action->conf;
+			if (raw_encap->size > MLX5_ENCAPSULATION_DECISION_SIZE)
+				*encap_idx = raw_decap_idx != -1 ?
+					     raw_decap_idx : action_idx;
 			break;
 		case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP:
 		case RTE_FLOW_ACTION_TYPE_NVGRE_DECAP:
@@ -649,6 +669,7 @@  flow_hw_action_flags_get(const struct rte_flow_action actions[],
 			break;
 		case RTE_FLOW_ACTION_TYPE_RAW_DECAP:
 			action_flags |= MLX5_FLOW_ACTION_DECAP;
+			raw_decap_idx = action_idx;
 			break;
 		case RTE_FLOW_ACTION_TYPE_SEND_TO_KERNEL:
 			action_flags |= MLX5_FLOW_ACTION_SEND_TO_KERNEL;
@@ -685,7 +706,12 @@  flow_hw_action_flags_get(const struct rte_flow_action actions[],
 		default:
 			goto error;
 		}
+		action_idx++;
 	}
+	if (*encap_idx == -1)
+		*encap_idx = action_idx;
+	action_idx++; /* The END action. */
+	*act_cnt = action_idx;
 	return action_flags;
 error:
 	rte_flow_error_set(error, EINVAL, RTE_FLOW_ERROR_TYPE_ACTION,
@@ -13380,10 +13406,12 @@  static int flow_hw_register_matcher(struct rte_eth_dev *dev,
 		.message = NULL,
 	};
 	struct rte_flow_attr flow_attr = *attr;
+	uint32_t specialize = 0; /* No unified FDB. */
 	struct mlx5_flow_cb_ctx ctx = {
 		.dev = dev,
 		.error = &sub_error,
 		.data = &flow_attr,
+		.data2 = &specialize,
 	};
 	void *items_ptr = &items;
 	struct mlx5_flow_cb_ctx matcher_ctx = {
@@ -13548,7 +13576,7 @@  flow_hw_create_flow(struct rte_eth_dev *dev, enum mlx5_flow_type type,
 		    struct rte_flow_hw **flow, struct rte_flow_error *error)
 {
 	int ret;
-	struct mlx5_hw_actions hw_act;
+	struct mlx5_hw_actions hw_act = { { NULL } };
 	struct mlx5_flow_hw_action_params ap;
 	struct mlx5_flow_dv_matcher matcher = {
 		.mask = {
@@ -13566,7 +13594,6 @@  flow_hw_create_flow(struct rte_eth_dev *dev, enum mlx5_flow_type type,
 		.tbl_type = 0,
 		};
 
-	memset(&hw_act, 0, sizeof(hw_act));
 	if (attr->transfer)
 		tbl_type = MLX5DR_TABLE_TYPE_FDB;
 	else if (attr->egress)
@@ -13637,8 +13664,7 @@  flow_hw_create_flow(struct rte_eth_dev *dev, enum mlx5_flow_type type,
 	if (ret) {
 		/* release after actual error */
 		if ((*flow)->nt2hws && (*flow)->nt2hws->matcher)
-			flow_hw_unregister_matcher(dev,
-						   (*flow)->nt2hws->matcher);
+			flow_hw_unregister_matcher(dev, (*flow)->nt2hws->matcher);
 	}
 	return ret;
 }
@@ -13709,8 +13735,9 @@  flow_hw_list_destroy(struct rte_eth_dev *dev, enum mlx5_flow_type type,
 	struct rte_flow_hw *flow = (struct rte_flow_hw *)flow_addr;
 	struct mlx5_nta_rss_flow_head head = { .slh_first = flow };
 
-	if (flow->nt2hws->chaned_flow)
+	if (!flow || !flow->nt2hws || flow->nt2hws->chaned_flow)
 		return;
+	mlx5_flow_nta_del_copy_action(dev, flow->nt2hws->rix_mreg_copy);
 	while (!SLIST_EMPTY(&head)) {
 		flow = SLIST_FIRST(&head);
 		SLIST_REMOVE_HEAD(&head, nt2hws->next);
@@ -13751,41 +13778,97 @@  static uintptr_t flow_hw_list_create(struct rte_eth_dev *dev,
 				     struct rte_flow_error *error)
 {
 	int ret;
+	int split;
+	int encap_idx;
+	uint32_t cpy_idx = 0;
+	int actions_n = 0;
 	struct rte_flow_hw *flow = NULL;
+	struct rte_flow_hw *prfx_flow = NULL;
+	const struct rte_flow_action *qrss = NULL;
+	const struct rte_flow_action *mark = NULL;
 	uint64_t item_flags = flow_hw_matching_item_flags_get(items);
-	uint64_t action_flags = flow_hw_action_flags_get(actions, error);
+	uint64_t action_flags = flow_hw_action_flags_get(actions, &qrss, &mark,
+							 &encap_idx, &actions_n, error);
+	struct mlx5_flow_hw_split_resource resource = {
+		.suffix = {
+			.attr = attr,
+			.items = items,
+			.actions = actions,
+		},
+	};
 
 	/*
 	 * TODO: add a call to flow_hw_validate function once it exist.
 	 * and update mlx5_flow_hw_drv_ops accordingly.
 	 */
 
+	RTE_SET_USED(encap_idx);
+	split = mlx5_flow_nta_split_metadata(dev, attr, actions, qrss, action_flags,
+					     actions_n, external, &resource, error);
+	if (split < 0)
+		return split;
+
+	/* Update the metadata copy table - MLX5_FLOW_MREG_CP_TABLE_GROUP */
+	if (((attr->ingress && attr->group != MLX5_FLOW_MREG_CP_TABLE_GROUP) ||
+	     attr->transfer) && external) {
+		ret = mlx5_flow_nta_update_copy_table(dev, &cpy_idx, mark,
+						      action_flags, error);
+		if (ret)
+			goto free;
+	}
+
 	if (action_flags & MLX5_FLOW_ACTION_RSS) {
 		const struct rte_flow_action_rss
-			*rss_conf = flow_nta_locate_rss(dev, actions, error);
-		flow = flow_nta_handle_rss(dev, attr, items, actions, rss_conf,
-					   item_flags, action_flags, external,
-					   type, error);
-		if (flow)
-			return (uintptr_t)flow;
+			*rss_conf = flow_nta_locate_rss(dev, resource.suffix.actions, error);
+		flow = flow_nta_handle_rss(dev, resource.suffix.attr, resource.suffix.items,
+					   resource.suffix.actions, rss_conf, item_flags,
+					   action_flags, external, type, error);
+		if (flow) {
+			flow->nt2hws->rix_mreg_copy = cpy_idx;
+			cpy_idx = 0;
+			if (!split)
+				return (uintptr_t)flow;
+			goto prefix_flow;
+		}
 		if (error->type != RTE_FLOW_ERROR_TYPE_NONE)
-			return 0;
+			goto free;
 		/* Fall Through to non-expanded RSS flow */
 	}
-	/*TODO: Handle split/expand to num_flows. */
 
 	/* Create single flow. */
-	ret = flow_hw_create_flow(dev, type, attr, items, actions,
-				  item_flags, action_flags,
+	ret = flow_hw_create_flow(dev, type, resource.suffix.attr, resource.suffix.items,
+				  resource.suffix.actions, item_flags, action_flags,
 				  external, &flow, error);
 	if (ret)
 		goto free;
-	if (flow)
-		return (uintptr_t)flow;
-
+	if (flow) {
+		flow->nt2hws->rix_mreg_copy = cpy_idx;
+		cpy_idx = 0;
+		if (!split)
+			return (uintptr_t)flow;
+		/* Fall Through to prefix flow creation. */
+	}
+prefix_flow:
+	ret = flow_hw_create_flow(dev, type, attr, items, resource.prefix.actions,
+				  item_flags, action_flags, external, &prfx_flow, error);
+	if (ret)
+		goto free;
+	if (prfx_flow) {
+		prfx_flow->nt2hws->rix_mreg_copy = flow->nt2hws->rix_mreg_copy;
+		flow->nt2hws->chaned_flow = 1;
+		SLIST_INSERT_AFTER(prfx_flow, flow, nt2hws->next);
+		mlx5_flow_nta_split_resource_free(dev, &resource);
+		return (uintptr_t)prfx_flow;
+	}
 free:
+	if (prfx_flow)
+		flow_hw_list_destroy(dev, type, (uintptr_t)prfx_flow);
 	if (flow)
 		flow_hw_list_destroy(dev, type, (uintptr_t)flow);
+	if (cpy_idx)
+		mlx5_flow_nta_del_copy_action(dev, cpy_idx);
+	if (split > 0)
+		mlx5_flow_nta_split_resource_free(dev, &resource);
 	return 0;
 }
 
diff --git a/drivers/net/mlx5/mlx5_nta_split.c b/drivers/net/mlx5/mlx5_nta_split.c
new file mode 100644
index 0000000000..b26f305bca
--- /dev/null
+++ b/drivers/net/mlx5/mlx5_nta_split.c
@@ -0,0 +1,568 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2024 NVIDIA Corporation & Affiliates
+ */
+
+#include <rte_common.h>
+#include <rte_flow.h>
+
+#include "mlx5_malloc.h"
+#include "mlx5.h"
+#include "mlx5_defs.h"
+#include "mlx5_flow.h"
+#include "mlx5_rx.h"
+
+#ifdef HAVE_MLX5_HWS_SUPPORT
+
+/*
+ * Generate new actions lists for prefix and suffix flows.
+ *
+ * The prefix flow keeps all original actions but replaces the Q/RSS action
+ * with an internal SET_TAG carrying a freshly allocated flow ID, and appends
+ * a JUMP to the mreg copy table (CP_TBL). The suffix flow contains only the
+ * original Q/RSS action followed by END.
+ *
+ * @param[in] dev
+ *   Pointer to rte_eth_dev structure.
+ * @param[out] prefix_act
+ *   Pointer to the buffer receiving the prefix flow rule actions.
+ * @param[out] suffix_act
+ *   Pointer to the buffer receiving the suffix flow rule actions.
+ * @param[in] actions
+ *   Pointer to the original actions list.
+ * @param[in] qrss
+ *   Pointer to the QUEUE / RSS action inside @p actions.
+ * @param[in] actions_n
+ *   Number of the actions in the original list, including END.
+ * @param[out] error
+ *   Pointer to error structure.
+ *
+ * @return
+ *   Positive prefix flow ID on success, zero on failure (error is filled).
+ */
+static uint32_t
+mlx5_flow_nta_split_qrss_actions_prep(struct rte_eth_dev *dev,
+				      struct rte_flow_action *prefix_act,
+				      struct rte_flow_action *suffix_act,
+				      const struct rte_flow_action *actions,
+				      const struct rte_flow_action *qrss,
+				      int actions_n,
+				      struct rte_flow_error *error)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_rte_flow_action_set_tag *set_tag;
+	struct rte_flow_action_jump *jump;
+	const int qrss_idx = qrss - actions;
+	uint32_t flow_id = 0;
+	int ret = 0;
+
+	/* Allocate a new subflow ID; the suffix flow will match on it later. */
+	mlx5_ipool_malloc(priv->sh->ipool[MLX5_IPOOL_RSS_EXPANTION_FLOW_ID], &flow_id);
+	if (!flow_id) {
+		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+				   "can't allocate id for split Q/RSS subflow");
+		return 0;
+	}
+	/*
+	 * Given actions will be split
+	 * - Replace QUEUE/RSS action with SET_TAG to set flow ID.
+	 * - Add jump to mreg CP_TBL.
+	 * As a result, there will be one more action.
+	 */
+	memcpy(prefix_act, actions, sizeof(struct rte_flow_action) * actions_n);
+	/* Count MLX5_RTE_FLOW_ACTION_TYPE_TAG. */
+	actions_n++;
+	/* The SET_TAG config lives right after the enlarged actions array. */
+	set_tag = (void *)(prefix_act + actions_n);
+	/* Reuse ASO reg, should always succeed. Consider to use REG_C_6. */
+	ret = flow_hw_get_reg_id_by_domain(dev, RTE_FLOW_ITEM_TYPE_METER_COLOR,
+					   MLX5DR_TABLE_TYPE_NIC_RX, 0);
+	MLX5_ASSERT(ret != (int)REG_NON);
+	set_tag->id = (enum modify_reg)ret;
+	/* Internal SET_TAG action to set flow ID. */
+	set_tag->data = flow_id;
+	/* Construct new actions array and replace QUEUE/RSS action. */
+	prefix_act[qrss_idx] = (struct rte_flow_action) {
+		.type = (enum rte_flow_action_type)MLX5_RTE_FLOW_ACTION_TYPE_TAG,
+		.conf = set_tag,
+	};
+	/* JUMP action to jump to mreg copy table (CP_TBL). */
+	jump = (void *)(set_tag + 1);
+	*jump = (struct rte_flow_action_jump) {
+		.group = MLX5_FLOW_MREG_CP_TABLE_GROUP,
+	};
+	/* JUMP replaces the old END slot; a new END terminates the list. */
+	prefix_act[actions_n - 2] = (struct rte_flow_action) {
+		.type = RTE_FLOW_ACTION_TYPE_JUMP,
+		.conf = jump,
+	};
+	prefix_act[actions_n - 1] = (struct rte_flow_action) {
+		.type = RTE_FLOW_ACTION_TYPE_END,
+	};
+	/* Copy the suffix Q/RSS action, can also be indirect RSS. */
+	suffix_act[0] = (struct rte_flow_action) {
+		.type = qrss->type,
+		.conf = qrss->conf,
+	};
+	suffix_act[1] = (struct rte_flow_action) {
+		.type = RTE_FLOW_ACTION_TYPE_END,
+	};
+	return flow_id;
+}
+
+/*
+ * Generate new attribute and items for the suffix flow.
+ *
+ * @param[in] dev
+ *   Pointer to rte_eth_dev structure.
+ * @param[out] split_attr
+ *   Pointer to the attribute for the suffix flow rule (filled in here).
+ * @param[out] split_items
+ *   Pointer to the items for the suffix flow rule (filled in here).
+ * @param[in] qrss_id
+ *   Prefix flow ID to match.
+ */
+static void
+mlx5_flow_nta_split_qrss_items_prep(struct rte_eth_dev *dev,
+				    struct rte_flow_attr *split_attr,
+				    struct rte_flow_item *split_items,
+				    uint32_t qrss_id)
+{
+	struct mlx5_rte_flow_item_tag *q_tag_spec;
+
+	/* MLX5_FLOW_MREG_CP_TABLE_GROUP -> MLX5_FLOW_MREG_ACT_TABLE_GROUP(Q/RSS base) */
+	split_attr->ingress = 1;
+	split_attr->group = MLX5_FLOW_MREG_ACT_TABLE_GROUP;
+	/* Only internal tag will be used, together with the item flags for RSS. */
+	q_tag_spec = (void *)((char *)split_items + 2 * sizeof(struct rte_flow_item));
+	split_items[0].type = (enum rte_flow_item_type)MLX5_RTE_FLOW_ITEM_TYPE_TAG;
+	split_items[0].spec = q_tag_spec;
+	split_items[1].type = RTE_FLOW_ITEM_TYPE_END;
+	/* Match the flow ID set by the prefix flow in the same register. */
+	q_tag_spec->data = qrss_id;
+	q_tag_spec->id = (enum modify_reg)
+			 flow_hw_get_reg_id_by_domain(dev, RTE_FLOW_ITEM_TYPE_METER_COLOR,
+						      MLX5DR_TABLE_TYPE_NIC_RX, 0);
+	MLX5_ASSERT(q_tag_spec->id != REG_NON);
+}
+
+/*
+ * Check the split information and split the actions, items and attributes
+ * into prefix and suffix to connect the flows after passing the copy tables.
+ *
+ * @param[in] dev
+ *   Pointer to rte_eth_dev structure.
+ * @param[in] attr
+ *   Pointer to the flow attributes.
+ * @param[in] actions
+ *   Pointer to the original actions list.
+ * @param[in] qrss
+ *   Pointer to the action of QUEUE / RSS, or NULL if there is none.
+ * @param[in] action_flags
+ *   Holds the actions detected.
+ * @param[in] actions_n
+ *   Number of original actions, including END.
+ * @param[in] external
+ *   This flow rule is created by request external to PMD.
+ * @param[out] res
+ *   Pointer to the resource to store the split result.
+ * @param[out] error
+ *   Pointer to error structure.
+ *
+ * @return
+ *   - Positive 1 on succeed.
+ *   - 0 on no split.
+ *   - negative errno value on error.
+ */
+int
+mlx5_flow_nta_split_metadata(struct rte_eth_dev *dev,
+			     const struct rte_flow_attr *attr,
+			     const struct rte_flow_action actions[],
+			     const struct rte_flow_action *qrss,
+			     uint64_t action_flags,
+			     int actions_n,
+			     bool external,
+			     struct mlx5_flow_hw_split_resource *res,
+			     struct rte_flow_error *error)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_sh_config *config = &priv->sh->config;
+	const struct rte_flow_action_queue *queue;
+	const struct rte_flow_action_rss *rss;
+	struct rte_flow_action *prfx_actions;
+	struct rte_flow_action *sfx_actions;
+	struct rte_flow_attr *sfx_attr;
+	struct rte_flow_item *sfx_items;
+	size_t prfx_act_size, sfx_act_size;
+	size_t attr_size, item_size;
+	size_t total_size;
+	uint32_t qrss_id;
+
+	/*
+	 * The metadata copy flow should be created:
+	 *   1. only on NIC Rx domain with Q / RSS
+	 *   2. only when extended metadata mode is enabled
+	 *   3. only on HWS, should always be "config->dv_flow_en == 2", this
+	 *      checking can be skipped
+	 * Note:
+	 *   1. Even if metadata is not enabled in the data-path, it can still
+	 *      be used to match on the Rx side.
+	 *   2. The HWS Tx default copy rule or SQ rules already have the metadata
+	 *      copy on the root table. The user's rule will always be inserted
+	 *      and executed after the root table steering.
+	 */
+	if (config->dv_xmeta_en == MLX5_XMETA_MODE_LEGACY || attr->transfer ||
+	    attr->egress || !external || !qrss)
+		return 0;
+	/* No split needed when the target queue / RSS is a hairpin one. */
+	if (action_flags & MLX5_FLOW_ACTION_QUEUE) {
+		/* Use the located Q/RSS action: it is not necessarily the first one. */
+		queue = (const struct rte_flow_action_queue *)qrss->conf;
+		if (mlx5_rxq_is_hairpin(dev, queue->index))
+			return 0;
+	} else if (action_flags & MLX5_FLOW_ACTION_RSS) {
+		rss = (const struct rte_flow_action_rss *)qrss->conf;
+		if (mlx5_rxq_is_hairpin(dev, rss->queue[0]))
+			return 0;
+	}
+	/*
+	 * The prefix and suffix flows' actions.
+	 * NOTE(review): the prep routine stores a struct
+	 * mlx5_rte_flow_action_set_tag in the slot reserved with
+	 * sizeof(struct rte_flow_action_set_tag) below — confirm the internal
+	 * struct is not larger than the public one. TODO verify.
+	 */
+	prfx_act_size = sizeof(struct rte_flow_action) * (actions_n + 1) +
+			sizeof(struct rte_flow_action_set_tag) +
+			sizeof(struct rte_flow_action_jump);
+	sfx_act_size = sizeof(struct rte_flow_action) * 2;
+	/* The suffix attribute. */
+	attr_size = sizeof(struct rte_flow_attr);
+	/* The suffix items - mlx5_tag + end. */
+	item_size = sizeof(struct rte_flow_item) * 2 +
+		    sizeof(struct mlx5_rte_flow_item_tag);
+	/* One continuous buffer holds everything; freed via res->buf_start. */
+	total_size = prfx_act_size + sfx_act_size + attr_size + item_size;
+	prfx_actions = mlx5_malloc(MLX5_MEM_ZERO, total_size, 0, SOCKET_ID_ANY);
+	if (!prfx_actions)
+		return rte_flow_error_set(error, ENOMEM,
+					  RTE_FLOW_ERROR_TYPE_ACTION,
+					  NULL, "no memory to split "
+					  "metadata flow");
+	sfx_actions = (void *)((char *)prfx_actions + prfx_act_size);
+	qrss_id = mlx5_flow_nta_split_qrss_actions_prep(dev, prfx_actions,
+							sfx_actions, actions,
+							qrss, actions_n, error);
+	if (!qrss_id) {
+		mlx5_free(prfx_actions);
+		return -rte_errno;
+	}
+	sfx_attr = (void *)((char *)sfx_actions + sfx_act_size);
+	sfx_items = (void *)((char *)sfx_attr + attr_size);
+	mlx5_flow_nta_split_qrss_items_prep(dev, sfx_attr, sfx_items, qrss_id);
+	res->prefix.actions = prfx_actions;
+	res->suffix.actions = sfx_actions;
+	res->suffix.items = sfx_items;
+	res->suffix.attr = sfx_attr;
+	res->buf_start = prfx_actions;
+	res->flow_idx = qrss_id;
+	return 1;
+}
+
+/*
+ * Release the split resource: the prefix flow ID and the single continuous
+ * buffer backing the prefix/suffix actions, attribute and items.
+ *
+ * @param[in] dev
+ *   Pointer to rte_eth_dev structure.
+ * @param[in] res
+ *   Pointer to the resource to release.
+ */
+void
+mlx5_flow_nta_split_resource_free(struct rte_eth_dev *dev,
+				  struct mlx5_flow_hw_split_resource *res)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+
+	/* res->flow_idx is the ID allocated in the actions prep routine. */
+	mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_RSS_EXPANTION_FLOW_ID], res->flow_idx);
+	mlx5_free(res->buf_start);
+}
+
+/*
+ * Callback functions for the metadata copy and mark / flag set flow.
+ * The create and remove cannot reuse the DV ones since the flow opaque and
+ * structure are different, and the action used to copy the metadata is also
+ * different.
+ *
+ * Creates the RX_CP_TBL rule: match the mark value in REG_C_0 (or wildcard
+ * for the default entry), optionally re-set MARK, copy REG_C_1 into REG_B,
+ * then jump to the MREG_ACT_TABLE_GROUP.
+ */
+struct mlx5_list_entry *
+flow_nta_mreg_create_cb(void *tool_ctx, void *cb_ctx)
+{
+	struct rte_eth_dev *dev = tool_ctx;
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_flow_cb_ctx *ctx = cb_ctx;
+	struct mlx5_flow_mreg_copy_resource *mcp_res;
+	struct rte_flow_error *error = ctx->error;
+	uint32_t idx = 0;
+	uint32_t mark_id = *(uint32_t *)(ctx->data);
+	struct rte_flow_attr attr = {
+		.group = MLX5_FLOW_MREG_CP_TABLE_GROUP,
+		.ingress = 1,
+	};
+	struct mlx5_rte_flow_item_tag tag_spec = {
+		.id = REG_C_0,
+		.data = mark_id,
+	};
+	struct mlx5_rte_flow_item_tag tag_mask = {
+		.data = priv->sh->dv_mark_mask,
+	};
+	struct rte_flow_action_mark ftag = {
+		.id = mark_id,
+	};
+	struct rte_flow_action_modify_field rx_meta = {
+		.operation = RTE_FLOW_MODIFY_SET,
+		.dst = {
+			.field = (enum rte_flow_field_id)MLX5_RTE_FLOW_FIELD_META_REG,
+			.tag_index = REG_B,
+		},
+		.src = {
+			.field = (enum rte_flow_field_id)MLX5_RTE_FLOW_FIELD_META_REG,
+			.tag_index = REG_C_1,
+		},
+		.width = 32,
+	};
+	struct rte_flow_action_jump jump = {
+		.group = MLX5_FLOW_MREG_ACT_TABLE_GROUP,
+	};
+	struct rte_flow_item items[2];
+	struct rte_flow_action actions[4];
+
+	/* Provide the full width of FLAG specific value. */
+	if (mark_id == (priv->sh->dv_regc0_mask & MLX5_FLOW_MARK_DEFAULT))
+		tag_spec.data = MLX5_FLOW_MARK_DEFAULT;
+	/* Build a new flow. */
+	if (mark_id != MLX5_DEFAULT_COPY_ID) {
+		items[0] = (struct rte_flow_item) {
+			.type = (enum rte_flow_item_type)MLX5_RTE_FLOW_ITEM_TYPE_TAG,
+			.spec = &tag_spec,
+			.mask = &tag_mask,
+		};
+		actions[0] = (struct rte_flow_action) {
+			.type = RTE_FLOW_ACTION_TYPE_MARK,
+			.conf = &ftag,
+		};
+	} else {
+		/* Default rule, wildcard match with lowest priority. */
+		attr.priority = MLX5_FLOW_LOWEST_PRIO_INDICATOR;
+		items[0] = (struct rte_flow_item) {
+			.type = RTE_FLOW_ITEM_TYPE_ETH,
+		};
+		actions[0] = (struct rte_flow_action) {
+			.type = RTE_FLOW_ACTION_TYPE_VOID,
+		};
+	}
+	/* (match REG 'tag') or all. */
+	items[1].type = RTE_FLOW_ITEM_TYPE_END;
+	/* (Mark) or void + copy to Rx meta + jump to the MREG_ACT_TABLE_GROUP. */
+	actions[1].type = RTE_FLOW_ACTION_TYPE_MODIFY_FIELD;
+	actions[1].conf = &rx_meta;
+	actions[2].type = RTE_FLOW_ACTION_TYPE_JUMP;
+	actions[2].conf = &jump;
+	actions[3].type = RTE_FLOW_ACTION_TYPE_END;
+	/* Build a new entry. */
+	mcp_res = mlx5_ipool_zmalloc(priv->sh->ipool[MLX5_IPOOL_MCP], &idx);
+	if (!mcp_res) {
+		rte_errno = ENOMEM;
+		return NULL;
+	}
+	mcp_res->idx = idx;
+	mcp_res->mark_id = mark_id;
+	/*
+	 * The copy flows are not included in any list. These
+	 * ones are referenced from other flows and cannot
+	 * be applied, removed, deleted in arbitrary order
+	 * by list traversing.
+	 */
+	mcp_res->hw_flow = mlx5_flow_list_create(dev, MLX5_FLOW_TYPE_MCP, &attr,
+						 items, actions, false, error);
+	if (!mcp_res->hw_flow) {
+		mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MCP], idx);
+		return NULL;
+	}
+	return &mcp_res->hlist_ent;
+}
+
+/*
+ * Hash list remove callback: destroy the metadata copy flow and return the
+ * mlx5_flow_mreg_copy_resource entry to the MCP index pool.
+ */
+void
+flow_nta_mreg_remove_cb(void *tool_ctx, struct mlx5_list_entry *entry)
+{
+	struct mlx5_flow_mreg_copy_resource *mcp_res =
+			       container_of(entry, typeof(*mcp_res), hlist_ent);
+	struct rte_eth_dev *dev = tool_ctx;
+	struct mlx5_priv *priv = dev->data->dev_private;
+
+	MLX5_ASSERT(mcp_res->hw_flow);
+	mlx5_flow_list_destroy(dev, MLX5_FLOW_TYPE_MCP, mcp_res->hw_flow);
+	mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_MCP], mcp_res->idx);
+}
+
+/*
+ * Add a flow of copying flow metadata registers in RX_CP_TBL.
+ * @see flow_mreg_add_copy_action
+ *
+ * The hash list register either returns an existing entry (with its
+ * reference count incremented) or invokes flow_nta_mreg_create_cb.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param[in] mark_id
+ *   ID of MARK action, zero means default flow for META.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   Associated resource on success, NULL otherwise and rte_errno is set.
+ */
+static struct mlx5_flow_mreg_copy_resource *
+mlx5_flow_nta_add_copy_action(struct rte_eth_dev *dev,
+			      uint32_t mark_id,
+			      struct rte_flow_error *error)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_list_entry *entry;
+	uint32_t specialize = 0; /* No unified FDB. */
+	struct mlx5_flow_cb_ctx ctx = {
+		.dev = dev,
+		.error = error,
+		.data = &mark_id,
+		.data2 = &specialize,
+	};
+
+	/* Check if already registered. */
+	MLX5_ASSERT(priv->sh->mreg_cp_tbl);
+	entry = mlx5_hlist_register(priv->sh->mreg_cp_tbl, mark_id, &ctx);
+	if (!entry)
+		return NULL;
+	return container_of(entry, struct mlx5_flow_mreg_copy_resource, hlist_ent);
+}
+
+/*
+ * Release flow in RX_CP_TBL.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param[in] idx
+ *   Index of the copy flow resource in the MCP pool; zero means the flow
+ *   has no associated copy action and the call is a no-op.
+ */
+void
+mlx5_flow_nta_del_copy_action(struct rte_eth_dev *dev, uint32_t idx)
+{
+	struct mlx5_flow_mreg_copy_resource *mcp_res;
+	struct mlx5_priv *priv = dev->data->dev_private;
+
+	if (!idx)
+		return;
+	mcp_res = mlx5_ipool_get(priv->sh->ipool[MLX5_IPOOL_MCP], idx);
+	/* The table may already be gone on device close. */
+	if (!mcp_res || !priv->sh->mreg_cp_tbl)
+		return;
+	MLX5_ASSERT(mcp_res->hw_flow);
+	/* Unregister drops the refcount; the last one triggers the remove cb. */
+	mlx5_hlist_unregister(priv->sh->mreg_cp_tbl, &mcp_res->hlist_ent);
+}
+
+/*
+ * Remove the default copy action from RX_CP_TBL.
+ * @see flow_mreg_del_default_copy_action
+ *
+ * This function is called from mlx5_flow_stop_default(). No thread
+ * safety is guaranteed.
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ */
+void
+mlx5_flow_nta_del_default_copy_action(struct rte_eth_dev *dev)
+{
+	struct mlx5_list_entry *entry;
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_flow_cb_ctx ctx;
+	uint32_t mark_id;
+
+	/* Check if default flow is registered. */
+	if (!priv->sh->mreg_cp_tbl)
+		return;
+	mark_id = MLX5_DEFAULT_COPY_ID;
+	ctx.data = &mark_id;
+	entry = mlx5_hlist_lookup(priv->sh->mreg_cp_tbl, mark_id, &ctx);
+	if (!entry)
+		return;
+	mlx5_hlist_unregister(priv->sh->mreg_cp_tbl, entry);
+}
+
+/*
+ * Add the default copy action in RX_CP_TBL.
+ *
+ * This function is called from mlx5_flow_start_default(). No thread
+ * safety is guaranteed.
+ * @see flow_mreg_add_default_copy_action
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 for success, negative value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_nta_add_default_copy_action(struct rte_eth_dev *dev,
+				      struct rte_flow_error *error)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_sh_config *config = &priv->sh->config;
+	struct mlx5_flow_mreg_copy_resource *mcp_res;
+	struct mlx5_flow_cb_ctx ctx;
+	uint32_t mark_id;
+
+	/* Nothing to do without extended metadata or a usable REG_C_0 mask. */
+	if (config->dv_xmeta_en == MLX5_XMETA_MODE_LEGACY ||
+	    !priv->sh->dv_regc0_mask)
+		return 0;
+	/*
+	 * Add default mreg copy flow may be called multiple times, but
+	 * only be called once in stop. Avoid registering it twice.
+	 */
+	mark_id = MLX5_DEFAULT_COPY_ID;
+	ctx.data = &mark_id;
+	if (mlx5_hlist_lookup(priv->sh->mreg_cp_tbl, mark_id, &ctx))
+		return 0;
+	mcp_res = mlx5_flow_nta_add_copy_action(dev, mark_id, error);
+	if (!mcp_res)
+		return -rte_errno;
+	return 0;
+}
+
+/*
+ * Add a flow of copying flow metadata registers in RX_CP_TBL.
+ * @see flow_mreg_update_copy_table
+ *
+ * @param[in] dev
+ *   Pointer to Ethernet device.
+ * @param[out] idx
+ *   Pointer to store the index of flow in the pool; only written when a
+ *   MARK or FLAG action is present, left untouched otherwise.
+ * @param[in] mark
+ *   Pointer to mark or flag action, may be NULL for FLAG.
+ * @param[in] action_flags
+ *   Holds the actions detected.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, negative value otherwise and rte_errno is set.
+ */
+int
+mlx5_flow_nta_update_copy_table(struct rte_eth_dev *dev,
+				uint32_t *idx,
+				const struct rte_flow_action *mark,
+				uint64_t action_flags,
+				struct rte_flow_error *error)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_sh_config *config = &priv->sh->config;
+	struct mlx5_flow_mreg_copy_resource *mcp_res;
+	const struct rte_flow_action_mark *mark_conf;
+	uint32_t mark_id;
+
+	/* Nothing to do without extended metadata or a usable REG_C_0 mask. */
+	if (config->dv_xmeta_en == MLX5_XMETA_MODE_LEGACY ||
+	    !priv->sh->dv_regc0_mask)
+		return 0;
+	/* Find MARK action. */
+	if (action_flags & (MLX5_FLOW_ACTION_FLAG | MLX5_FLOW_ACTION_MARK)) {
+		if (mark) {
+			mark_conf = (const struct rte_flow_action_mark *)mark->conf;
+			mark_id = mark_conf->id;
+		} else {
+			/* FLAG action without conf uses the default mark. */
+			mark_id = MLX5_FLOW_MARK_DEFAULT;
+		}
+		mcp_res = mlx5_flow_nta_add_copy_action(dev, mark_id, error);
+		if (!mcp_res)
+			return -rte_errno;
+		*idx = mcp_res->idx;
+	}
+	return 0;
+}
+
+#endif
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index 6fa7c01cd0..1cdf5382c3 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -1446,12 +1446,7 @@  mlx5_dev_stop(struct rte_eth_dev *dev)
 	mlx5_mp_os_req_stop_rxtx(dev);
 	rte_delay_us_sleep(1000 * priv->rxqs_n);
 	DRV_LOG(DEBUG, "port %u stopping device", dev->data->port_id);
-	if (priv->sh->config.dv_flow_en == 2) {
-		if (!rte_atomic_load_explicit(&priv->hws_mark_refcnt, rte_memory_order_relaxed))
-			flow_hw_rxq_flag_set(dev, false);
-	} else {
-		mlx5_flow_stop_default(dev);
-	}
+	mlx5_flow_stop_default(dev);
 	/* Control flows for default traffic can be removed firstly. */
 	mlx5_traffic_disable(dev);
 	/* All RX queue flags will be cleared in the flush interface. */