[v2,3/4] net/mlx5: implement tunnel offload API

Message ID 20200908201552.14423-4-getelson@nvidia.com (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Series: Tunnel Offload API

Checks

Context         Check     Description
ci/checkpatch   warning   coding style issues

Commit Message

Gregory Etelson Sept. 8, 2020, 8:15 p.m. UTC
  From: Gregory Etelson <getelson@mellanox.com>

The Tunnel Offload API provides a hardware-independent, unified model
for offloading tunneled traffic. Key model elements are:
 - matches are applied to both outer and inner packet headers
   during the entire offload procedure;
 - the outer header of a partially offloaded packet is restored;
 - the model is implemented as a set of helper functions.
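
As a sketch of how an application is expected to drive these helpers
(the rte_flow_tunnel_decap_set()/rte_flow_tunnel_match() names and
signatures follow the ethdev patch of this series; VXLAN is the only
tunnel type this PMD accepts, and error handling is trimmed):

  #include <rte_flow.h>

  /* Obtain the PMD rule fragments for a VXLAN tunnel. */
  static int
  tunnel_offload_prepare(uint16_t port_id,
                         struct rte_flow_action **pmd_actions,
                         uint32_t *num_of_actions,
                         struct rte_flow_item **pmd_items,
                         uint32_t *num_of_items,
                         struct rte_flow_error *error)
  {
          struct rte_flow_tunnel tunnel = {
                  .type = RTE_FLOW_ITEM_TYPE_VXLAN,
          };
          int ret;

          /* PMD actions to prepend to the tunnel steering (decap set) rule. */
          ret = rte_flow_tunnel_decap_set(port_id, &tunnel, pmd_actions,
                                          num_of_actions, error);
          if (ret)
                  return ret;
          /* PMD items to prepend to the inner packet match rule. */
          return rte_flow_tunnel_match(port_id, &tunnel, pmd_items,
                                       num_of_items, error);
  }

The returned arrays are merged in front of the application's own
actions and items before rte_flow_create(), and are released later
with rte_flow_tunnel_action_decap_release() and
rte_flow_tunnel_item_release().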

Implementation details:
* the tunnel_offload PMD parameter must be set to 1 to enable the feature.
* the application cannot use MARK and META flow actions with tunnel offload.
* the JUMP flow action is restricted to tunnel steering rules only.
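
For the restore path, a minimal sketch of recovering the outer header
description of a partially offloaded packet (again assuming the
rte_flow_get_restore_info() helper from the ethdev patch of this
series):

  #include <rte_flow.h>
  #include <rte_mbuf.h>

  /* Query tunnel restore info for a packet that missed the inner
   * match rule and was delivered to the application.
   */
  static void
  handle_tunnel_miss(uint16_t port_id, struct rte_mbuf *m)
  {
          struct rte_flow_restore_info info;
          struct rte_flow_error error;

          if (rte_flow_get_restore_info(port_id, m, &info, &error))
                  return; /* no tunnel context attached to this packet */
          if (info.flags & RTE_FLOW_RESTORE_INFO_TUNNEL) {
                  /* info.tunnel describes the tunnel the packet arrived
                   * on; RTE_FLOW_RESTORE_INFO_ENCAPSULATED means the
                   * outer headers are still present in the mbuf data.
                   */
          }
  }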

Signed-off-by: Gregory Etelson <getelson@mellanox.com>
---
v2:
* introduce MLX5 PMD API implementation
---
 drivers/net/mlx5/linux/mlx5_os.c |  14 +
 drivers/net/mlx5/mlx5.c          |   6 +
 drivers/net/mlx5/mlx5.h          |   4 +
 drivers/net/mlx5/mlx5_flow.c     | 453 +++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_flow.h     |  49 ++++
 drivers/net/mlx5/mlx5_flow_dv.c  |  71 ++++-
 6 files changed, 595 insertions(+), 2 deletions(-)
  

Patch

diff --git a/drivers/net/mlx5/linux/mlx5_os.c b/drivers/net/mlx5/linux/mlx5_os.c
index 69123e12c3..7193398750 100644
--- a/drivers/net/mlx5/linux/mlx5_os.c
+++ b/drivers/net/mlx5/linux/mlx5_os.c
@@ -281,6 +281,12 @@  mlx5_alloc_shared_dr(struct mlx5_priv *priv)
 		sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop();
 	}
 #endif
+	if (!sh->tunnel_hub)
+		err = mlx5_alloc_tunnel_hub(sh);
+	if (err) {
+		DRV_LOG(ERR, "mlx5_alloc_tunnel_hub failed err=%d", err);
+		goto error;
+	}
 	if (priv->config.reclaim_mode == MLX5_RCM_AGGR) {
 		mlx5_glue->dr_reclaim_domain_memory(sh->rx_domain, 1);
 		mlx5_glue->dr_reclaim_domain_memory(sh->tx_domain, 1);
@@ -319,6 +325,10 @@  mlx5_alloc_shared_dr(struct mlx5_priv *priv)
 		mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
 		sh->tag_table = NULL;
 	}
+	if (sh->tunnel_hub) {
+		mlx5_release_tunnel_hub(sh);
+		sh->tunnel_hub = NULL;
+	}
 	mlx5_free_table_hash_list(priv);
 	return err;
 }
@@ -372,6 +382,10 @@  mlx5_os_free_shared_dr(struct mlx5_priv *priv)
 		mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
 		sh->tag_table = NULL;
 	}
+	if (sh->tunnel_hub) {
+		mlx5_release_tunnel_hub(sh);
+		sh->tunnel_hub = NULL;
+	}
 	mlx5_free_table_hash_list(priv);
 }
 
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 1e4c695f84..569070d3db 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -177,6 +177,9 @@ 
 /* Decap will be used or not. */
 #define MLX5_DECAP_EN "decap_en"
 
+/* Configure flow tunnel offload functionality. */
+#define MLX5_TUNNEL_OFFLOAD "tunnel_offload"
+
 /* Shared memory between primary and secondary processes. */
 struct mlx5_shared_data *mlx5_shared_data;
 
@@ -1621,6 +1624,8 @@  mlx5_args_check(const char *key, const char *val, void *opaque)
 		config->sys_mem_en = !!tmp;
 	} else if (strcmp(MLX5_DECAP_EN, key) == 0) {
 		config->decap_en = !!tmp;
+	} else if (strcmp(MLX5_TUNNEL_OFFLOAD, key) == 0) {
+		config->tunnel_offload = !!tmp;
 	} else {
 		DRV_LOG(WARNING, "%s: unknown parameter", key);
 		rte_errno = EINVAL;
@@ -1681,6 +1686,7 @@  mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs)
 		MLX5_RECLAIM_MEM,
 		MLX5_SYS_MEM_EN,
 		MLX5_DECAP_EN,
+		MLX5_TUNNEL_OFFLOAD,
 		NULL,
 	};
 	struct rte_kvargs *kvlist;
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 78d6eb7281..e450aec029 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -208,6 +208,7 @@  struct mlx5_dev_config {
 	unsigned int rt_timestamp:1; /* realtime timestamp format. */
 	unsigned int sys_mem_en:1; /* The default memory allocator. */
 	unsigned int decap_en:1; /* Whether decap will be used or not. */
+	unsigned int tunnel_offload:1; /* Flow tunnel offload functionality. */
 	struct {
 		unsigned int enabled:1; /* Whether MPRQ is enabled. */
 		unsigned int stride_num_n; /* Number of strides. */
@@ -605,6 +606,7 @@  struct mlx5_dev_ctx_shared {
 	LIST_ENTRY(mlx5_dev_ctx_shared) next;
 	uint32_t refcnt;
 	uint32_t devx:1; /* Opened with DV. */
+	uint32_t tunnel:1; /* Whether RTE flow tunnel is enabled. */
 	uint32_t max_port; /* Maximal IB device port index. */
 	void *ctx; /* Verbs/DV/DevX context. */
 	void *pd; /* Protection Domain. */
@@ -634,6 +636,8 @@  struct mlx5_dev_ctx_shared {
 	/* UAR same-page access control required in 32bit implementations. */
 #endif
 	struct mlx5_hlist *flow_tbls;
+	struct rte_hash *flow_tbl_map; /* Application group-to-flow table map. */
+	struct mlx5_flow_tunnel_hub *tunnel_hub;
 	/* Direct Rules tables for FDB, NIC TX+RX */
 	void *esw_drop_action; /* Pointer to DR E-Switch drop action. */
 	void *pop_vlan_action; /* Pointer to DR pop VLAN action. */
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 71501730b5..7d4bddee39 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -18,6 +18,7 @@ 
 #include <rte_flow_driver.h>
 #include <rte_malloc.h>
 #include <rte_ip.h>
+#include <rte_hash.h>
 
 #include <mlx5_glue.h>
 #include <mlx5_devx_cmds.h>
@@ -30,6 +31,13 @@ 
 #include "mlx5_flow_os.h"
 #include "mlx5_rxtx.h"
 
+static struct mlx5_flow_tunnel *
+mlx5_find_tunnel_id(struct rte_eth_dev *dev, uint32_t id);
+static void
+mlx5_flow_tunnel_free(struct rte_eth_dev *dev, struct mlx5_flow_tunnel *tunnel);
+static uint32_t
+mlx5_mark_to_tunnel_id(uint32_t mark);
+
 /** Device flow drivers. */
 extern const struct mlx5_flow_driver_ops mlx5_flow_verbs_drv_ops;
 
@@ -220,6 +228,169 @@  static const struct rte_flow_expand_node mlx5_support_expansion[] = {
 	},
 };
 
+static inline bool
+mlx5_flow_tunnel_validate(struct rte_eth_dev *dev,
+			  struct rte_flow_tunnel *tunnel)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+
+	if (!priv->config.tunnel_offload || !tunnel)
+		goto err;
+
+	switch (tunnel->type) {
+	default:
+		goto err;
+	case RTE_FLOW_ITEM_TYPE_VXLAN:
+		break;
+	}
+
+	return true;
+
+err:
+	return false;
+}
+
+static int
+mlx5_flow_tunnel_set(struct rte_eth_dev *dev,
+		    struct rte_flow_tunnel *app_tunnel,
+		    struct rte_flow_action **actions,
+		    uint32_t *num_of_actions,
+		    struct rte_flow_error *error)
+{
+	int ret;
+	struct mlx5_flow_tunnel *tunnel;
+	if (!mlx5_flow_tunnel_validate(dev, app_tunnel))
+		return rte_flow_error_set(error, EINVAL,
+					  RTE_FLOW_ERROR_TYPE_ACTION_CONF, NULL,
+					  "invalid argument");
+
+	ret = mlx5_get_flow_tunnel(dev, app_tunnel, &tunnel);
+	if (ret < 0) {
+		return rte_flow_error_set(error, -ret,
+					  RTE_FLOW_ERROR_TYPE_ACTION_CONF, NULL,
+					  "failed to match pmd tunnel");
+	}
+	rte_atomic32_inc(&tunnel->refctn);
+	*actions = &tunnel->action;
+	*num_of_actions = 1;
+	return 0;
+}
+
+static int
+mlx5_flow_tunnel_match(struct rte_eth_dev *dev,
+		       struct rte_flow_tunnel *app_tunnel,
+		       struct rte_flow_item **items,
+		       uint32_t *num_of_items,
+		       struct rte_flow_error *error)
+{
+	int ret;
+	struct mlx5_flow_tunnel *tunnel;
+	if (!mlx5_flow_tunnel_validate(dev, app_tunnel))
+		return rte_flow_error_set(error, EINVAL,
+					  RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+					  "invalid argument");
+
+	ret = mlx5_get_flow_tunnel(dev, app_tunnel, &tunnel);
+	if (ret < 0) {
+		return rte_flow_error_set(error, -ret,
+					  RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+					  "failed to match pmd tunnel");
+	}
+
+	rte_atomic32_inc(&tunnel->refctn);
+	*items = &tunnel->item;
+	*num_of_items = 1;
+	return 0;
+}
+
+static int
+mlx5_flow_item_release(struct rte_eth_dev *dev,
+		       struct rte_flow_item *pmd_items,
+		       uint32_t num_items, struct rte_flow_error *err)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+	struct mlx5_flow_tunnel_hub *thub = sh->tunnel_hub;
+	struct mlx5_flow_tunnel *tun;
+
+	LIST_FOREACH(tun, &thub->tunnels, chain) {
+		if (&tun->item == pmd_items)
+			break;
+	}
+	if (!tun || num_items != 1)
+		return rte_flow_error_set(err, EINVAL,
+					  RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+					  "invalid argument");
+	if (rte_atomic32_dec_and_test(&tun->refctn))
+		mlx5_flow_tunnel_free(dev, tun);
+	return 0;
+}
+
+static int
+mlx5_flow_action_release(struct rte_eth_dev *dev,
+			 struct rte_flow_action *pmd_actions,
+			 uint32_t num_actions, struct rte_flow_error *err)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+	struct mlx5_flow_tunnel_hub *thub = sh->tunnel_hub;
+	struct mlx5_flow_tunnel *tun;
+
+	LIST_FOREACH(tun, &thub->tunnels, chain) {
+		if (&tun->action == pmd_actions)
+			break;
+	}
+	if (!tun || num_actions != 1)
+		return rte_flow_error_set(err, EINVAL,
+					  RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+					  "invalid argument");
+	if (rte_atomic32_dec_and_test(&tun->refctn))
+		mlx5_flow_tunnel_free(dev, tun);
+
+	return 0;
+}
+
+static void
+mlx5_restore_packet_outer(struct rte_eth_dev *dev __rte_unused,
+			  struct mlx5_flow_tunnel *tunnel __rte_unused,
+			  struct rte_mbuf *m __rte_unused)
+{
+}
+
+static int
+mlx5_flow_tunnel_get_restore_info(struct rte_eth_dev *dev,
+				  struct rte_mbuf *m,
+				  struct rte_flow_restore_info *info,
+				  struct rte_flow_error *err)
+{
+	uint32_t id;
+	uint64_t ol_flags = m->ol_flags;
+	struct mlx5_flow_tunnel *tunnel;
+	const uint64_t mask = PKT_RX_FDIR | PKT_RX_FDIR_ID;
+
+	if ((ol_flags & mask) != mask)
+		goto err;
+	id = mlx5_mark_to_tunnel_id(m->hash.fdir.hi);
+	if (!id)
+		goto err;
+	tunnel = mlx5_find_tunnel_id(dev, id);
+	if (!tunnel)
+		goto err;
+	mlx5_restore_packet_outer(dev, tunnel, m);
+	memcpy(&info->tunnel, &tunnel->app_tunnel, sizeof(info->tunnel));
+	m->ol_flags &= ~PKT_RX_FDIR;
+	info->group_id = -1u;
+	info->flags = RTE_FLOW_RESTORE_INFO_TUNNEL |
+		      RTE_FLOW_RESTORE_INFO_ENCAPSULATED;
+
+	return 0;
+
+err:
+	return rte_flow_error_set(err, EINVAL,
+				  RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+				  "failed to get restore info");
+}
+
 static const struct rte_flow_ops mlx5_flow_ops = {
 	.validate = mlx5_flow_validate,
 	.create = mlx5_flow_create,
@@ -229,6 +400,11 @@  static const struct rte_flow_ops mlx5_flow_ops = {
 	.query = mlx5_flow_query,
 	.dev_dump = mlx5_flow_dev_dump,
 	.get_aged_flows = mlx5_flow_get_aged_flows,
+	.tunnel_decap_set = mlx5_flow_tunnel_set,
+	.tunnel_match = mlx5_flow_tunnel_match,
+	.action_release = mlx5_flow_action_release,
+	.item_release = mlx5_flow_item_release,
+	.get_restore_info = mlx5_flow_tunnel_get_restore_info,
 };
 
 /* Convert FDIR request to Generic flow. */
@@ -3524,6 +3700,104 @@  flow_hairpin_split(struct rte_eth_dev *dev,
 	return 0;
 }
 
+static uint32_t
+mlx5_tunnel_id_to_mark(uint32_t id)
+{
+	return (!id || id >= MLX5_MAX_TUNNELS) ?
+		0 : (id << 16);
+}
+
+static uint32_t
+mlx5_mark_to_tunnel_id(uint32_t mark)
+{
+	mark &= MLX5_TUNNEL_MARK_MASK;
+	return mark >> 16;
+}
+
+static int
+flow_tunnel_add_default_miss(struct rte_eth_dev *dev,
+			     struct rte_flow *flow,
+			     const struct rte_flow_attr *attr,
+			     const struct rte_flow_action *app_actions,
+			     uint32_t flow_idx,
+			     struct rte_flow_error *error)
+{
+	struct mlx5_flow *dev_flow;
+	struct rte_flow_attr miss_attr = *attr;
+	const struct mlx5_flow_tunnel *tunnel = app_actions[0].conf;
+	struct mlx5_priv *priv = dev->data->dev_private;
+	uint16_t queue[priv->reta_idx_n];
+	struct rte_flow_action_rss action_rss = {
+		.func = RTE_ETH_HASH_FUNCTION_DEFAULT,
+		.level = 0,
+		.types = priv->rss_conf.rss_hf,
+		.key_len = priv->rss_conf.rss_key_len,
+		.queue_num = priv->reta_idx_n,
+		.key = priv->rss_conf.rss_key,
+		.queue = queue,
+	};
+	const struct rte_flow_action_mark miss_mark = {
+		.id = mlx5_tunnel_id_to_mark(tunnel->tunnel_id)
+	};
+	const struct rte_flow_item *items, miss_items[2] = {
+		{
+			.type = RTE_FLOW_ITEM_TYPE_ETH,
+			.spec = NULL,
+			.last = NULL,
+			.mask = NULL
+		},
+		{
+			.type = RTE_FLOW_ITEM_TYPE_END,
+			.spec = NULL,
+			.last = NULL,
+			.mask = NULL
+		}
+	};
+	const struct rte_flow_action *actions, miss_actions[3] = {
+		{ .type = RTE_FLOW_ACTION_TYPE_MARK, .conf = &miss_mark },
+		{ .type = RTE_FLOW_ACTION_TYPE_RSS, .conf = &action_rss },
+		{ .type = RTE_FLOW_ACTION_TYPE_END, .conf = NULL }
+	};
+	const struct rte_flow_action_jump *jump_data;
+	uint32_t i;
+
+	if (!priv->reta_idx_n || !priv->rxqs_n)
+		return rte_flow_error_set(error, EINVAL,
+				   RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+				   NULL, "invalid port configuration");
+	if (!(dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG))
+		action_rss.types = 0;
+	for (i = 0; i != priv->reta_idx_n; ++i)
+		queue[i] = (*priv->reta_idx)[i];
+
+	if (!miss_mark.id)
+		return rte_flow_error_set(error, EINVAL,
+					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+					  NULL, "invalid tunnel id");
+	items = (typeof(items))miss_items;
+	actions = (typeof(actions))miss_actions;
+	for (; app_actions->type != RTE_FLOW_ACTION_TYPE_JUMP; app_actions++);
+	jump_data = app_actions->conf;
+	miss_attr.priority = 3;
+	miss_attr.group = TUNNEL_STEER_GROUP(jump_data->group);
+	dev_flow = flow_drv_prepare(dev, flow, &miss_attr,
+				    items, actions, flow_idx, error);
+	if (!dev_flow)
+		return -rte_errno;
+	dev_flow->flow = flow;
+	dev_flow->external = true;
+	/* Subflow object was created, we must include it in the list. */
+	SILIST_INSERT(&flow->dev_handles, dev_flow->handle_idx,
+		      dev_flow->handle, next);
+
+	DRV_LOG(DEBUG,
+		"port %u tunnel type=%d id=%u miss rule priority=%u group=%u",
+		dev->data->port_id, tunnel->app_tunnel.type,
+		tunnel->tunnel_id, miss_attr.priority, miss_attr.group);
+	return flow_drv_translate(dev, dev_flow, &miss_attr, items,
+				  actions, error);
+}
+
 /**
  * The last stage of splitting chain, just creates the subflow
  * without any modification.
@@ -4296,6 +4570,27 @@  flow_create_split_outer(struct rte_eth_dev *dev,
 	return ret;
 }
 
+static struct mlx5_flow_tunnel *
+flow_tunnel_from_rule(struct rte_eth_dev *dev,
+		      const struct rte_flow_attr *attr,
+		      const struct rte_flow_item items[],
+		      const struct rte_flow_action actions[])
+{
+	struct mlx5_flow_tunnel *tunnel;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+	if (is_flow_tunnel_match_rule(dev, attr, items, actions))
+		tunnel = (struct mlx5_flow_tunnel *)items[0].spec;
+	else if (is_flow_tunnel_steer_rule(dev, attr, items, actions))
+		tunnel = (struct mlx5_flow_tunnel *)actions[0].conf;
+	else
+		tunnel = NULL;
+#pragma GCC diagnostic pop
+
+	return tunnel;
+}
+
 /**
  * Create a flow and add it to @p list.
  *
@@ -4356,6 +4651,7 @@  flow_list_create(struct rte_eth_dev *dev, uint32_t *list,
 	int hairpin_flow;
 	uint32_t hairpin_id = 0;
 	struct rte_flow_attr attr_tx = { .priority = 0 };
+	struct mlx5_flow_tunnel *tunnel;
 	int ret;
 
 	hairpin_flow = flow_check_hairpin_split(dev, attr, actions);
@@ -4430,6 +4726,15 @@  flow_list_create(struct rte_eth_dev *dev, uint32_t *list,
 					      error);
 		if (ret < 0)
 			goto error;
+		if (is_flow_tunnel_steer_rule(dev, attr,
+					      buf->entry[i].pattern,
+					      p_actions_rx)) {
+			ret = flow_tunnel_add_default_miss(dev, flow, attr,
+							   p_actions_rx,
+							   idx, error);
+			if (ret < 0)
+				goto error;
+		}
 	}
 	/* Create the tx flow. */
 	if (hairpin_flow) {
@@ -4484,6 +4789,12 @@  flow_list_create(struct rte_eth_dev *dev, uint32_t *list,
 	priv->flow_idx = priv->flow_nested_idx;
 	if (priv->flow_nested_idx)
 		priv->flow_nested_idx = 0;
+	tunnel = flow_tunnel_from_rule(dev, attr, items, actions);
+	if (tunnel) {
+		flow->tunnel = 1;
+		flow->tunnel_id = tunnel->tunnel_id;
+		rte_atomic32_inc(&tunnel->refctn);
+	}
 	return idx;
 error:
 	MLX5_ASSERT(flow);
@@ -4657,6 +4968,13 @@  flow_list_destroy(struct rte_eth_dev *dev, uint32_t *list,
 		}
 	}
 	mlx5_ipool_free(priv->sh->ipool[MLX5_IPOOL_RTE_FLOW], flow_idx);
+	if (flow->tunnel) {
+		struct mlx5_flow_tunnel *tunnel;
+		tunnel = mlx5_find_tunnel_id(dev, flow->tunnel_id);
+		RTE_VERIFY(tunnel);
+		if (rte_atomic32_dec_and_test(&tunnel->refctn))
+			mlx5_flow_tunnel_free(dev, tunnel);
+	}
 }
 
 /**
@@ -6301,3 +6619,138 @@  mlx5_flow_get_aged_flows(struct rte_eth_dev *dev, void **contexts,
 		 dev->data->port_id);
 	return -ENOTSUP;
 }
+
+static void
+mlx5_flow_tunnel_free(struct rte_eth_dev *dev,
+		      struct mlx5_flow_tunnel *tunnel)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+	struct mlx5_flow_tunnel_hub *thub = sh->tunnel_hub;
+	struct mlx5_flow_id_pool *id_pool = thub->tunnel_ids;
+
+	DRV_LOG(DEBUG, "port %u release pmd tunnel id=0x%x",
+		dev->data->port_id, tunnel->tunnel_id);
+	RTE_VERIFY(!rte_atomic32_read(&tunnel->refctn));
+	LIST_REMOVE(tunnel, chain);
+	mlx5_flow_id_release(id_pool, tunnel->tunnel_id);
+	free(tunnel);
+}
+
+static struct mlx5_flow_tunnel *
+mlx5_find_tunnel_id(struct rte_eth_dev *dev, uint32_t id)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+	struct mlx5_flow_tunnel_hub *thub = sh->tunnel_hub;
+	struct mlx5_flow_tunnel *tun;
+
+	LIST_FOREACH(tun, &thub->tunnels, chain) {
+		if (tun->tunnel_id == id)
+			break;
+	}
+
+	return tun;
+}
+
+static struct mlx5_flow_tunnel *
+mlx5_flow_tunnel_allocate(struct rte_eth_dev *dev,
+			  const struct rte_flow_tunnel *app_tunnel)
+{
+	int ret;
+	struct mlx5_flow_tunnel *tunnel;
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+	struct mlx5_flow_tunnel_hub *thub = sh->tunnel_hub;
+	struct mlx5_flow_id_pool *id_pool = thub->tunnel_ids;
+	uint32_t id;
+
+	ret = mlx5_flow_id_get(id_pool, &id);
+	if (ret)
+		return NULL;
+	/*
+	 * mlx5 flow tunnel is an auxiliary data structure.
+	 * It is not part of the I/O path, so there is no need to
+	 * allocate it from the huge-page pools dedicated to I/O.
+	 */
+	tunnel = calloc(1, sizeof(*tunnel));
+	if (!tunnel) {
+		mlx5_flow_id_release(id_pool, id);
+		return NULL;
+	}
+	/* Initialize the new PMD tunnel. */
+	memcpy(&tunnel->app_tunnel, app_tunnel, sizeof(*app_tunnel));
+	rte_atomic32_init(&tunnel->refctn);
+	tunnel->tunnel_id = id;
+	tunnel->action.type = MLX5_RTE_FLOW_ACTION_TYPE_TUNNEL_SET;
+	tunnel->action.conf = tunnel;
+	tunnel->item.type = MLX5_RTE_FLOW_ITEM_TYPE_TUNNEL;
+	tunnel->item.spec = tunnel;
+	tunnel->item.last = NULL;
+	tunnel->item.mask = NULL;
+	DRV_LOG(DEBUG, "port %u new pmd tunnel id=0x%x",
+		dev->data->port_id, tunnel->tunnel_id);
+
+	return tunnel;
+}
+
+int
+mlx5_get_flow_tunnel(struct rte_eth_dev *dev,
+		     const struct rte_flow_tunnel *app_tunnel,
+		     struct mlx5_flow_tunnel **tunnel)
+{
+	int ret = 0;
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_ctx_shared *sh = priv->sh;
+	struct mlx5_flow_tunnel_hub *thub = sh->tunnel_hub;
+	struct mlx5_flow_tunnel *tun;
+
+	LIST_FOREACH(tun, &thub->tunnels, chain) {
+		if (!memcmp(app_tunnel, &tun->app_tunnel,
+			    sizeof(*app_tunnel))) {
+			*tunnel = tun;
+			ret = 0;
+			break;
+		}
+	}
+	if (!tun) {
+		tun = mlx5_flow_tunnel_allocate(dev, app_tunnel);
+		if (tun) {
+			LIST_INSERT_HEAD(&thub->tunnels, tun, chain);
+			*tunnel = tun;
+		} else {
+			ret = -ENOMEM;
+		}
+	}
+
+	return ret;
+}
+
+void mlx5_release_tunnel_hub(struct mlx5_dev_ctx_shared *sh)
+{
+	if (!sh->tunnel_hub)
+		return;
+	RTE_VERIFY(LIST_EMPTY(&sh->tunnel_hub->tunnels));
+	mlx5_flow_id_pool_release(sh->tunnel_hub->tunnel_ids);
+	free(sh->tunnel_hub);
+}
+
+int mlx5_alloc_tunnel_hub(struct mlx5_dev_ctx_shared *sh)
+{
+	int err;
+
+	sh->tunnel_hub = calloc(1, sizeof(*sh->tunnel_hub));
+	if (!sh->tunnel_hub)
+		return -ENOMEM;
+	LIST_INIT(&sh->tunnel_hub->tunnels);
+	sh->tunnel_hub->tunnel_ids = mlx5_flow_id_pool_alloc(MLX5_MAX_TUNNELS);
+	if (!sh->tunnel_hub->tunnel_ids) {
+		err = -rte_errno;
+		free(sh->tunnel_hub);
+		sh->tunnel_hub = NULL;
+	} else {
+		err = 0;
+	}
+
+	return err;
+}
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index 66caefce46..5d2be7d123 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -26,6 +26,7 @@  enum mlx5_rte_flow_item_type {
 	MLX5_RTE_FLOW_ITEM_TYPE_TAG,
 	MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE,
 	MLX5_RTE_FLOW_ITEM_TYPE_VLAN,
+	MLX5_RTE_FLOW_ITEM_TYPE_TUNNEL,
 };
 
 /* Private (internal) rte flow actions. */
@@ -35,6 +36,7 @@  enum mlx5_rte_flow_action_type {
 	MLX5_RTE_FLOW_ACTION_TYPE_MARK,
 	MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG,
 	MLX5_RTE_FLOW_ACTION_TYPE_DEFAULT_MISS,
+	MLX5_RTE_FLOW_ACTION_TYPE_TUNNEL_SET,
 };
 
 /* Matches on selected register. */
@@ -196,6 +198,7 @@  enum mlx5_feature_name {
 #define MLX5_FLOW_ACTION_SET_IPV6_DSCP (1ull << 33)
 #define MLX5_FLOW_ACTION_AGE (1ull << 34)
 #define MLX5_FLOW_ACTION_DEFAULT_MISS (1ull << 35)
+#define MLX5_FLOW_ACTION_TUNNEL_TYPE1 (1ull << 36)
 
 #define MLX5_FLOW_FATE_ACTIONS \
 	(MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_QUEUE | \
@@ -816,6 +819,45 @@  struct mlx5_fdir_flow {
 
 #define HAIRPIN_FLOW_ID_BITS 28
 
+#define MLX5_MAX_TUNNELS 63
+#define MLX5_TUNNEL_MARK_MASK 0x3F0000u
+#define TUNNEL_STEER_GROUP(grp) ((grp) | (1u << 31))
+
+struct mlx5_flow_tunnel {
+	LIST_ENTRY(mlx5_flow_tunnel) chain;
+	struct rte_flow_tunnel app_tunnel;	/**< App tunnel copy. */
+	uint32_t tunnel_id;		/**< Unique tunnel ID. */
+	rte_atomic32_t refctn;
+	struct rte_flow_action action;
+	struct rte_flow_item item;
+};
+
+/** PMD tunnel related context */
+struct mlx5_flow_tunnel_hub {
+	LIST_HEAD(, mlx5_flow_tunnel) tunnels;
+	struct mlx5_flow_id_pool *tunnel_ids;
+};
+
+static inline bool
+is_flow_tunnel_match_rule(__rte_unused struct rte_eth_dev *dev,
+			  __rte_unused const struct rte_flow_attr *attr,
+			  __rte_unused const struct rte_flow_item items[],
+			  __rte_unused const struct rte_flow_action actions[])
+{
+	return (items[0].type == (typeof(items[0].type))
+			MLX5_RTE_FLOW_ITEM_TYPE_TUNNEL);
+}
+
+static inline bool
+is_flow_tunnel_steer_rule(__rte_unused struct rte_eth_dev *dev,
+			  __rte_unused const struct rte_flow_attr *attr,
+			  __rte_unused const struct rte_flow_item items[],
+			  __rte_unused const struct rte_flow_action actions[])
+{
+	return (actions[0].type == (typeof(actions[0].type))
+			MLX5_RTE_FLOW_ACTION_TYPE_TUNNEL_SET);
+}
+
 /* Flow structure. */
 struct rte_flow {
 	ILIST_ENTRY(uint32_t)next; /**< Index to the next flow structure. */
@@ -823,12 +865,14 @@  struct rte_flow {
 	/**< Device flow handles that are part of the flow. */
 	uint32_t drv_type:2; /**< Driver type. */
 	uint32_t fdir:1; /**< Identifier of associated FDIR if any. */
+	uint32_t tunnel:1; /**< Flow is associated with a tunnel. */
 	uint32_t hairpin_flow_id:HAIRPIN_FLOW_ID_BITS;
 	/**< The flow id used for hairpin. */
 	uint32_t copy_applied:1; /**< The MARK copy Flow os applied. */
 	uint32_t rix_mreg_copy;
 	/**< Index to metadata register copy table resource. */
 	uint32_t counter; /**< Holds flow counter. */
+	uint32_t tunnel_id;  /**< Tunnel ID. */
 	uint16_t meter; /**< Holds flow meter id. */
 } __rte_packed;
 
@@ -1045,4 +1089,9 @@  int mlx5_flow_destroy_policer_rules(struct rte_eth_dev *dev,
 				    const struct rte_flow_attr *attr);
 int mlx5_flow_meter_flush(struct rte_eth_dev *dev,
 			  struct rte_mtr_error *error);
+int mlx5_get_flow_tunnel(struct rte_eth_dev *dev,
+			 const struct rte_flow_tunnel *app_tunnel,
+			 struct mlx5_flow_tunnel **tunnel);
+void mlx5_release_tunnel_hub(struct mlx5_dev_ctx_shared *sh);
+int mlx5_alloc_tunnel_hub(struct mlx5_dev_ctx_shared *sh);
 #endif /* RTE_PMD_MLX5_FLOW_H_ */
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index 53399800ff..21af7fb93a 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -3676,6 +3676,8 @@  flow_dv_validate_action_jump(const struct rte_flow_action *action,
 					  NULL, "action configuration not set");
 	target_group =
 		((const struct rte_flow_action_jump *)action->conf)->group;
+	if (action_flags & MLX5_FLOW_ACTION_TUNNEL_TYPE1)
+		target_group = TUNNEL_STEER_GROUP(target_group);
 	ret = mlx5_flow_group_to_table(attributes, external, target_group,
 				       true, &table, error);
 	if (ret)
@@ -5035,6 +5037,15 @@  flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr,
 						  RTE_FLOW_ERROR_TYPE_ITEM,
 						  NULL, "item not supported");
 		switch (type) {
+		case MLX5_RTE_FLOW_ITEM_TYPE_TUNNEL:
+			if (items[0].type != (typeof(items[0].type))
+						MLX5_RTE_FLOW_ITEM_TYPE_TUNNEL)
+				return rte_flow_error_set
+						(error, EINVAL,
+						RTE_FLOW_ERROR_TYPE_ITEM,
+						NULL, "MLX5 private items "
+						"must be the first");
+			break;
 		case RTE_FLOW_ITEM_TYPE_VOID:
 			break;
 		case RTE_FLOW_ITEM_TYPE_PORT_ID:
@@ -5699,6 +5710,17 @@  flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr,
 			action_flags |= MLX5_FLOW_ACTION_SET_IPV6_DSCP;
 			rw_act_num += MLX5_ACT_NUM_SET_DSCP;
 			break;
+		case MLX5_RTE_FLOW_ACTION_TYPE_TUNNEL_SET:
+			if (actions[0].type != (typeof(actions[0].type))
+				MLX5_RTE_FLOW_ACTION_TYPE_TUNNEL_SET)
+				return rte_flow_error_set
+						(error, EINVAL,
+						RTE_FLOW_ERROR_TYPE_ACTION,
+						NULL, "MLX5 private action "
+						"must be the first");
+
+			action_flags |= MLX5_FLOW_ACTION_TUNNEL_TYPE1;
+			break;
 		default:
 			return rte_flow_error_set(error, ENOTSUP,
 						  RTE_FLOW_ERROR_TYPE_ACTION,
@@ -5706,6 +5728,31 @@  flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr,
 						  "action not supported");
 		}
 	}
+	/*
+	 * Validate the flow tunnel decap_set rule.
+	 */
+	if (action_flags & MLX5_FLOW_ACTION_TUNNEL_TYPE1) {
+		uint64_t bad_actions_mask = MLX5_FLOW_ACTION_DECAP |
+						MLX5_FLOW_ACTION_MARK;
+
+		if (action_flags & bad_actions_mask)
+			return rte_flow_error_set
+					(error, EINVAL,
+					RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+					"Invalid RTE action in tunnel "
+					"set decap rule");
+		if (!(action_flags & MLX5_FLOW_ACTION_JUMP))
+			return rte_flow_error_set
+					(error, EINVAL,
+					RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+					"tunnel set decap rule must terminate "
+					"with JUMP");
+		if (!attr->ingress)
+			return rte_flow_error_set
+					(error, EINVAL,
+					RTE_FLOW_ERROR_TYPE_ACTION, NULL,
+					"tunnel flows for ingress traffic only");
+	}
 	/*
 	 * Validate the drop action mutual exclusion with other actions.
 	 * Drop action is mutually-exclusive with any other action, except for
@@ -8110,11 +8157,14 @@  __flow_dv_translate(struct rte_eth_dev *dev,
 	uint8_t next_protocol = 0xff;
 	struct rte_vlan_hdr vlan = { 0 };
 	uint32_t table;
+	uint32_t attr_group;
 	int ret = 0;
 
+	attr_group = !is_flow_tunnel_match_rule(dev, attr, items, actions) ?
+			attr->group : TUNNEL_STEER_GROUP(attr->group);
 	mhdr_res->ft_type = attr->egress ? MLX5DV_FLOW_TABLE_TYPE_NIC_TX :
 					   MLX5DV_FLOW_TABLE_TYPE_NIC_RX;
-	ret = mlx5_flow_group_to_table(attr, dev_flow->external, attr->group,
+	ret = mlx5_flow_group_to_table(attr, dev_flow->external, attr_group,
 				       !!priv->fdb_def_rule, &table, error);
 	if (ret)
 		return ret;
@@ -8125,6 +8175,15 @@  __flow_dv_translate(struct rte_eth_dev *dev,
 		priority = dev_conf->flow_prio - 1;
 	/* number of actions must be set to 0 in case of dirty stack. */
 	mhdr_res->actions_num = 0;
+	if (is_flow_tunnel_match_rule(dev, attr, items, actions)) {
+		if (flow_dv_create_action_l2_decap(dev, dev_flow,
+						   attr->transfer,
+						   error))
+			return -rte_errno;
+		dev_flow->dv.actions[actions_n++] =
+				dev_flow->dv.encap_decap->action;
+		action_flags |= MLX5_FLOW_ACTION_DECAP;
+	}
 	for (; !actions_end ; actions++) {
 		const struct rte_flow_action_queue *queue;
 		const struct rte_flow_action_rss *rss;
@@ -8134,6 +8193,7 @@  __flow_dv_translate(struct rte_eth_dev *dev,
 		const struct rte_flow_action_meter *mtr;
 		struct mlx5_flow_tbl_resource *tbl;
 		uint32_t port_id = 0;
+		uint32_t jump_group;
 		struct mlx5_flow_dv_port_id_action_resource port_id_resource;
 		int action_type = actions->type;
 		const struct rte_flow_action *found_action = NULL;
@@ -8145,6 +8205,9 @@  __flow_dv_translate(struct rte_eth_dev *dev,
 						  actions,
 						  "action not supported");
 		switch (action_type) {
+		case MLX5_RTE_FLOW_ACTION_TYPE_TUNNEL_SET:
+			action_flags |= MLX5_FLOW_ACTION_TUNNEL_TYPE1;
+			break;
 		case RTE_FLOW_ACTION_TYPE_VOID:
 			break;
 		case RTE_FLOW_ACTION_TYPE_PORT_ID:
@@ -8377,8 +8440,12 @@  __flow_dv_translate(struct rte_eth_dev *dev,
 			break;
 		case RTE_FLOW_ACTION_TYPE_JUMP:
 			jump_data = action->conf;
+			jump_group = !(action_flags &
+					MLX5_FLOW_ACTION_TUNNEL_TYPE1) ?
+					jump_data->group :
+					TUNNEL_STEER_GROUP(jump_data->group);
 			ret = mlx5_flow_group_to_table(attr, dev_flow->external,
-						       jump_data->group,
+						       jump_group,
 						       !!priv->fdb_def_rule,
 						       &table, error);
 			if (ret)