[v2,13/20] net/mlx5: add RSS flow action

Message ID ae5d5fc2b1a1501ca622e31c9d1cc6a348b2bd15.1530111623.git.nelio.laranjeiro@6wind.com (mailing list archive)
State Superseded, archived
Delegated to: Shahaf Shuler
Series: net/mlx5: flow rework

Checks

Context               Check    Description
ci/checkpatch         success  coding style OK
ci/Intel-compilation  fail     apply issues

Commit Message

Nélio Laranjeiro June 27, 2018, 3:07 p.m. UTC
  Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5_flow.c | 1211 +++++++++++++++++++++++++---------
 1 file changed, 899 insertions(+), 312 deletions(-)
  

Comments

Yongseok Koh July 6, 2018, 2:16 a.m. UTC | #1
On Wed, Jun 27, 2018 at 05:07:45PM +0200, Nelio Laranjeiro wrote:
> Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> ---
>  drivers/net/mlx5/mlx5_flow.c | 1211 +++++++++++++++++++++++++---------
>  1 file changed, 899 insertions(+), 312 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
> index a39157533..08e0a6556 100644
> --- a/drivers/net/mlx5/mlx5_flow.c
> +++ b/drivers/net/mlx5/mlx5_flow.c
> @@ -51,13 +51,148 @@ extern const struct eth_dev_ops mlx5_dev_ops_isolate;
>  /* Action fate on the packet. */
>  #define MLX5_FLOW_FATE_DROP (1u << 0)
>  #define MLX5_FLOW_FATE_QUEUE (1u << 1)
> +#define MLX5_FLOW_FATE_RSS (1u << 2)
>  
>  /* Modify a packet. */
>  #define MLX5_FLOW_MOD_FLAG (1u << 0)
>  #define MLX5_FLOW_MOD_MARK (1u << 1)
>  
> +/* Priority reserved for default flows. */
> +#define MLX5_FLOW_PRIO_RSVD ((uint32_t)-1)
> +
> +enum mlx5_expansion {
> +	MLX5_EXPANSION_ROOT,
> +	MLX5_EXPANSION_ROOT2,

How about MLX5_EXPANSION_OUTER_ROOT?

> +	MLX5_EXPANSION_OUTER_ETH,
> +	MLX5_EXPANSION_OUTER_IPV4,
> +	MLX5_EXPANSION_OUTER_IPV4_UDP,
> +	MLX5_EXPANSION_OUTER_IPV4_TCP,
> +	MLX5_EXPANSION_OUTER_IPV6,
> +	MLX5_EXPANSION_OUTER_IPV6_UDP,
> +	MLX5_EXPANSION_OUTER_IPV6_TCP,
> +	MLX5_EXPANSION_VXLAN,
> +	MLX5_EXPANSION_VXLAN_GPE,
> +	MLX5_EXPANSION_GRE,
> +	MLX5_EXPANSION_MPLS,
> +	MLX5_EXPANSION_ETH,
> +	MLX5_EXPANSION_IPV4,
> +	MLX5_EXPANSION_IPV4_UDP,
> +	MLX5_EXPANSION_IPV4_TCP,
> +	MLX5_EXPANSION_IPV6,
> +	MLX5_EXPANSION_IPV6_UDP,
> +	MLX5_EXPANSION_IPV6_TCP,
> +};
> +
> +/** Supported expansion of items. */
> +static const struct rte_flow_expand_node mlx5_support_expansion[] = {
> +	[MLX5_EXPANSION_ROOT] = {
> +		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_ETH,
> +					      MLX5_EXPANSION_IPV4,
> +					      MLX5_EXPANSION_IPV6),
> +		.type = RTE_FLOW_ITEM_TYPE_END,
> +	},
> +	[MLX5_EXPANSION_ROOT2] = {
> +		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_OUTER_ETH,
> +					      MLX5_EXPANSION_OUTER_IPV4,
> +					      MLX5_EXPANSION_OUTER_IPV6),
> +		.type = RTE_FLOW_ITEM_TYPE_END,
> +	},
> +	[MLX5_EXPANSION_OUTER_ETH] = {
> +		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_OUTER_IPV4,
> +					      MLX5_EXPANSION_OUTER_IPV6),
> +		.type = RTE_FLOW_ITEM_TYPE_ETH,
> +		.rss_types = 0,
> +	},
> +	[MLX5_EXPANSION_OUTER_IPV4] = {
> +		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_OUTER_IPV4_UDP,
> +					      MLX5_EXPANSION_OUTER_IPV4_TCP),
> +		.type = RTE_FLOW_ITEM_TYPE_IPV4,
> +		.rss_types = ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 |
> +			ETH_RSS_NONFRAG_IPV4_OTHER,
> +	},
> +	[MLX5_EXPANSION_OUTER_IPV4_UDP] = {
> +		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_VXLAN),
> +		.type = RTE_FLOW_ITEM_TYPE_UDP,
> +		.rss_types = ETH_RSS_NONFRAG_IPV4_UDP,
> +	},
> +	[MLX5_EXPANSION_OUTER_IPV4_TCP] = {
> +		.type = RTE_FLOW_ITEM_TYPE_TCP,
> +		.rss_types = ETH_RSS_NONFRAG_IPV4_TCP,
> +	},
> +	[MLX5_EXPANSION_OUTER_IPV6] = {
> +		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_OUTER_IPV6_UDP,
> +					      MLX5_EXPANSION_OUTER_IPV6_TCP),
> +		.type = RTE_FLOW_ITEM_TYPE_IPV6,
> +		.rss_types = ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6 |
> +			ETH_RSS_NONFRAG_IPV6_OTHER,
> +	},
> +	[MLX5_EXPANSION_OUTER_IPV6_UDP] = {
> +		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_VXLAN),
> +		.type = RTE_FLOW_ITEM_TYPE_UDP,
> +		.rss_types = ETH_RSS_NONFRAG_IPV6_UDP,
> +	},
> +	[MLX5_EXPANSION_OUTER_IPV6_TCP] = {
> +		.type = RTE_FLOW_ITEM_TYPE_TCP,
> +		.rss_types = ETH_RSS_NONFRAG_IPV6_TCP,
> +	},
> +	[MLX5_EXPANSION_VXLAN] = {
> +		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_ETH),
> +		.type = RTE_FLOW_ITEM_TYPE_VXLAN,
> +	},
> +	[MLX5_EXPANSION_VXLAN_GPE] = {
> +		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_ETH,
> +					      MLX5_EXPANSION_IPV4,
> +					      MLX5_EXPANSION_IPV6),
> +		.type = RTE_FLOW_ITEM_TYPE_VXLAN_GPE,
> +	},
> +	[MLX5_EXPANSION_GRE] = {
> +		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_IPV4),
> +		.type = RTE_FLOW_ITEM_TYPE_GRE,
> +	},
> +	[MLX5_EXPANSION_ETH] = {
> +		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_IPV4,
> +					      MLX5_EXPANSION_IPV6),
> +		.type = RTE_FLOW_ITEM_TYPE_ETH,
> +	},
> +	[MLX5_EXPANSION_IPV4] = {
> +		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_IPV4_UDP,
> +					      MLX5_EXPANSION_IPV4_TCP),
> +		.type = RTE_FLOW_ITEM_TYPE_IPV4,
> +		.rss_types = ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 |
> +			ETH_RSS_NONFRAG_IPV4_OTHER,
> +	},
> +	[MLX5_EXPANSION_IPV4_UDP] = {
> +		.type = RTE_FLOW_ITEM_TYPE_UDP,
> +		.rss_types = ETH_RSS_NONFRAG_IPV4_UDP,
> +	},
> +	[MLX5_EXPANSION_IPV4_TCP] = {
> +		.type = RTE_FLOW_ITEM_TYPE_TCP,
> +		.rss_types = ETH_RSS_NONFRAG_IPV4_TCP,
> +	},
> +	[MLX5_EXPANSION_IPV6] = {
> +		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_IPV6_UDP,
> +					      MLX5_EXPANSION_IPV6_TCP),
> +		.type = RTE_FLOW_ITEM_TYPE_IPV6,
> +		.rss_types = ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6 |
> +			ETH_RSS_NONFRAG_IPV6_OTHER,
> +	},
> +	[MLX5_EXPANSION_IPV6_UDP] = {
> +		.type = RTE_FLOW_ITEM_TYPE_UDP,
> +		.rss_types = ETH_RSS_NONFRAG_IPV6_UDP,
> +	},
> +	[MLX5_EXPANSION_IPV6_TCP] = {
> +		.type = RTE_FLOW_ITEM_TYPE_TCP,
> +		.rss_types = ETH_RSS_NONFRAG_IPV6_TCP,
> +	},
> +};
> +
>  /** Handles information leading to a drop fate. */
>  struct mlx5_flow_verbs {
> +	LIST_ENTRY(mlx5_flow_verbs) next;
> +	uint32_t layers;
> +	/**< Bit-fields of expanded layers see MLX5_FLOW_ITEMS_*. */
> +	uint32_t modifier;
> +	/**< Bit-fields of expanded modifier see MLX5_FLOW_MOD_*. */
>  	unsigned int size; /**< Size of the attribute. */
>  	struct {
>  		struct ibv_flow_attr *attr;
> @@ -66,20 +201,26 @@ struct mlx5_flow_verbs {
>  	};
>  	struct ibv_flow *flow; /**< Verbs flow pointer. */
>  	struct mlx5_hrxq *hrxq; /**< Hash Rx queue object. */
> +	uint64_t hash_fields; /**< Verbs hash Rx queue hash fields. */
>  };
>  
>  /* Flow structure. */
>  struct rte_flow {
>  	TAILQ_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */
>  	struct rte_flow_attr attributes; /**< User flow attribute. */
> +	uint32_t expand:1; /**< Flow is expanded due to RSS configuration. */

Suggest 'expanded'.

>  	uint32_t layers;
>  	/**< Bit-fields of present layers see MLX5_FLOW_ITEMS_*. */
>  	uint32_t modifier;
>  	/**< Bit-fields of present modifier see MLX5_FLOW_MOD_*. */
>  	uint32_t fate;
>  	/**< Bit-fields of present fate see MLX5_FLOW_FATE_*. */
> -	struct mlx5_flow_verbs verbs; /* Verbs flow. */
> -	uint16_t queue; /**< Destination queue to redirect traffic to. */
> +	LIST_HEAD(verbs, mlx5_flow_verbs) verbs; /**< Verbs flows list. */
> +	struct mlx5_flow_verbs *cur_verbs;
> +	/**< Current Verbs flow structure being filled. */
> +	struct rte_flow_action_rss rss;/**< RSS context. */
> +	uint8_t key[40]; /**< RSS hash key. */

Let's define a macro for '40'.
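
For instance, something along these lines (a sketch; the macro name is only a
placeholder, the value comes from the existing rss_hash_default_key_len):

/* Placeholder name for the RSS hash key size used by this PMD. */
#define MLX5_RSS_HASH_KEY_LEN 40

	uint8_t key[MLX5_RSS_HASH_KEY_LEN]; /**< RSS hash key. */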

> +	uint16_t (*queue)[]; /**< Destination queues to redirect traffic to. */
>  };
>  
>  static const struct rte_flow_ops mlx5_flow_ops = {
> @@ -122,16 +263,27 @@ struct ibv_spec_header {
>  	uint16_t size;
>  };
>  
> - /**
> -  * Get the maximum number of priority available.
> -  *
> -  * @param dev
> -  *   Pointer to Ethernet device.
> -  *
> -  * @return
> -  *   number of supported flow priority on success, a negative errno value
> -  *   otherwise and rte_errno is set.
> -  */
> +/* Map of Verbs to Flow priority with 8 Verbs priorities. */
> +static const uint32_t priority_map_3[][3] = {
> +	{ 0, 1, 2 }, { 2, 3, 4 }, { 5, 6, 7 },
> +};
> +
> +/* Map of Verbs to Flow priority with 16 Verbs priorities. */
> +static const uint32_t priority_map_5[][3] = {
> +	{ 0, 1, 2 }, { 3, 4, 5 }, { 6, 7, 8 },
> +	{ 9, 10, 11 }, { 12, 13, 14 },
> +};

How about 

enum mlx5_sub_priority {
	MLX5_SUB_PRIORITY_0 = 0,
	MLX5_SUB_PRIORITY_1,
	MLX5_SUB_PRIORITY_2,
	MLX5_SUB_PRIORITY_MAX,
};

static const uint32_t priority_map_3[][MLX5_SUB_PRIORITY_MAX] = {
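
Filled in with the values already present in the patch, the two maps would
then read (a sketch; MLX5_SUB_PRIORITY_MAX is the name proposed above):

static const uint32_t priority_map_3[][MLX5_SUB_PRIORITY_MAX] = {
	{ 0, 1, 2 }, { 2, 3, 4 }, { 5, 6, 7 },
};

static const uint32_t priority_map_5[][MLX5_SUB_PRIORITY_MAX] = {
	{ 0, 1, 2 }, { 3, 4, 5 }, { 6, 7, 8 },
	{ 9, 10, 11 }, { 12, 13, 14 },
};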

> +
> +/**
> + * Get the maximum number of priority available.
> + *
> + * @param dev
> + *   Pointer to Ethernet device.
> + *
> + * @return
> + *   number of supported flow priority on success, a negative errno
> + *   value otherwise and rte_errno is set.
> + */
>  int
>  mlx5_flow_priorities(struct rte_eth_dev *dev)

mlx5_flow_priorities() vs mlx5_flow_priority(): similar names but different
functionality. Better to rename them, e.g. mlx5_flow_get_max_priority() and
mlx5_flow_adjust_priority().
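
In other words (a sketch; the names are only the suggestion above, the
signatures are taken from the patch):

int mlx5_flow_get_max_priority(struct rte_eth_dev *dev);

static int
mlx5_flow_adjust_priority(struct rte_eth_dev *dev, uint32_t priority,
			  uint32_t subpriority);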

>  {
> @@ -156,6 +308,7 @@ mlx5_flow_priorities(struct rte_eth_dev *dev)
>  	struct mlx5_hrxq *drop = mlx5_hrxq_drop_new(dev);
>  	uint16_t vprio[] = { 8, 16 };
>  	int i;
> +	int priority = 0;
>  
>  	if (!drop) {
>  		rte_errno = ENOTSUP;
> @@ -167,11 +320,54 @@ mlx5_flow_priorities(struct rte_eth_dev *dev)
>  		if (!flow)
>  			break;
>  		claim_zero(mlx5_glue->destroy_flow(flow));
> +		priority = vprio[i];
> +	}
> +	switch (priority) {
> +	case 8:
> +		priority = 3;

How about,
	priority = RTE_DIM(priority_map_3);

> +		break;
> +	case 16:
> +		priority = 5;

	priority = RTE_DIM(priority_map_5);

> +		break;
> +	default:
> +		rte_errno = ENOTSUP;
> +		DRV_LOG(ERR,
> +			"port %u verbs maximum priority: %d expected 8/16",
> +			dev->data->port_id, vprio[i]);
> +		return -rte_errno;
>  	}
>  	mlx5_hrxq_drop_release(dev, drop);
>  	DRV_LOG(INFO, "port %u flow maximum priority: %d",
> -		dev->data->port_id, vprio[i]);
> -	return vprio[i];
> +		dev->data->port_id, priority);
> +	return priority;
> +}
> +
> +/**
> + * Adjust flow priority.
> + *
> + * @param dev
> + *   Pointer to Ethernet device.
> + * @param flow
> + *   Pointer to an rte flow.
> + *
> + * @return
> + *   The priority adjusted.
> + */
> +static int
> +mlx5_flow_priority(struct rte_eth_dev *dev, uint32_t priority,
> +		   uint32_t subpriority)
> +{
> +	struct priv *priv = dev->data->dev_private;
> +
> +	switch (priv->config.flow_prio) {
> +	case 3:

	case RTE_DIM(priority_map_3):

> +		priority = priority_map_3[priority][subpriority];
> +		break;
> +	case 5:

	case RTE_DIM(priority_map_5):

> +		priority = priority_map_5[priority][subpriority];
> +		break;
> +	}
> +	return priority;
>  }
>  
>  /**
> @@ -185,6 +381,8 @@ void
>  mlx5_flow_print(struct rte_flow *flow __rte_unused)
>  {
>  #ifndef NDEBUG
> +	struct mlx5_flow_verbs *verbs = LIST_FIRST(&flow->verbs);
> +
>  	fprintf(stdout, "---------8<------------\n");
>  	fprintf(stdout, "%s: flow information\n", MLX5_DRIVER_NAME);
>  	fprintf(stdout, " attributes: group %u priority %u ingress %d egress %d"
> @@ -193,26 +391,36 @@ mlx5_flow_print(struct rte_flow *flow __rte_unused)
>  		flow->attributes.ingress,
>  		flow->attributes.egress,
>  		flow->attributes.transfer);
> -	fprintf(stdout, " layers: %s/%s/%s\n",
> -		flow->layers & MLX5_FLOW_LAYER_OUTER_L2 ? "l2" : "-",
> -		flow->layers & MLX5_FLOW_LAYER_OUTER_L3 ? "l3" : "-",
> -		flow->layers & MLX5_FLOW_LAYER_OUTER_L4 ? "l4" : "-");
> -	if (flow->fate & MLX5_FLOW_FATE_DROP)
> +	if (flow->fate & MLX5_FLOW_FATE_DROP) {
>  		fprintf(stdout, " fate: drop queue\n");
> -	else if (flow->fate & MLX5_FLOW_FATE_QUEUE)
> -		fprintf(stdout, " fate: target queue %u\n", flow->queue);
> -	if (flow->verbs.attr) {
> -		struct ibv_spec_header *hdr =
> -			(struct ibv_spec_header *)flow->verbs.specs;
> -		const int n = flow->verbs.attr->num_of_specs;
> -		int i;
> -
> -		fprintf(stdout, " Verbs attributes: specs_n %u\n",
> -			flow->verbs.attr->num_of_specs);
> -		for (i = 0; i != n; ++i) {
> -			rte_hexdump(stdout, " ", hdr, hdr->size);
> -			hdr = (struct ibv_spec_header *)
> -				((uint8_t *)hdr + hdr->size);
> +	} else {
> +		uint16_t i;
> +
> +		fprintf(stdout, " fate: target queues");
> +		for (i = 0; i != flow->rss.queue_num; ++i)
> +			fprintf(stdout, " %u", (*flow->queue)[i]);
> +		fprintf(stdout, "\n");
> +	}
> +	LIST_FOREACH(verbs, &flow->verbs, next) {
> +		uint32_t layers = flow->layers | verbs->layers;
> +
> +		fprintf(stdout, " layers: %s/%s/%s\n",
> +			layers & MLX5_FLOW_LAYER_OUTER_L2 ? "l2" : "-",
> +			layers & MLX5_FLOW_LAYER_OUTER_L3 ? "l3" : "-",
> +			layers & MLX5_FLOW_LAYER_OUTER_L4 ? "l4" : "-");
> +		if (verbs->attr) {
> +			struct ibv_spec_header *hdr =
> +				(struct ibv_spec_header *)verbs->specs;
> +			const int n = verbs->attr->num_of_specs;
> +			int i;
> +
> +			fprintf(stdout, " Verbs attributes: specs_n %u\n",
> +				verbs->attr->num_of_specs);
> +			for (i = 0; i != n; ++i) {
> +				rte_hexdump(stdout, " ", hdr, hdr->size);
> +				hdr = (struct ibv_spec_header *)
> +					((uint8_t *)hdr + hdr->size);
> +			}
>  		}
>  	}
>  	fprintf(stdout, "--------->8------------\n");
> @@ -239,18 +447,20 @@ mlx5_flow_attributes(struct rte_eth_dev *dev, const struct rte_flow_attr *attr,
>  		     struct rte_flow *flow, struct rte_flow_error *error)
>  {
>  	uint32_t priority_max =
> -		((struct priv *)dev->data->dev_private)->config.flow_prio;
> +		((struct priv *)dev->data->dev_private)->config.flow_prio - 1;
>  
>  	if (attr->group)
>  		return rte_flow_error_set(error, ENOTSUP,
>  					  RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
>  					  NULL,
>  					  "groups are not supported");
> -	if (attr->priority >= priority_max)
> +	if (attr->priority != MLX5_FLOW_PRIO_RSVD &&
> +	    attr->priority >= priority_max)
>  		return rte_flow_error_set(error, ENOTSUP,
>  					  RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
>  					  NULL,
> -					  "priority value is not supported");
> +					  "requested priority value is not"
> +					  " supported");
>  	if (attr->egress)
>  		return rte_flow_error_set(error, ENOTSUP,
>  					  RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
> @@ -267,6 +477,8 @@ mlx5_flow_attributes(struct rte_eth_dev *dev, const struct rte_flow_attr *attr,
>  					  NULL,
>  					  "only ingress is supported");
>  	flow->attributes = *attr;
> +	if (attr->priority == MLX5_FLOW_PRIO_RSVD)
> +		flow->attributes.priority = priority_max;
>  	return 0;
>  }
>  
> @@ -346,14 +558,51 @@ mlx5_flow_item_validate(const struct rte_flow_item *item,
>  static void
>  mlx5_flow_spec_verbs_add(struct rte_flow *flow, void *src, unsigned int size)
>  {
> -	if (flow->verbs.specs) {
> +	struct mlx5_flow_verbs *verbs = flow->cur_verbs;
> +
> +	if (verbs->specs) {
>  		void *dst;
>  
> -		dst = (void *)(flow->verbs.specs + flow->verbs.size);
> +		dst = (void *)(verbs->specs + verbs->size);
>  		memcpy(dst, src, size);
> -		++flow->verbs.attr->num_of_specs;
> +		++verbs->attr->num_of_specs;
>  	}
> -	flow->verbs.size += size;
> +	verbs->size += size;
> +}
> +
> +/**
> + * Update layer bit-field.
> + *
> + * @param flow[in, out]
> + *   Pointer to flow structure.
> + * @param layers
> + *   Bit-fields of layers to add see MLX5_FLOW_ITEMS_*.

Where is MLX5_FLOW_ITEMS_*? Isn't it MLX5_FLOW_LAYER_*?
There are several occurrences.

> + */
> +static void
> +mlx5_flow_layers_update(struct rte_flow *flow, uint32_t layers)
> +{
> +	if (flow->expand) {
> +		if (flow->cur_verbs)
> +			flow->cur_verbs->layers |= layers;

If flow->cur_verbs is null, does that mean it is a testing call? Then, is it
unnecessary to update layers for the testing call? Confusing..

> +	} else {
> +		flow->layers |= layers;
> +	}
> +}
> +
> +/**
> + * Get layers bit-field.
> + *
> + * @param flow[in, out]
> + *   Pointer to flow structure.
> + */
> +static uint32_t
> +mlx5_flow_layers(struct rte_flow *flow)
> +{
> +	uint32_t layers = flow->layers;
> +
> +	if (flow->expand && flow->cur_verbs)

If flow is expanded and it is a testing call, then flow->layers is used?

> +		layers |= flow->cur_verbs->layers;
> +	return layers;

This part is so unclear to me, hard to understand. There are two 'layers'
fields, one in rte_flow and the other in mlx5_flow_verbs. It seems
rte_flow->layers is used only when the flow isn't expanded. If a flow is
expanded, flow->expand is set after processing the first entry in the expanded
list. In mlx5_flow_merge(),

	for (i = 0; i != buf->entries; ++i) {

		...

		flow->expand = !!(buf->entries > 1);
	}

Why is flow->expand set at the end of the loop? Is this in order to avoid
validation for the expanded flows? mlx5_flow_item_xxx() executes validation only
if flow->expand is zero, why?

And why does mlx5_flow_layers() have to return (flow->layers |
flow->cur_verbs->layers) if expanded?

If there are 3 entries in the rte_flow_expand_rss,
	eth
	eth / ipv4 / udp
	eth / ipv6 / udp

Then, the 2nd and 3rd don't have MLX5_FLOW_LAYER_OUTER_L2 in the layers field?
Please explain in detail and add comments appropriately.

>  }
>  
>  /**
> @@ -388,22 +637,26 @@ mlx5_flow_item_eth(const struct rte_flow_item *item, struct rte_flow *flow,
>  		.type = IBV_FLOW_SPEC_ETH,
>  		.size = size,
>  	};
> +	const uint32_t layers = mlx5_flow_layers(flow);
>  	int ret;
>  
> -	if (flow->layers & MLX5_FLOW_LAYER_OUTER_L2)
> -		return rte_flow_error_set(error, ENOTSUP,
> -					  RTE_FLOW_ERROR_TYPE_ITEM,
> -					  item,
> -					  "L2 layers already configured");
> -	if (!mask)
> -		mask = &rte_flow_item_eth_mask;
> -	ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
> -				      (const uint8_t *)&nic_mask,
> -				      sizeof(struct rte_flow_item_eth),
> -				      error);
> -	if (ret)
> -		return ret;
> -	flow->layers |= MLX5_FLOW_LAYER_OUTER_L2;
> +	if (!flow->expand) {
> +		if (layers & MLX5_FLOW_LAYER_OUTER_L2)
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ITEM,
> +						  item,
> +						  "L2 layers already"
> +						  " configured");
> +		if (!mask)
> +			mask = &rte_flow_item_eth_mask;
> +		ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
> +					      (const uint8_t *)&nic_mask,
> +					      sizeof(struct rte_flow_item_eth),
> +					      error);
> +		if (ret)
> +			return ret;
> +	}
> +	mlx5_flow_layers_update(flow, MLX5_FLOW_LAYER_OUTER_L2);
>  	if (size > flow_size)
>  		return size;
>  	if (spec) {
> @@ -482,6 +735,7 @@ mlx5_flow_item_vlan(const struct rte_flow_item *item, struct rte_flow *flow,
>  		.tci = RTE_BE16(0x0fff),
>  	};
>  	unsigned int size = sizeof(struct ibv_flow_spec_eth);
> +	struct mlx5_flow_verbs *verbs = flow->cur_verbs;
>  	struct ibv_flow_spec_eth eth = {
>  		.type = IBV_FLOW_SPEC_ETH,
>  		.size = size,
> @@ -491,24 +745,30 @@ mlx5_flow_item_vlan(const struct rte_flow_item *item, struct rte_flow *flow,
>  			MLX5_FLOW_LAYER_OUTER_L4;
>  	const uint32_t vlanm = MLX5_FLOW_LAYER_OUTER_VLAN;
>  	const uint32_t l2m = MLX5_FLOW_LAYER_OUTER_L2;
> +	const uint32_t layers = mlx5_flow_layers(flow);
>  
> -	if (flow->layers & vlanm)
> -		return rte_flow_error_set(error, ENOTSUP,
> -					  RTE_FLOW_ERROR_TYPE_ITEM,
> -					  item,
> -					  "L2 layers already configured");
> -	else if ((flow->layers & lm) != 0)
> -		return rte_flow_error_set(error, ENOTSUP,
> -					  RTE_FLOW_ERROR_TYPE_ITEM,
> -					  item,
> -					  "L2 layer cannot follow L3/L4 layer");
> -	if (!mask)
> -		mask = &rte_flow_item_vlan_mask;
> -	ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
> -				      (const uint8_t *)&nic_mask,
> -				      sizeof(struct rte_flow_item_vlan), error);
> -	if (ret)
> -		return ret;
> +	if (!flow->expand) {
> +		if (layers & vlanm)
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ITEM,
> +						  item,
> +						  "L2 layers already"
> +						  " configured");
> +		else if ((layers & lm) != 0)
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ITEM,
> +						  item,
> +						  "L2 layer cannot follow"
> +						  " L3/L4 layer");
> +		if (!mask)
> +			mask = &rte_flow_item_vlan_mask;
> +		ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
> +					      (const uint8_t *)&nic_mask,
> +					      sizeof(struct rte_flow_item_vlan),
> +					      error);
> +		if (ret)
> +			return ret;
> +	}
>  	if (spec) {
>  		eth.val.vlan_tag = spec->tci;
>  		eth.mask.vlan_tag = mask->tci;
> @@ -517,32 +777,34 @@ mlx5_flow_item_vlan(const struct rte_flow_item *item, struct rte_flow *flow,
>  		eth.mask.ether_type = mask->inner_type;
>  		eth.val.ether_type &= eth.mask.ether_type;
>  	}
> -	/*
> -	 * From verbs perspective an empty VLAN is equivalent
> -	 * to a packet without VLAN layer.
> -	 */
> -	if (!eth.mask.vlan_tag)
> -		return rte_flow_error_set(error, EINVAL,
> -					  RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
> -					  item->spec,
> -					  "VLAN cannot be empty");
> -	/* Outer TPID cannot be matched. */
> -	if (eth.mask.ether_type)
> -		return rte_flow_error_set(error, ENOTSUP,
> -					  RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
> -					  item->spec,
> -					  "VLAN TPID matching is not"
> -					  " supported");
> -	if (!(flow->layers & l2m)) {
> +	if (!flow->expand) {
> +		/*
> +		 * From verbs perspective an empty VLAN is equivalent
> +		 * to a packet without VLAN layer.
> +		 */
> +		if (!eth.mask.vlan_tag)
> +			return rte_flow_error_set(error, EINVAL,
> +						  RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
> +						  item->spec,
> +						  "VLAN cannot be empty");
> +		/* Outer TPID cannot be matched. */
> +		if (eth.mask.ether_type)
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
> +						  item->spec,
> +						  "VLAN TPID matching is not"
> +						  " supported");
> +	}
> +	if (!(layers & l2m)) {
>  		if (size <= flow_size)
>  			mlx5_flow_spec_verbs_add(flow, &eth, size);
>  	} else {
> -		if (flow->verbs.attr)
> -			mlx5_flow_item_vlan_update(flow->verbs.attr, &eth);
> +		if (verbs->attr)
> +			mlx5_flow_item_vlan_update(verbs->attr, &eth);
>  		size = 0; /**< Only an update is done in eth specification. */
>  	}
> -	flow->layers |= MLX5_FLOW_LAYER_OUTER_L2 |
> -		MLX5_FLOW_LAYER_OUTER_VLAN;
> +	mlx5_flow_layers_update(flow, MLX5_FLOW_LAYER_OUTER_L2 |
> +				MLX5_FLOW_LAYER_OUTER_VLAN);
>  	return size;
>  }
>  
> @@ -582,25 +844,31 @@ mlx5_flow_item_ipv4(const struct rte_flow_item *item, struct rte_flow *flow,
>  		.size = size,
>  	};
>  	int ret;
> +	const uint32_t layers = mlx5_flow_layers(flow);
>  
> -	if (flow->layers & MLX5_FLOW_LAYER_OUTER_L3)
> -		return rte_flow_error_set(error, ENOTSUP,
> -					  RTE_FLOW_ERROR_TYPE_ITEM,
> -					  item,
> -					  "multiple L3 layers not supported");
> -	else if (flow->layers & MLX5_FLOW_LAYER_OUTER_L4)
> -		return rte_flow_error_set(error, ENOTSUP,
> -					  RTE_FLOW_ERROR_TYPE_ITEM,
> -					  item,
> -					  "L3 cannot follow an L4 layer.");
> -	if (!mask)
> -		mask = &rte_flow_item_ipv4_mask;
> -	ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
> -				      (const uint8_t *)&nic_mask,
> -				      sizeof(struct rte_flow_item_ipv4), error);
> -	if (ret < 0)
> -		return ret;
> -	flow->layers |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
> +	if (!flow->expand) {
> +		if (layers & MLX5_FLOW_LAYER_OUTER_L3)
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ITEM,
> +						  item,
> +						  "multiple L3 layers not"
> +						  " supported");
> +		else if (layers & MLX5_FLOW_LAYER_OUTER_L4)
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ITEM,
> +						  item,
> +						  "L3 cannot follow an L4"
> +						  " layer");
> +		if (!mask)
> +			mask = &rte_flow_item_ipv4_mask;
> +		ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
> +					      (const uint8_t *)&nic_mask,
> +					      sizeof(struct rte_flow_item_ipv4),
> +					      error);
> +		if (ret < 0)
> +			return ret;
> +	}
> +	mlx5_flow_layers_update(flow, MLX5_FLOW_LAYER_OUTER_L3_IPV4);
>  	if (size > flow_size)
>  		return size;
>  	if (spec) {
> @@ -667,25 +935,31 @@ mlx5_flow_item_ipv6(const struct rte_flow_item *item, struct rte_flow *flow,
>  		.size = size,
>  	};
>  	int ret;
> +	const uint32_t layers = mlx5_flow_layers(flow);
>  
> -	if (flow->layers & MLX5_FLOW_LAYER_OUTER_L3)
> -		return rte_flow_error_set(error, ENOTSUP,
> -					  RTE_FLOW_ERROR_TYPE_ITEM,
> -					  item,
> -					  "multiple L3 layers not supported");
> -	else if (flow->layers & MLX5_FLOW_LAYER_OUTER_L4)
> -		return rte_flow_error_set(error, ENOTSUP,
> -					  RTE_FLOW_ERROR_TYPE_ITEM,
> -					  item,
> -					  "L3 cannot follow an L4 layer.");
> -	if (!mask)
> -		mask = &rte_flow_item_ipv6_mask;
> -	ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
> -				      (const uint8_t *)&nic_mask,
> -				      sizeof(struct rte_flow_item_ipv6), error);
> -	if (ret < 0)
> -		return ret;
> -	flow->layers |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
> +	if (!flow->expand) {
> +		if (layers & MLX5_FLOW_LAYER_OUTER_L3)
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ITEM,
> +						  item,
> +						  "multiple L3 layers not"
> +						  " supported");
> +		else if (layers & MLX5_FLOW_LAYER_OUTER_L4)
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ITEM,
> +						  item,
> +						  "L3 cannot follow an L4"
> +						  " layer");
> +		if (!mask)
> +			mask = &rte_flow_item_ipv6_mask;
> +		ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
> +					      (const uint8_t *)&nic_mask,
> +					      sizeof(struct rte_flow_item_ipv6),
> +					      error);
> +		if (ret < 0)
> +			return ret;
> +	}
> +	mlx5_flow_layers_update(flow, MLX5_FLOW_LAYER_OUTER_L3_IPV6);
>  	if (size > flow_size)
>  		return size;
>  	if (spec) {
> @@ -759,25 +1033,31 @@ mlx5_flow_item_udp(const struct rte_flow_item *item, struct rte_flow *flow,
>  		.size = size,
>  	};
>  	int ret;
> +	const uint32_t layers = mlx5_flow_layers(flow);
>  
> -	if (!(flow->layers & MLX5_FLOW_LAYER_OUTER_L3))
> -		return rte_flow_error_set(error, ENOTSUP,
> -					  RTE_FLOW_ERROR_TYPE_ITEM,
> -					  item,
> -					  "L3 is mandatory to filter on L4");
> -	if (flow->layers & MLX5_FLOW_LAYER_OUTER_L4)
> -		return rte_flow_error_set(error, ENOTSUP,
> -					  RTE_FLOW_ERROR_TYPE_ITEM,
> -					  item,
> -					  "L4 layer is already present");
> -	if (!mask)
> -		mask = &rte_flow_item_udp_mask;
> -	ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
> -				      (const uint8_t *)&rte_flow_item_udp_mask,
> -				      sizeof(struct rte_flow_item_udp), error);
> -	if (ret < 0)
> -		return ret;
> -	flow->layers |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
> +	if (!flow->expand) {
> +		if (!(layers & MLX5_FLOW_LAYER_OUTER_L3))
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ITEM,
> +						  item,
> +						  "L3 is mandatory to filter"
> +						  " on L4");
> +		if (layers & MLX5_FLOW_LAYER_OUTER_L4)
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ITEM,
> +						  item,
> +						  "L4 layer is already"
> +						  " present");
> +		if (!mask)
> +			mask = &rte_flow_item_udp_mask;
> +		ret = mlx5_flow_item_validate
> +			(item, (const uint8_t *)mask,
> +			 (const uint8_t *)&rte_flow_item_udp_mask,
> +			 sizeof(struct rte_flow_item_udp), error);
> +		if (ret < 0)
> +			return ret;
> +	}
> +	mlx5_flow_layers_update(flow, MLX5_FLOW_LAYER_OUTER_L4_UDP);
>  	if (size > flow_size)
>  		return size;
>  	if (spec) {
> @@ -821,25 +1101,31 @@ mlx5_flow_item_tcp(const struct rte_flow_item *item, struct rte_flow *flow,
>  		.size = size,
>  	};
>  	int ret;
> +	const uint32_t layers = mlx5_flow_layers(flow);
>  
> -	if (!(flow->layers & MLX5_FLOW_LAYER_OUTER_L3))
> -		return rte_flow_error_set(error, ENOTSUP,
> -					  RTE_FLOW_ERROR_TYPE_ITEM,
> -					  item,
> -					  "L3 is mandatory to filter on L4");
> -	if (flow->layers & MLX5_FLOW_LAYER_OUTER_L4)
> -		return rte_flow_error_set(error, ENOTSUP,
> -					  RTE_FLOW_ERROR_TYPE_ITEM,
> -					  item,
> -					  "L4 layer is already present");
> -	if (!mask)
> -		mask = &rte_flow_item_tcp_mask;
> -	ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
> -				      (const uint8_t *)&rte_flow_item_tcp_mask,
> -				      sizeof(struct rte_flow_item_tcp), error);
> -	if (ret < 0)
> -		return ret;
> -	flow->layers |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
> +	if (!flow->expand) {
> +		if (!(layers & MLX5_FLOW_LAYER_OUTER_L3))
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ITEM,
> +						  item,
> +						  "L3 is mandatory to filter"
> +						  " on L4");
> +		if (layers & MLX5_FLOW_LAYER_OUTER_L4)
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ITEM,
> +						  item,
> +						  "L4 layer is already"
> +						  " present");
> +		if (!mask)
> +			mask = &rte_flow_item_tcp_mask;
> +		ret = mlx5_flow_item_validate
> +			(item, (const uint8_t *)mask,
> +			 (const uint8_t *)&rte_flow_item_tcp_mask,
> +			 sizeof(struct rte_flow_item_tcp), error);
> +		if (ret < 0)
> +			return ret;
> +	}
> +	mlx5_flow_layers_update(flow, MLX5_FLOW_LAYER_OUTER_L4_TCP);
>  	if (size > flow_size)
>  		return size;
>  	if (spec) {
> @@ -954,18 +1240,20 @@ mlx5_flow_action_drop(const struct rte_flow_action *actions,
>  			.size = size,
>  	};
>  
> -	if (flow->fate)
> -		return rte_flow_error_set(error, ENOTSUP,
> -					  RTE_FLOW_ERROR_TYPE_ACTION,
> -					  actions,
> -					  "multiple fate actions are not"
> -					  " supported");
> -	if (flow->modifier & (MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK))
> -		return rte_flow_error_set(error, ENOTSUP,
> -					  RTE_FLOW_ERROR_TYPE_ACTION,
> -					  actions,
> -					  "drop is not compatible with"
> -					  " flag/mark action");
> +	if (!flow->expand) {
> +		if (flow->fate)
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ACTION,
> +						  actions,
> +						  "multiple fate actions are"
> +						  " not supported");
> +		if (flow->modifier & (MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK))
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ACTION,
> +						  actions,
> +						  "drop is not compatible with"
> +						  " flag/mark action");
> +	}
>  	if (size < flow_size)
>  		mlx5_flow_spec_verbs_add(flow, &drop, size);
>  	flow->fate |= MLX5_FLOW_FATE_DROP;
> @@ -998,6 +1286,8 @@ mlx5_flow_action_queue(struct rte_eth_dev *dev,
>  	struct priv *priv = dev->data->dev_private;
>  	const struct rte_flow_action_queue *queue = actions->conf;
>  
> +	if (flow->expand)
> +		return 0;
>  	if (flow->fate)
>  		return rte_flow_error_set(error, ENOTSUP,
>  					  RTE_FLOW_ERROR_TYPE_ACTION,
> @@ -1014,11 +1304,162 @@ mlx5_flow_action_queue(struct rte_eth_dev *dev,
>  					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
>  					  &queue->index,
>  					  "queue is not configured");
> -	flow->queue = queue->index;
> +	if (flow->queue)
> +		(*flow->queue)[0] = queue->index;
> +	flow->rss.queue_num = 1;
>  	flow->fate |= MLX5_FLOW_FATE_QUEUE;
>  	return 0;
>  }
>  
> +/**
> + * Store the Verbs hash fields and priority according to the layer and types.
> + *
> + * @param dev
> + *   Pointer to Ethernet device.
> + * @param flow
> + *   Pointer to flow structure.
> + * @param types
> + *   RSS types for this flow (see ETH_RSS_*).
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +mlx5_flow_action_rss_verbs_attr(struct rte_eth_dev *dev, struct rte_flow *flow,
> +				uint32_t types)
> +{
> +	const uint32_t layers = mlx5_flow_layers(flow);
> +	uint64_t hash_fields;
> +	uint32_t priority;
> +
> +	if ((types & ETH_RSS_NONFRAG_IPV4_TCP) &&
> +	    (layers & MLX5_FLOW_LAYER_OUTER_L4_TCP)) {
> +		hash_fields = IBV_RX_HASH_SRC_IPV4 |
> +			IBV_RX_HASH_DST_IPV4 |
> +			IBV_RX_HASH_SRC_PORT_TCP |
> +			IBV_RX_HASH_DST_PORT_TCP;
> +		priority = 0;
> +	} else if ((types & ETH_RSS_NONFRAG_IPV4_UDP) &&
> +		 (layers & MLX5_FLOW_LAYER_OUTER_L4_UDP)) {
> +		hash_fields = IBV_RX_HASH_SRC_IPV4 |
> +			IBV_RX_HASH_DST_IPV4 |
> +			IBV_RX_HASH_SRC_PORT_UDP |
> +			IBV_RX_HASH_DST_PORT_UDP;
> +		priority = 0;
> +	} else if ((types & (ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4)) &&
> +		 (layers & MLX5_FLOW_LAYER_OUTER_L3_IPV4)) {
> +		hash_fields = IBV_RX_HASH_SRC_IPV4 |
> +			IBV_RX_HASH_DST_IPV4;
> +		priority = 1;
> +	} else if ((types & ETH_RSS_NONFRAG_IPV6_TCP) &&
> +		 (layers & MLX5_FLOW_LAYER_OUTER_L4_TCP)) {
> +		hash_fields = IBV_RX_HASH_SRC_IPV6 |
> +			IBV_RX_HASH_DST_IPV6 |
> +			IBV_RX_HASH_SRC_PORT_TCP |
> +			IBV_RX_HASH_DST_PORT_TCP;
> +		priority = 0;
> +	} else if ((types & ETH_RSS_NONFRAG_IPV6_UDP) &&
> +		 (layers & MLX5_FLOW_LAYER_OUTER_L3_IPV6)) {
> +		hash_fields = IBV_RX_HASH_SRC_IPV6 |
> +			IBV_RX_HASH_DST_IPV6 |
> +			IBV_RX_HASH_SRC_PORT_UDP |
> +			IBV_RX_HASH_DST_PORT_UDP;
> +		priority = 0;
> +	} else if ((types & (ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6)) &&
> +		 (layers & MLX5_FLOW_LAYER_OUTER_L3_IPV6)) {
> +		hash_fields = IBV_RX_HASH_SRC_IPV6 |
> +			IBV_RX_HASH_DST_IPV6;
> +		priority = 1;
> +	} else {
> +		hash_fields = 0;
> +		priority = 2;

How about 
		delta = MLX5_SUB_PRIORITY_2;

> +	}
> +	flow->cur_verbs->hash_fields = hash_fields;
> +	flow->cur_verbs->attr->priority =
> +		mlx5_flow_priority(dev, flow->attributes.priority, priority);
> +	return 0;
> +}
> +
> +/**
> + * Validate action queue provided by the user.
> + *
> + * @param dev
> + *   Pointer to Ethernet device structure.
> + * @param actions
> + *   Pointer to flow actions array.
> + * @param flow
> + *   Pointer to the rte_flow structure.
> + * @param error
> + *   Pointer to error structure.

Missing return value.
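
For example, appending the return documentation already used by the other
helpers in this file would do (a sketch of the missing doxygen lines):

 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.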

> + */
> +static int
> +mlx5_flow_action_rss(struct rte_eth_dev *dev,
> +		     const struct rte_flow_action *actions,
> +		     struct rte_flow *flow,
> +		     struct rte_flow_error *error)
> +{
> +	struct priv *priv = dev->data->dev_private;
> +	const struct rte_flow_action_rss *rss = actions->conf;
> +	unsigned int i;
> +
> +	if (flow->expand)
> +		return 0;
> +	if (flow->fate)
> +		return rte_flow_error_set(error, ENOTSUP,
> +					  RTE_FLOW_ERROR_TYPE_ACTION,
> +					  actions,
> +					  "multiple fate actions are not"
> +					  " supported");
> +	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT &&
> +	    rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ)
> +		return rte_flow_error_set(error, ENOTSUP,
> +					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
> +					  &rss->func,
> +					  "RSS hash function not supported");
> +	if (rss->level > 1)
> +		return rte_flow_error_set(error, ENOTSUP,
> +					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
> +					  &rss->level,
> +					  "tunnel RSS is not supported");
> +	if (rss->key_len < rss_hash_default_key_len)
> +		return rte_flow_error_set(error, ENOTSUP,
> +					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
> +					  &rss->key_len,
> +					  "RSS hash key too small");
> +	if (rss->key_len > rss_hash_default_key_len)
> +		return rte_flow_error_set(error, ENOTSUP,
> +					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
> +					  &rss->key_len,
> +					  "RSS hash key too large");
> +	if (rss->queue_num > priv->config.ind_table_max_size)
> +		return rte_flow_error_set(error, ENOTSUP,
> +					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
> +					  &rss->queue_num,
> +					  "number of queues too large");
> +	if (rss->types & MLX5_RSS_HF_MASK)
> +		return rte_flow_error_set(error, ENOTSUP,
> +					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
> +					  &rss->types,
> +					  "some RSS protocols are not"
> +					  " supported");
> +	for (i = 0; i != rss->queue_num; ++i) {
> +		if (!(*priv->rxqs)[rss->queue[i]])
> +			return rte_flow_error_set
> +				(error, EINVAL,
> +				 RTE_FLOW_ERROR_TYPE_ACTION_CONF,
> +				 &rss->queue[i],
> +				 "queue is not configured");
> +	}
> +	if (flow->queue)
> +		memcpy((*flow->queue), rss->queue,
> +		       rss->queue_num * sizeof(uint16_t));
> +	flow->rss.queue_num = rss->queue_num;
> +	memcpy(flow->key, rss->key, rss_hash_default_key_len);
> +	flow->rss.types = rss->types;
> +	flow->fate |= MLX5_FLOW_FATE_RSS;
> +	return 0;
> +}
> +
>  /**
>   * Validate action flag provided by the user.
>   *
> @@ -1046,43 +1487,59 @@ mlx5_flow_action_flag(const struct rte_flow_action *actions,
>  		.size = size,
>  		.tag_id = mlx5_flow_mark_set(MLX5_FLOW_MARK_DEFAULT),
>  	};
> +	struct mlx5_flow_verbs *verbs = flow->cur_verbs;
>  
> -	if (flow->modifier & MLX5_FLOW_MOD_FLAG)
> -		return rte_flow_error_set(error, ENOTSUP,
> -					  RTE_FLOW_ERROR_TYPE_ACTION,
> -					  actions,
> -					  "flag action already present");
> -	if (flow->fate & MLX5_FLOW_FATE_DROP)
> -		return rte_flow_error_set(error, ENOTSUP,
> -					  RTE_FLOW_ERROR_TYPE_ACTION,
> -					  actions,
> -					  "flag is not compatible with drop"
> -					  " action");
> -	if (flow->modifier & MLX5_FLOW_MOD_MARK)
> -		return 0;
> +	if (!flow->expand) {
> +		if (flow->modifier & MLX5_FLOW_MOD_FLAG)
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ACTION,
> +						  actions,
> +						  "flag action already present");
> +		if (flow->fate & MLX5_FLOW_FATE_DROP)
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ACTION,
> +						  actions,
> +						  "flag is not compatible with"
> +						  " drop action");
> +	}
> +	/*
> +	 * The two only possible cases, a mark has already been added in the
> +	 * specification, in such case, the flag is already present in
> +	 * addition of the mark.
> +	 * Second case, has it is not possible to have two flags, it just
> +	 * needs to add it.
> +	 */

Can you rephrase the 'second case'? Maybe 'has' -> 'as'?

> +	if (verbs) {
> +		verbs->modifier |= MLX5_FLOW_MOD_FLAG;
> +		if (verbs->modifier & MLX5_FLOW_MOD_MARK)
> +			size = 0;
> +		else if (size <= flow_size)
> +			mlx5_flow_spec_verbs_add(flow, &tag, size);
> +	} else {
> +		if (flow->modifier & MLX5_FLOW_MOD_MARK)
> +			size = 0;
> +	}
>  	flow->modifier |= MLX5_FLOW_MOD_FLAG;
> -	if (size <= flow_size)
> -		mlx5_flow_spec_verbs_add(flow, &tag, size);
>  	return size;
>  }
>  
>  /**
>   * Update verbs specification to modify the flag to mark.
>   *
> - * @param flow
> - *   Pointer to the rte_flow structure.
> + * @param verbs
> + *   Pointer to the mlx5_flow_verbs structure.
>   * @param mark_id
>   *   Mark identifier to replace the flag.
>   */
>  static void
> -mlx5_flow_verbs_mark_update(struct rte_flow *flow, uint32_t mark_id)
> +mlx5_flow_verbs_mark_update(struct mlx5_flow_verbs *verbs, uint32_t mark_id)
>  {
>  	struct ibv_spec_header *hdr;
>  	int i;
>  
>  	/* Update Verbs specification. */
> -	hdr = (struct ibv_spec_header *)flow->verbs.specs;
> -	for (i = 0; i != flow->verbs.attr->num_of_specs; ++i) {
> +	hdr = (struct ibv_spec_header *)verbs->specs;
> +	for (i = 0; i != verbs->attr->num_of_specs; ++i) {
>  		if (hdr->type == IBV_FLOW_SPEC_ACTION_TAG) {
>  			struct ibv_flow_spec_action_tag *t =
>  				(struct ibv_flow_spec_action_tag *)hdr;
> @@ -1120,38 +1577,52 @@ mlx5_flow_action_mark(const struct rte_flow_action *actions,
>  		.type = IBV_FLOW_SPEC_ACTION_TAG,
>  		.size = size,
>  	};
> +	struct mlx5_flow_verbs *verbs = flow->cur_verbs;
>  
> -	if (!mark)
> -		return rte_flow_error_set(error, EINVAL,
> -					  RTE_FLOW_ERROR_TYPE_ACTION,
> -					  actions,
> -					  "configuration cannot be null");
> -	if (mark->id >= MLX5_FLOW_MARK_MAX)
> -		return rte_flow_error_set(error, EINVAL,
> -					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
> -					  &mark->id,
> -					  "mark must be between 0 and"
> -					  " 16777199");
> -	if (flow->modifier & MLX5_FLOW_MOD_MARK)
> -		return rte_flow_error_set(error, ENOTSUP,
> -					  RTE_FLOW_ERROR_TYPE_ACTION,
> -					  actions,
> -					  "mark action already present");
> -	if (flow->fate & MLX5_FLOW_FATE_DROP)
> -		return rte_flow_error_set(error, ENOTSUP,
> -					  RTE_FLOW_ERROR_TYPE_ACTION,
> -					  actions,
> -					  "mark is not compatible with drop"
> -					  " action");
> -	if (flow->modifier & MLX5_FLOW_MOD_FLAG) {
> -		mlx5_flow_verbs_mark_update(flow, mark->id);
> -		size = 0; /**< Only an update is done in the specification. */
> -	} else {
> -		tag.tag_id = mlx5_flow_mark_set(mark->id);
> -		if (size <= flow_size) {
> +	if (!flow->expand) {
> +		if (!mark)
> +			return rte_flow_error_set(error, EINVAL,
> +						  RTE_FLOW_ERROR_TYPE_ACTION,
> +						  actions,
> +						  "configuration cannot be"
> +						  " null");
> +		if (mark->id >= MLX5_FLOW_MARK_MAX)
> +			return rte_flow_error_set
> +				(error, EINVAL,
> +				 RTE_FLOW_ERROR_TYPE_ACTION_CONF,
> +				 &mark->id,
> +				 "mark must be between 0 and 16777199");
> +		if (flow->modifier & MLX5_FLOW_MOD_MARK)
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ACTION,
> +						  actions,
> +						  "mark action already"
> +						  " present");
> +		if (flow->fate & MLX5_FLOW_FATE_DROP)
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ACTION,
> +						  actions,
> +						  "mark is not compatible with"
> +						  " drop action");
> +	}
> +	/*
> +	 * The two only possible cases, a flag has already been added in the
> +	 * specification, in such case, it needs to be update to add the id.
> +	 * Second case, has it is not possible to have two mark, it just
> +	 * needs to add it.
> +	 */

Can you rephrase the 'second case'? Maybe 'has' -> 'as'?

> +	if (verbs) {
> +		verbs->modifier |= MLX5_FLOW_MOD_MARK;
> +		if (verbs->modifier & MLX5_FLOW_MOD_FLAG) {
> +			mlx5_flow_verbs_mark_update(verbs, mark->id);
> +			size = 0;
> +		} else if (size <= flow_size) {

If verbs isn't null (not testing call), isn't it guaranteed there's enough
space? Is it still needed to check the size?

>  			tag.tag_id = mlx5_flow_mark_set(mark->id);
>  			mlx5_flow_spec_verbs_add(flow, &tag, size);
>  		}
> +	} else {
> +		if (flow->modifier & MLX5_FLOW_MOD_FLAG)
> +			size = 0;
>  	}
>  	flow->modifier |= MLX5_FLOW_MOD_MARK;
>  	return size;
> @@ -1185,6 +1656,15 @@ mlx5_flow_actions(struct rte_eth_dev *dev,
>  	int remain = flow_size;
>  	int ret = 0;
>  
> +	/*
> +	 * FLAG/MARK are the only actions having a specification in Verbs and
> +	 * not making part of the packet fate.  Due to this specificity and to
> +	 * avoid extra variable, their bit in the flow->modifier bit-field are
> +	 * disabled here to compute the exact necessary memory those action
> +	 * needs.
> +	 */
> +	flow->modifier &= ~(MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK);

Can't understand this well. Is this for the case where the flow is expanded? If
so, why don't you reset flow->modifier in the for loop of mlx5_flow_merge()?

> +	/* Process the actions. */
>  	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
>  		switch (actions->type) {
>  		case RTE_FLOW_ACTION_TYPE_VOID:
> @@ -1204,6 +1684,9 @@ mlx5_flow_actions(struct rte_eth_dev *dev,
>  		case RTE_FLOW_ACTION_TYPE_QUEUE:
>  			ret = mlx5_flow_action_queue(dev, actions, flow, error);
>  			break;
> +		case RTE_FLOW_ACTION_TYPE_RSS:
> +			ret = mlx5_flow_action_rss(dev, actions, flow, error);
> +			break;
>  		default:
>  			return rte_flow_error_set(error, ENOTSUP,
>  						  RTE_FLOW_ERROR_TYPE_ACTION,
> @@ -1257,27 +1740,92 @@ mlx5_flow_merge(struct rte_eth_dev *dev, struct rte_flow *flow,
>  		struct rte_flow_error *error)
>  {
>  	struct rte_flow local_flow = { .layers = 0, };
> -	size_t size = sizeof(*flow) + sizeof(struct ibv_flow_attr);
> +	size_t size = sizeof(*flow);
>  	int remain = (flow_size > size) ? flow_size - size : 0;
> +	struct rte_flow_expand_rss *buf;
>  	int ret;
> +	uint32_t i;
>  
>  	if (!remain)
>  		flow = &local_flow;
>  	ret = mlx5_flow_attributes(dev, attr, flow, error);
>  	if (ret < 0)
>  		return ret;
> -	ret = mlx5_flow_items(items, flow, remain, error);
> -	if (ret < 0)
> -		return ret;
> -	size += ret;
> -	remain = (flow_size > size) ? flow_size - size : 0;
> -	ret = mlx5_flow_actions(dev, actions, flow, remain, error);
> +	ret = mlx5_flow_actions(dev, actions, &local_flow, 0, error);
>  	if (ret < 0)
>  		return ret;
> -	size += ret;
> +	ret = rte_flow_expand_rss(NULL, 0, items, local_flow.rss.types,
> +				  mlx5_support_expansion,
> +				  local_flow.rss.level < 2 ?
> +				  MLX5_EXPANSION_ROOT : MLX5_EXPANSION_ROOT2);
> +	assert(ret > 0);
> +	buf = rte_calloc(__func__, 1, ret, 0);
> +	if (!buf) {
> +		rte_flow_error_set(error, ENOMEM,
> +				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +				   NULL,
> +				   "not enough memory to expand the RSS flow");
> +		goto error;
> +	}

I'm pretty sure you've already fixed this bug. Validation can't return ENOMEM.

> +	ret = rte_flow_expand_rss(buf, ret, items, local_flow.rss.types,
> +				  mlx5_support_expansion,
> +				  local_flow.rss.level < 2 ?
> +				  MLX5_EXPANSION_ROOT : MLX5_EXPANSION_ROOT2);
> +	assert(ret > 0);
> +	size += RTE_ALIGN_CEIL(local_flow.rss.queue_num * sizeof(uint16_t),
> +			       sizeof(void *));
>  	if (size <= flow_size)
> -		flow->verbs.attr->priority = flow->attributes.priority;
> +		flow->queue = (void *)(flow + 1);
> +	LIST_INIT(&flow->verbs);
> +	flow->layers = 0;
> +	flow->modifier = 0;
> +	flow->fate = 0;
> +	for (i = 0; i != buf->entries; ++i) {
> +		size_t off = size;
> +
> +		size += sizeof(struct ibv_flow_attr) +
> +			sizeof(struct mlx5_flow_verbs);
> +		remain = (flow_size > size) ? flow_size - size : 0;
> +		if (remain) {
> +			flow->cur_verbs = (void *)((uintptr_t)flow + off);
> +			flow->cur_verbs->attr = (void *)(flow->cur_verbs + 1);
> +			flow->cur_verbs->specs =
> +				(void *)(flow->cur_verbs->attr + 1);
> +		}
> +		ret = mlx5_flow_items
> +			((const struct rte_flow_item *)buf->patterns[i],
> +			 flow, remain, error);
> +		if (ret < 0)
> +			goto error;
> +		size += ret;
> +		if (remain > ret)
> +			remain -= ret;
> +		else
> +			remain = 0;
> +		ret = mlx5_flow_actions(dev, actions, flow, remain, error);
> +		if (ret < 0)
> +			goto error;
> +		size += ret;
> +		if (remain > ret)
> +			remain -= ret;
> +		else
> +			remain = 0;
> +		if (size <= flow_size) {
> +			flow->cur_verbs->attr->priority =
> +				flow->attributes.priority;
> +			ret = mlx5_flow_action_rss_verbs_attr(dev, flow,
> +							      flow->rss.types);
> +			if (ret < 0)
> +				goto error;
> +			LIST_INSERT_HEAD(&flow->verbs, flow->cur_verbs, next);
> +		}
> +		flow->expand = !!(buf->entries > 1);
> +	}
> +	rte_free(buf);
>  	return size;
> +error:
> +	rte_free(buf);
> +	return ret;
>  }
>  
>  /**
> @@ -1292,9 +1840,13 @@ static void
>  mlx5_flow_rxq_mark(struct rte_eth_dev *dev, struct rte_flow *flow)
>  {
>  	struct priv *priv = dev->data->dev_private;
> +	const uint32_t mask = MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK;
> +	uint32_t i;
>  
> -	(*priv->rxqs)[flow->queue]->mark |=
> -		flow->modifier & (MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK);
> +	if (!(flow->modifier & mask))
> +		return;
> +	for (i = 0; i != flow->rss.queue_num; ++i)
> +		(*priv->rxqs)[(*flow->queue)[i]]->mark = 1;
>  }
>  
>  /**
> @@ -1328,18 +1880,20 @@ mlx5_flow_validate(struct rte_eth_dev *dev,
>  static void
>  mlx5_flow_fate_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
>  {
> -	if (flow->fate & MLX5_FLOW_FATE_DROP) {
> -		if (flow->verbs.flow) {
> -			claim_zero(mlx5_glue->destroy_flow(flow->verbs.flow));
> -			flow->verbs.flow = NULL;
> +	struct mlx5_flow_verbs *verbs;
> +
> +	LIST_FOREACH(verbs, &flow->verbs, next) {
> +		if (verbs->flow) {
> +			claim_zero(mlx5_glue->destroy_flow(verbs->flow));
> +			verbs->flow = NULL;
> +		}
> +		if (verbs->hrxq) {
> +			if (flow->fate & MLX5_FLOW_FATE_DROP)
> +				mlx5_hrxq_drop_release(dev, verbs->hrxq);
> +			else
> +				mlx5_hrxq_release(dev, verbs->hrxq);
> +			verbs->hrxq = NULL;
>  		}
> -	}
> -	if (flow->verbs.hrxq) {
> -		if (flow->fate & MLX5_FLOW_FATE_DROP)
> -			mlx5_hrxq_drop_release(dev, flow->verbs.hrxq);
> -		else if (flow->fate & MLX5_FLOW_FATE_QUEUE)
> -			mlx5_hrxq_release(dev, flow->verbs.hrxq);
> -		flow->verbs.hrxq = NULL;
>  	}
>  }
>  
> @@ -1360,46 +1914,68 @@ static int
>  mlx5_flow_fate_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
>  		     struct rte_flow_error *error)
>  {
> -	if (flow->fate & MLX5_FLOW_FATE_DROP) {
> -		flow->verbs.hrxq = mlx5_hrxq_drop_new(dev);
> -		if (!flow->verbs.hrxq)
> -			return rte_flow_error_set
> -				(error, errno,
> -				 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> -				 NULL,
> -				 "cannot allocate Drop queue");
> -	} else if (flow->fate & MLX5_FLOW_FATE_QUEUE) {
> -		struct mlx5_hrxq *hrxq;
> -
> -		hrxq = mlx5_hrxq_get(dev, rss_hash_default_key,
> -				     rss_hash_default_key_len, 0,
> -				     &flow->queue, 1, 0, 0);
> -		if (!hrxq)
> -			hrxq = mlx5_hrxq_new(dev, rss_hash_default_key,
> -					     rss_hash_default_key_len, 0,
> -					     &flow->queue, 1, 0, 0);
> -		if (!hrxq)
> -			return rte_flow_error_set(error, rte_errno,
> -					RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> -					NULL,
> -					"cannot create flow");
> -		flow->verbs.hrxq = hrxq;
> -	}
> -	flow->verbs.flow =
> -		mlx5_glue->create_flow(flow->verbs.hrxq->qp, flow->verbs.attr);
> -	if (!flow->verbs.flow) {
> -		if (flow->fate & MLX5_FLOW_FATE_DROP)
> -			mlx5_hrxq_drop_release(dev, flow->verbs.hrxq);
> -		else
> -			mlx5_hrxq_release(dev, flow->verbs.hrxq);
> -		flow->verbs.hrxq = NULL;
> -		return rte_flow_error_set(error, errno,
> -					  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> -					  NULL,
> -					  "kernel module refuses to create"
> -					  " flow");
> +	struct mlx5_flow_verbs *verbs;
> +	int err;
> +
> +	LIST_FOREACH(verbs, &flow->verbs, next) {
> +		if (flow->fate & MLX5_FLOW_FATE_DROP) {
> +			verbs->hrxq = mlx5_hrxq_drop_new(dev);
> +			if (!verbs->hrxq) {
> +				rte_flow_error_set
> +					(error, errno,
> +					 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +					 NULL,
> +					 "cannot get drop hash queue");
> +				goto error;
> +			}
> +		} else {
> +			struct mlx5_hrxq *hrxq;
> +
> +			hrxq = mlx5_hrxq_get(dev, flow->key,
> +					     rss_hash_default_key_len,
> +					     verbs->hash_fields,
> +					     (*flow->queue),
> +					     flow->rss.queue_num, 0, 0);
> +			if (!hrxq)
> +				hrxq = mlx5_hrxq_new(dev, flow->key,
> +						     rss_hash_default_key_len,
> +						     verbs->hash_fields,
> +						     (*flow->queue),
> +						     flow->rss.queue_num, 0, 0);
> +			if (!hrxq) {
> +				rte_flow_error_set
> +					(error, rte_errno,
> +					 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +					 NULL,
> +					 "cannot get hash queue");
> +				goto error;
> +			}
> +			verbs->hrxq = hrxq;
> +		}
> +		verbs->flow =
> +			mlx5_glue->create_flow(verbs->hrxq->qp, verbs->attr);
> +		if (!verbs->flow) {
> +			rte_flow_error_set(error, errno,
> +					   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +					   NULL,
> +					   "hardware refuses to create flow");
> +			goto error;
> +		}
>  	}
>  	return 0;
> +error:
> +	err = rte_errno; /* Save rte_errno before cleanup. */
> +	LIST_FOREACH(verbs, &flow->verbs, next) {
> +		if (verbs->hrxq) {
> +			if (flow->fate & MLX5_FLOW_FATE_DROP)
> +				mlx5_hrxq_drop_release(dev, verbs->hrxq);
> +			else
> +				mlx5_hrxq_release(dev, verbs->hrxq);
> +			verbs->hrxq = NULL;
> +		}
> +	}
> +	rte_errno = err; /* Restore rte_errno. */
> +	return -rte_errno;
>  }
>  
>  /**
> @@ -1429,42 +2005,43 @@ mlx5_flow_list_create(struct rte_eth_dev *dev,
>  		      const struct rte_flow_action actions[],
>  		      struct rte_flow_error *error)
>  {
> -	struct rte_flow *flow;
> -	size_t size;
> +	struct rte_flow *flow = NULL;
> +	size_t size = 0;
>  	int ret;
>  
> -	ret = mlx5_flow_merge(dev, NULL, 0, attr, items, actions, error);
> +	ret = mlx5_flow_merge(dev, flow, size, attr, items, actions, error);
>  	if (ret < 0)
>  		return NULL;
>  	size = ret;
> -	flow = rte_zmalloc(__func__, size, 0);
> +	flow = rte_calloc(__func__, 1, size, 0);
>  	if (!flow) {
>  		rte_flow_error_set(error, ENOMEM,
>  				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
>  				   NULL,
> -				   "cannot allocate memory");
> +				   "not enough memory to create flow");
>  		return NULL;
>  	}
> -	flow->verbs.attr = (struct ibv_flow_attr *)(flow + 1);
> -	flow->verbs.specs = (uint8_t *)(flow->verbs.attr + 1);
>  	ret = mlx5_flow_merge(dev, flow, size, attr, items, actions, error);
> -	if (ret < 0)
> -		goto error;
> +	if (ret < 0) {
> +		rte_free(flow);
> +		return NULL;
> +	}
>  	assert((size_t)ret == size);
>  	if (dev->data->dev_started) {
>  		ret = mlx5_flow_fate_apply(dev, flow, error);
> -		if (ret < 0)
> -			goto error;
> +		if (ret < 0) {
> +			ret = rte_errno; /* Save rte_errno before cleanup. */
> +			if (flow) {
> +				mlx5_flow_fate_remove(dev, flow);
> +				rte_free(flow);
> +			}
> +			rte_errno = ret; /* Restore rte_errno. */
> +			return NULL;
> +		}
>  	}
>  	mlx5_flow_rxq_mark(dev, flow);
>  	TAILQ_INSERT_TAIL(list, flow, next);
>  	return flow;
> -error:
> -	ret = rte_errno; /* Save rte_errno before cleanup. */
> -	mlx5_flow_fate_remove(dev, flow);
> -	rte_free(flow);
> -	rte_errno = ret; /* Restore rte_errno. */
> -	return NULL;
>  }
>  
>  /**
> @@ -1502,7 +2079,7 @@ mlx5_flow_list_destroy(struct rte_eth_dev *dev, struct mlx5_flows *list,
>  	struct priv *priv = dev->data->dev_private;
>  	struct rte_flow *rflow;
>  	const uint32_t mask = MLX5_FLOW_MOD_FLAG & MLX5_FLOW_MOD_MARK;
> -	int mark = 0;
> +	unsigned int i;
>  
>  	mlx5_flow_fate_remove(dev, flow);
>  	TAILQ_REMOVE(list, flow, next);
> @@ -1512,18 +2089,28 @@ mlx5_flow_list_destroy(struct rte_eth_dev *dev, struct mlx5_flows *list,
>  	}
>  	/*
>  	 * When a flow is removed and this flow has a flag/mark modifier, all
> -	 * flows needs to be parse to verify if the Rx queue use by the flow
> +	 * flows needs to be parse to verify if the Rx queues use by the flow
>  	 * still need to track the flag/mark request.
>  	 */
> -	TAILQ_FOREACH(rflow, &priv->flows, next) {
> -		if (!(rflow->modifier & mask))
> -			continue;
> -		if (flow->queue == rflow->queue) {
> -			mark = 1;
> -			break;
> +	for (i = 0; i != flow->rss.queue_num; ++i) {
> +		int mark = 0;
> +
> +		TAILQ_FOREACH(rflow, &priv->flows, next) {
> +			unsigned int j;
> +
> +			if (!(rflow->modifier & mask))
> +				continue;
> +			for (j = 0; j != rflow->rss.queue_num; ++j) {
> +				if ((*flow->queue)[i] == (*rflow->queue)[j]) {
> +					mark = 1;
> +					break;
> +				}
> +			}
> +			if (mark)
> +				break;
>  		}
> +		(*priv->rxqs)[i]->mark = !!mark;
>  	}
> -	(*priv->rxqs)[flow->queue]->mark = !!mark;
>  	rte_free(flow);
>  }
>  
> @@ -1654,7 +2241,7 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
>  	struct priv *priv = dev->data->dev_private;
>  	const struct rte_flow_attr attr = {
>  		.ingress = 1,
> -		.priority = priv->config.flow_prio - 1,
> +		.priority = MLX5_FLOW_PRIO_RSVD,
>  	};
>  	struct rte_flow_item items[] = {
>  		{
> -- 
> 2.18.0
>
  
Nélio Laranjeiro July 6, 2018, 3:59 p.m. UTC | #2
Hi Yongseok,

I am only addressing your questions and concerns here; I agree with almost
all the other points.

On Thu, Jul 05, 2018 at 07:16:35PM -0700, Yongseok Koh wrote:
> On Wed, Jun 27, 2018 at 05:07:45PM +0200, Nelio Laranjeiro wrote:
> > Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> > ---
>[...]
> 
> > + */
> > +static void
> > +mlx5_flow_layers_update(struct rte_flow *flow, uint32_t layers)
> > +{
> > +	if (flow->expand) {
> > +		if (flow->cur_verbs)
> > +			flow->cur_verbs->layers |= layers;
> 
> If flow->cur_verbs is null, does that mean it is a testing call? Then, is it
> unnecessary to update layers for the testing call? Confusing..

No, it may also happen if the buffer was too small; in any case the code
continues its validation.

> > +	} else {
> > +		flow->layers |= layers;
> > +	}
> > +}
> > +
> > +/**
> > + * Get layers bit-field.
> > + *
> > + * @param flow[in, out]
> > + *   Pointer to flow structure.
> > + */
> > +static uint32_t
> > +mlx5_flow_layers(struct rte_flow *flow)
> > +{
> > +	uint32_t layers = flow->layers;
> > +
> > +	if (flow->expand && flow->cur_verbs)
> 
> If flow is expanded and it is a testing call, then flow->layers is used?
>
> > +		layers |= flow->cur_verbs->layers;
> > +	return layers;
> 
> This part is so unclear to me, hard to understand. There are two 'layers'
> fields, one in rte_flow and the other in mlx5_flow_verbs. It seems
> rte_flow->layers is used only when the flow isn't expanded. If a flow is
> expanded, flow->expand is set after processing the first entry in the expanded
> list. In mlx5_flow_merge(),
> 
> 	for (i = 0; i != buf->entries; ++i) {
> 
> 		...
> 
> 		flow->expand = !!(buf->entries > 1);
> 	}
> 
> Why is flow->expand set at the end of the loop? Is this in order to avoid
> validation for the expanded flows?
> mlx5_flow_item_xxx() executes validation only if flow->expand is zero,
> why?

Expanded flows are PMD internal cooking to match the user request; they are
fully added by the PMD and thus must be valid.  There is no need to validate
their position or redundancy, and the spec, last and mask are provided as
wildcards, which means those pointers are null.

> And why does mlx5_flow_layers() have to return (flow->layers |
> flow->cur_verbs->layers) if expanded?

You are right, and indeed it is a bug.  With the fix, only the expanded
flow will use the verbs->layers, whereas the first conversion will use
the flow->layers.
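
Something like this for the two helpers (just a sketch of the intent, not
the final code):

static void
mlx5_flow_layers_update(struct rte_flow *flow, uint32_t layers)
{
	/* Expanded conversions track their layers per Verbs flow only. */
	if (flow->expand) {
		if (flow->cur_verbs)
			flow->cur_verbs->layers |= layers;
	} else {
		flow->layers |= layers;
	}
}

static uint32_t
mlx5_flow_layers(struct rte_flow *flow)
{
	/* Only the first, non-expanded conversion relies on flow->layers. */
	if (flow->expand && flow->cur_verbs)
		return flow->cur_verbs->layers;
	return flow->layers;
}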
 
> If there are 3 entries in the rte_flow_expand_rss,
> 	eth
> 	eth / ipv4 / udp
> 	eth / ipv6 / udp
> 
> Then, the 2nd and 3rd don't have MLX5_FLOW_LAYER_OUTER_L2 in layers field?
> Please explain in details and add comments appropriately.

The 2nd and 3rd should have the same layer bits as the 1st rule in
addition to their own bits, e.g. the 2nd rule will have L2 + L3_IPv4 +
L4_UDP and the 3rd L2 + L3_IPv6 + L4_UDP.


I will try to clean it up.  The issue is that, with the expansion, a
single rte_flow can produce several Verbs flows; for each of them the
priority must be updated based on its most specific layer, and this has to
be stored somewhere.  I think the biggest issue you are raising here is
the lack of proper variable naming and documentation; I'll do my best to
improve it.

>[...]
> > +	if (verbs) {
> > +		verbs->modifier |= MLX5_FLOW_MOD_MARK;
> > +		if (verbs->modifier & MLX5_FLOW_MOD_FLAG) {
> > +			mlx5_flow_verbs_mark_update(verbs, mark->id);
> > +			size = 0;
> > +		} else if (size <= flow_size) {
> 
> If verbs isn't null (not testing call), isn't it guaranteed there's enough
> space? Is it still needed to check the size?

Unfortunately not: the verbs variable may be valid and pointing into a
zone of the buffer, but that does not mean there is enough space left in
the buffer to store the mark specification.
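
To make it explicit, here is the relevant part of the hunk with a couple
of comments added (annotation only, not a code change):

	if (verbs) {
		verbs->modifier |= MLX5_FLOW_MOD_MARK;
		if (verbs->modifier & MLX5_FLOW_MOD_FLAG) {
			/* A flag spec already exists: update it in place. */
			mlx5_flow_verbs_mark_update(verbs, mark->id);
			size = 0;
		} else if (size <= flow_size) {
			/* Enough room left: append the tag specification. */
			tag.tag_id = mlx5_flow_mark_set(mark->id);
			mlx5_flow_spec_verbs_add(flow, &tag, size);
		}
		/* Otherwise only the required size is reported back. */
	}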

> >  			tag.tag_id = mlx5_flow_mark_set(mark->id);
> >  			mlx5_flow_spec_verbs_add(flow, &tag, size);
> >  		}
> > +	} else {
> > +		if (flow->modifier & MLX5_FLOW_MOD_FLAG)
> > +			size = 0;
> >  	}
> >  	flow->modifier |= MLX5_FLOW_MOD_MARK;
> >  	return size;
> > @@ -1185,6 +1656,15 @@ mlx5_flow_actions(struct rte_eth_dev *dev,
> >  	int remain = flow_size;
> >  	int ret = 0;
> >  
> > +	/*
> > +	 * FLAG/MARK are the only actions having a specification in Verbs and
> > +	 * not making part of the packet fate.  Due to this specificity and to
> > +	 * avoid extra variable, their bit in the flow->modifier bit-field are
> > +	 * disabled here to compute the exact necessary memory those action
> > +	 * needs.
> > +	 */
> > +	flow->modifier &= ~(MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK);
> 
> Can't understand this well. Is this for the case where the flow is expanded? If
> so, why don't you reset flow->modifier in the for loop of mlx5_flow_merge()?

Yes it is, I'll move it.
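
Something along these lines at the top of the per-entry loop in
mlx5_flow_merge() (sketch only):

	/*
	 * FLAG/MARK are the only actions with a Verbs specification that do
	 * not determine the packet fate; reset their bits before converting
	 * each expanded entry so their memory is accounted exactly once.
	 */
	flow->modifier &= ~(MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK);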

>[...]
> > +	assert(ret > 0);
> > +	buf = rte_calloc(__func__, 1, ret, 0);
> > +	if (!buf) {
> > +		rte_flow_error_set(error, ENOMEM,
> > +				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> > +				   NULL,
> > +				   "not enough memory to expand the RSS flow");
> > +		goto error;
> > +	}
> 
> I'm pretty sure you've already fixed this bug. Validation can't return ENOMEM.

You know me well ;)

Thanks,
  
Yongseok Koh July 6, 2018, 5:35 p.m. UTC | #3
> On Jul 6, 2018, at 8:59 AM, Nélio Laranjeiro <nelio.laranjeiro@6wind.com> wrote:
> 
> Hi Yongseok,
> 
> I am only addressing your questions and concerns here; I agree with
> almost all the other points.
> 
> On Thu, Jul 05, 2018 at 07:16:35PM -0700, Yongseok Koh wrote:
>> On Wed, Jun 27, 2018 at 05:07:45PM +0200, Nelio Laranjeiro wrote:
>>> Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
>>> ---
>> [...]
>> 
>>> + */
>>> +static void
>>> +mlx5_flow_layers_update(struct rte_flow *flow, uint32_t layers)
>>> +{
>>> +	if (flow->expand) {
>>> +		if (flow->cur_verbs)
>>> +			flow->cur_verbs->layers |= layers;
>> 
>> If flow->cur_verbs is null, does that mean it is a testing call? Then, is it
>> unnecessary to update layers for the testing call? Confusing..
> 
> No, it may also happen if the buffer was too small; in any case the code
> continues its validation.

Okay, understood. Thanks.
But another question was: if it is a testing call (flow->cur_verbs is null) with
flow->expand being set, then 'layers' isn't updated at all in this code. Is that okay?

Thanks,
Yongseok

>>> +	} else {
>>> +		flow->layers |= layers;
>>> +	}
>>> +}
>>> +
  
Nélio Laranjeiro July 9, 2018, 1:09 p.m. UTC | #4
On Fri, Jul 06, 2018 at 05:35:22PM +0000, Yongseok Koh wrote:
> 
> > On Jul 6, 2018, at 8:59 AM, Nélio Laranjeiro <nelio.laranjeiro@6wind.com> wrote:
> > 
> > Hi Yongseok,
> > 
> > I am only addressing your questions and concerns here; I agree with
> > almost all the other points.
> > 
> > On Thu, Jul 05, 2018 at 07:16:35PM -0700, Yongseok Koh wrote:
> >> On Wed, Jun 27, 2018 at 05:07:45PM +0200, Nelio Laranjeiro wrote:
> >>> Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> >>> ---
> >> [...]
> >> 
> >>> + */
> >>> +static void
> >>> +mlx5_flow_layers_update(struct rte_flow *flow, uint32_t layers)
> >>> +{
> >>> +	if (flow->expand) {
> >>> +		if (flow->cur_verbs)
> >>> +			flow->cur_verbs->layers |= layers;
> >> 
> >> If flow->cur_verbs is null, does that mean it is a testing call? Then, is it
> >> unnecessary to update layers for the testing call? Confusing..
> > 
> > No, it may also happen if the buffer was too small; in any case the code
> > continues its validation.
> 
> Okay, understood. Thanks.
> But another question was: if it is a testing call (flow->cur_verbs is null) with
> flow->expand being set, then 'layers' isn't updated at all in this code. Is that okay?

Yes, it is OK: after fixing the issue in the layers themselves, again no
layer position validation is done when the expansion is enabled.

Thanks,
  

Patch

diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index a39157533..08e0a6556 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -51,13 +51,148 @@  extern const struct eth_dev_ops mlx5_dev_ops_isolate;
 /* Action fate on the packet. */
 #define MLX5_FLOW_FATE_DROP (1u << 0)
 #define MLX5_FLOW_FATE_QUEUE (1u << 1)
+#define MLX5_FLOW_FATE_RSS (1u << 2)
 
 /* Modify a packet. */
 #define MLX5_FLOW_MOD_FLAG (1u << 0)
 #define MLX5_FLOW_MOD_MARK (1u << 1)
 
+/* Priority reserved for default flows. */
+#define MLX5_FLOW_PRIO_RSVD ((uint32_t)-1)
+
+enum mlx5_expansion {
+	MLX5_EXPANSION_ROOT,
+	MLX5_EXPANSION_ROOT2,
+	MLX5_EXPANSION_OUTER_ETH,
+	MLX5_EXPANSION_OUTER_IPV4,
+	MLX5_EXPANSION_OUTER_IPV4_UDP,
+	MLX5_EXPANSION_OUTER_IPV4_TCP,
+	MLX5_EXPANSION_OUTER_IPV6,
+	MLX5_EXPANSION_OUTER_IPV6_UDP,
+	MLX5_EXPANSION_OUTER_IPV6_TCP,
+	MLX5_EXPANSION_VXLAN,
+	MLX5_EXPANSION_VXLAN_GPE,
+	MLX5_EXPANSION_GRE,
+	MLX5_EXPANSION_MPLS,
+	MLX5_EXPANSION_ETH,
+	MLX5_EXPANSION_IPV4,
+	MLX5_EXPANSION_IPV4_UDP,
+	MLX5_EXPANSION_IPV4_TCP,
+	MLX5_EXPANSION_IPV6,
+	MLX5_EXPANSION_IPV6_UDP,
+	MLX5_EXPANSION_IPV6_TCP,
+};
+
+/** Supported expansion of items. */
+static const struct rte_flow_expand_node mlx5_support_expansion[] = {
+	[MLX5_EXPANSION_ROOT] = {
+		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_ETH,
+					      MLX5_EXPANSION_IPV4,
+					      MLX5_EXPANSION_IPV6),
+		.type = RTE_FLOW_ITEM_TYPE_END,
+	},
+	[MLX5_EXPANSION_ROOT2] = {
+		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_OUTER_ETH,
+					      MLX5_EXPANSION_OUTER_IPV4,
+					      MLX5_EXPANSION_OUTER_IPV6),
+		.type = RTE_FLOW_ITEM_TYPE_END,
+	},
+	[MLX5_EXPANSION_OUTER_ETH] = {
+		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_OUTER_IPV4,
+					      MLX5_EXPANSION_OUTER_IPV6),
+		.type = RTE_FLOW_ITEM_TYPE_ETH,
+		.rss_types = 0,
+	},
+	[MLX5_EXPANSION_OUTER_IPV4] = {
+		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_OUTER_IPV4_UDP,
+					      MLX5_EXPANSION_OUTER_IPV4_TCP),
+		.type = RTE_FLOW_ITEM_TYPE_IPV4,
+		.rss_types = ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 |
+			ETH_RSS_NONFRAG_IPV4_OTHER,
+	},
+	[MLX5_EXPANSION_OUTER_IPV4_UDP] = {
+		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_VXLAN),
+		.type = RTE_FLOW_ITEM_TYPE_UDP,
+		.rss_types = ETH_RSS_NONFRAG_IPV4_UDP,
+	},
+	[MLX5_EXPANSION_OUTER_IPV4_TCP] = {
+		.type = RTE_FLOW_ITEM_TYPE_TCP,
+		.rss_types = ETH_RSS_NONFRAG_IPV4_TCP,
+	},
+	[MLX5_EXPANSION_OUTER_IPV6] = {
+		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_OUTER_IPV6_UDP,
+					      MLX5_EXPANSION_OUTER_IPV6_TCP),
+		.type = RTE_FLOW_ITEM_TYPE_IPV6,
+		.rss_types = ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6 |
+			ETH_RSS_NONFRAG_IPV6_OTHER,
+	},
+	[MLX5_EXPANSION_OUTER_IPV6_UDP] = {
+		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_VXLAN),
+		.type = RTE_FLOW_ITEM_TYPE_UDP,
+		.rss_types = ETH_RSS_NONFRAG_IPV6_UDP,
+	},
+	[MLX5_EXPANSION_OUTER_IPV6_TCP] = {
+		.type = RTE_FLOW_ITEM_TYPE_TCP,
+		.rss_types = ETH_RSS_NONFRAG_IPV6_TCP,
+	},
+	[MLX5_EXPANSION_VXLAN] = {
+		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_ETH),
+		.type = RTE_FLOW_ITEM_TYPE_VXLAN,
+	},
+	[MLX5_EXPANSION_VXLAN_GPE] = {
+		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_ETH,
+					      MLX5_EXPANSION_IPV4,
+					      MLX5_EXPANSION_IPV6),
+		.type = RTE_FLOW_ITEM_TYPE_VXLAN_GPE,
+	},
+	[MLX5_EXPANSION_GRE] = {
+		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_IPV4),
+		.type = RTE_FLOW_ITEM_TYPE_GRE,
+	},
+	[MLX5_EXPANSION_ETH] = {
+		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_IPV4,
+					      MLX5_EXPANSION_IPV6),
+		.type = RTE_FLOW_ITEM_TYPE_ETH,
+	},
+	[MLX5_EXPANSION_IPV4] = {
+		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_IPV4_UDP,
+					      MLX5_EXPANSION_IPV4_TCP),
+		.type = RTE_FLOW_ITEM_TYPE_IPV4,
+		.rss_types = ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4 |
+			ETH_RSS_NONFRAG_IPV4_OTHER,
+	},
+	[MLX5_EXPANSION_IPV4_UDP] = {
+		.type = RTE_FLOW_ITEM_TYPE_UDP,
+		.rss_types = ETH_RSS_NONFRAG_IPV4_UDP,
+	},
+	[MLX5_EXPANSION_IPV4_TCP] = {
+		.type = RTE_FLOW_ITEM_TYPE_TCP,
+		.rss_types = ETH_RSS_NONFRAG_IPV4_TCP,
+	},
+	[MLX5_EXPANSION_IPV6] = {
+		.next = RTE_FLOW_EXPAND_ITEMS(MLX5_EXPANSION_IPV6_UDP,
+					      MLX5_EXPANSION_IPV6_TCP),
+		.type = RTE_FLOW_ITEM_TYPE_IPV6,
+		.rss_types = ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6 |
+			ETH_RSS_NONFRAG_IPV6_OTHER,
+	},
+	[MLX5_EXPANSION_IPV6_UDP] = {
+		.type = RTE_FLOW_ITEM_TYPE_UDP,
+		.rss_types = ETH_RSS_NONFRAG_IPV6_UDP,
+	},
+	[MLX5_EXPANSION_IPV6_TCP] = {
+		.type = RTE_FLOW_ITEM_TYPE_TCP,
+		.rss_types = ETH_RSS_NONFRAG_IPV6_TCP,
+	},
+};
+
 /** Handles information leading to a drop fate. */
 struct mlx5_flow_verbs {
+	LIST_ENTRY(mlx5_flow_verbs) next;
+	uint32_t layers;
+	/**< Bit-fields of expanded layers see MLX5_FLOW_ITEMS_*. */
+	uint32_t modifier;
+	/**< Bit-fields of expanded modifier see MLX5_FLOW_MOD_*. */
 	unsigned int size; /**< Size of the attribute. */
 	struct {
 		struct ibv_flow_attr *attr;
@@ -66,20 +201,26 @@  struct mlx5_flow_verbs {
 	};
 	struct ibv_flow *flow; /**< Verbs flow pointer. */
 	struct mlx5_hrxq *hrxq; /**< Hash Rx queue object. */
+	uint64_t hash_fields; /**< Verbs hash Rx queue hash fields. */
 };
 
 /* Flow structure. */
 struct rte_flow {
 	TAILQ_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */
 	struct rte_flow_attr attributes; /**< User flow attribute. */
+	uint32_t expand:1; /**< Flow is expanded due to RSS configuration. */
 	uint32_t layers;
 	/**< Bit-fields of present layers see MLX5_FLOW_ITEMS_*. */
 	uint32_t modifier;
 	/**< Bit-fields of present modifier see MLX5_FLOW_MOD_*. */
 	uint32_t fate;
 	/**< Bit-fields of present fate see MLX5_FLOW_FATE_*. */
-	struct mlx5_flow_verbs verbs; /* Verbs flow. */
-	uint16_t queue; /**< Destination queue to redirect traffic to. */
+	LIST_HEAD(verbs, mlx5_flow_verbs) verbs; /**< Verbs flows list. */
+	struct mlx5_flow_verbs *cur_verbs;
+	/**< Current Verbs flow structure being filled. */
+	struct rte_flow_action_rss rss;/**< RSS context. */
+	uint8_t key[40]; /**< RSS hash key. */
+	uint16_t (*queue)[]; /**< Destination queues to redirect traffic to. */
 };
 
 static const struct rte_flow_ops mlx5_flow_ops = {
@@ -122,16 +263,27 @@  struct ibv_spec_header {
 	uint16_t size;
 };
 
- /**
-  * Get the maximum number of priority available.
-  *
-  * @param dev
-  *   Pointer to Ethernet device.
-  *
-  * @return
-  *   number of supported flow priority on success, a negative errno value
-  *   otherwise and rte_errno is set.
-  */
+/* Map of Verbs to Flow priority with 8 Verbs priorities. */
+static const uint32_t priority_map_3[][3] = {
+	{ 0, 1, 2 }, { 2, 3, 4 }, { 5, 6, 7 },
+};
+
+/* Map of Verbs to Flow priority with 16 Verbs priorities. */
+static const uint32_t priority_map_5[][3] = {
+	{ 0, 1, 2 }, { 3, 4, 5 }, { 6, 7, 8 },
+	{ 9, 10, 11 }, { 12, 13, 14 },
+};
+
+/**
+ * Get the maximum number of priority available.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ *
+ * @return
+ *   number of supported flow priority on success, a negative errno
+ *   value otherwise and rte_errno is set.
+ */
 int
 mlx5_flow_priorities(struct rte_eth_dev *dev)
 {
@@ -156,6 +308,7 @@  mlx5_flow_priorities(struct rte_eth_dev *dev)
 	struct mlx5_hrxq *drop = mlx5_hrxq_drop_new(dev);
 	uint16_t vprio[] = { 8, 16 };
 	int i;
+	int priority = 0;
 
 	if (!drop) {
 		rte_errno = ENOTSUP;
@@ -167,11 +320,54 @@  mlx5_flow_priorities(struct rte_eth_dev *dev)
 		if (!flow)
 			break;
 		claim_zero(mlx5_glue->destroy_flow(flow));
+		priority = vprio[i];
+	}
+	switch (priority) {
+	case 8:
+		priority = 3;
+		break;
+	case 16:
+		priority = 5;
+		break;
+	default:
+		rte_errno = ENOTSUP;
+		DRV_LOG(ERR,
+			"port %u verbs maximum priority: %d expected 8/16",
+			dev->data->port_id, vprio[i]);
+		return -rte_errno;
 	}
 	mlx5_hrxq_drop_release(dev, drop);
 	DRV_LOG(INFO, "port %u flow maximum priority: %d",
-		dev->data->port_id, vprio[i]);
-	return vprio[i];
+		dev->data->port_id, priority);
+	return priority;
+}
+
+/**
+ * Adjust flow priority.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param flow
+ *   Pointer to an rte flow.
+ *
+ * @return
+ *   The priority adjusted.
+ */
+static int
+mlx5_flow_priority(struct rte_eth_dev *dev, uint32_t priority,
+		   uint32_t subpriority)
+{
+	struct priv *priv = dev->data->dev_private;
+
+	switch (priv->config.flow_prio) {
+	case 3:
+		priority = priority_map_3[priority][subpriority];
+		break;
+	case 5:
+		priority = priority_map_5[priority][subpriority];
+		break;
+	}
+	return priority;
 }
 
 /**
@@ -185,6 +381,8 @@  void
 mlx5_flow_print(struct rte_flow *flow __rte_unused)
 {
 #ifndef NDEBUG
+	struct mlx5_flow_verbs *verbs = LIST_FIRST(&flow->verbs);
+
 	fprintf(stdout, "---------8<------------\n");
 	fprintf(stdout, "%s: flow information\n", MLX5_DRIVER_NAME);
 	fprintf(stdout, " attributes: group %u priority %u ingress %d egress %d"
@@ -193,26 +391,36 @@  mlx5_flow_print(struct rte_flow *flow __rte_unused)
 		flow->attributes.ingress,
 		flow->attributes.egress,
 		flow->attributes.transfer);
-	fprintf(stdout, " layers: %s/%s/%s\n",
-		flow->layers & MLX5_FLOW_LAYER_OUTER_L2 ? "l2" : "-",
-		flow->layers & MLX5_FLOW_LAYER_OUTER_L3 ? "l3" : "-",
-		flow->layers & MLX5_FLOW_LAYER_OUTER_L4 ? "l4" : "-");
-	if (flow->fate & MLX5_FLOW_FATE_DROP)
+	if (flow->fate & MLX5_FLOW_FATE_DROP) {
 		fprintf(stdout, " fate: drop queue\n");
-	else if (flow->fate & MLX5_FLOW_FATE_QUEUE)
-		fprintf(stdout, " fate: target queue %u\n", flow->queue);
-	if (flow->verbs.attr) {
-		struct ibv_spec_header *hdr =
-			(struct ibv_spec_header *)flow->verbs.specs;
-		const int n = flow->verbs.attr->num_of_specs;
-		int i;
-
-		fprintf(stdout, " Verbs attributes: specs_n %u\n",
-			flow->verbs.attr->num_of_specs);
-		for (i = 0; i != n; ++i) {
-			rte_hexdump(stdout, " ", hdr, hdr->size);
-			hdr = (struct ibv_spec_header *)
-				((uint8_t *)hdr + hdr->size);
+	} else {
+		uint16_t i;
+
+		fprintf(stdout, " fate: target queues");
+		for (i = 0; i != flow->rss.queue_num; ++i)
+			fprintf(stdout, " %u", (*flow->queue)[i]);
+		fprintf(stdout, "\n");
+	}
+	LIST_FOREACH(verbs, &flow->verbs, next) {
+		uint32_t layers = flow->layers | verbs->layers;
+
+		fprintf(stdout, " layers: %s/%s/%s\n",
+			layers & MLX5_FLOW_LAYER_OUTER_L2 ? "l2" : "-",
+			layers & MLX5_FLOW_LAYER_OUTER_L3 ? "l3" : "-",
+			layers & MLX5_FLOW_LAYER_OUTER_L4 ? "l4" : "-");
+		if (verbs->attr) {
+			struct ibv_spec_header *hdr =
+				(struct ibv_spec_header *)verbs->specs;
+			const int n = verbs->attr->num_of_specs;
+			int i;
+
+			fprintf(stdout, " Verbs attributes: specs_n %u\n",
+				verbs->attr->num_of_specs);
+			for (i = 0; i != n; ++i) {
+				rte_hexdump(stdout, " ", hdr, hdr->size);
+				hdr = (struct ibv_spec_header *)
+					((uint8_t *)hdr + hdr->size);
+			}
 		}
 	}
 	fprintf(stdout, "--------->8------------\n");
@@ -239,18 +447,20 @@  mlx5_flow_attributes(struct rte_eth_dev *dev, const struct rte_flow_attr *attr,
 		     struct rte_flow *flow, struct rte_flow_error *error)
 {
 	uint32_t priority_max =
-		((struct priv *)dev->data->dev_private)->config.flow_prio;
+		((struct priv *)dev->data->dev_private)->config.flow_prio - 1;
 
 	if (attr->group)
 		return rte_flow_error_set(error, ENOTSUP,
 					  RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
 					  NULL,
 					  "groups are not supported");
-	if (attr->priority >= priority_max)
+	if (attr->priority != MLX5_FLOW_PRIO_RSVD &&
+	    attr->priority >= priority_max)
 		return rte_flow_error_set(error, ENOTSUP,
 					  RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
 					  NULL,
-					  "priority value is not supported");
+					  "requested priority value is not"
+					  " supported");
 	if (attr->egress)
 		return rte_flow_error_set(error, ENOTSUP,
 					  RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
@@ -267,6 +477,8 @@  mlx5_flow_attributes(struct rte_eth_dev *dev, const struct rte_flow_attr *attr,
 					  NULL,
 					  "only ingress is supported");
 	flow->attributes = *attr;
+	if (attr->priority == MLX5_FLOW_PRIO_RSVD)
+		flow->attributes.priority = priority_max;
 	return 0;
 }
 
@@ -346,14 +558,51 @@  mlx5_flow_item_validate(const struct rte_flow_item *item,
 static void
 mlx5_flow_spec_verbs_add(struct rte_flow *flow, void *src, unsigned int size)
 {
-	if (flow->verbs.specs) {
+	struct mlx5_flow_verbs *verbs = flow->cur_verbs;
+
+	if (verbs->specs) {
 		void *dst;
 
-		dst = (void *)(flow->verbs.specs + flow->verbs.size);
+		dst = (void *)(verbs->specs + verbs->size);
 		memcpy(dst, src, size);
-		++flow->verbs.attr->num_of_specs;
+		++verbs->attr->num_of_specs;
 	}
-	flow->verbs.size += size;
+	verbs->size += size;
+}
+
+/**
+ * Update layer bit-field.
+ *
+ * @param flow[in, out]
+ *   Pointer to flow structure.
+ * @param layers
+ *   Bit-fields of layers to add see MLX5_FLOW_ITEMS_*.
+ */
+static void
+mlx5_flow_layers_update(struct rte_flow *flow, uint32_t layers)
+{
+	if (flow->expand) {
+		if (flow->cur_verbs)
+			flow->cur_verbs->layers |= layers;
+	} else {
+		flow->layers |= layers;
+	}
+}
+
+/**
+ * Get layers bit-field.
+ *
+ * @param flow[in, out]
+ *   Pointer to flow structure.
+ */
+static uint32_t
+mlx5_flow_layers(struct rte_flow *flow)
+{
+	uint32_t layers = flow->layers;
+
+	if (flow->expand && flow->cur_verbs)
+		layers |= flow->cur_verbs->layers;
+	return layers;
 }
 
 /**
@@ -388,22 +637,26 @@  mlx5_flow_item_eth(const struct rte_flow_item *item, struct rte_flow *flow,
 		.type = IBV_FLOW_SPEC_ETH,
 		.size = size,
 	};
+	const uint32_t layers = mlx5_flow_layers(flow);
 	int ret;
 
-	if (flow->layers & MLX5_FLOW_LAYER_OUTER_L2)
-		return rte_flow_error_set(error, ENOTSUP,
-					  RTE_FLOW_ERROR_TYPE_ITEM,
-					  item,
-					  "L2 layers already configured");
-	if (!mask)
-		mask = &rte_flow_item_eth_mask;
-	ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
-				      (const uint8_t *)&nic_mask,
-				      sizeof(struct rte_flow_item_eth),
-				      error);
-	if (ret)
-		return ret;
-	flow->layers |= MLX5_FLOW_LAYER_OUTER_L2;
+	if (!flow->expand) {
+		if (layers & MLX5_FLOW_LAYER_OUTER_L2)
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  item,
+						  "L2 layers already"
+						  " configured");
+		if (!mask)
+			mask = &rte_flow_item_eth_mask;
+		ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
+					      (const uint8_t *)&nic_mask,
+					      sizeof(struct rte_flow_item_eth),
+					      error);
+		if (ret)
+			return ret;
+	}
+	mlx5_flow_layers_update(flow, MLX5_FLOW_LAYER_OUTER_L2);
 	if (size > flow_size)
 		return size;
 	if (spec) {
@@ -482,6 +735,7 @@  mlx5_flow_item_vlan(const struct rte_flow_item *item, struct rte_flow *flow,
 		.tci = RTE_BE16(0x0fff),
 	};
 	unsigned int size = sizeof(struct ibv_flow_spec_eth);
+	struct mlx5_flow_verbs *verbs = flow->cur_verbs;
 	struct ibv_flow_spec_eth eth = {
 		.type = IBV_FLOW_SPEC_ETH,
 		.size = size,
@@ -491,24 +745,30 @@  mlx5_flow_item_vlan(const struct rte_flow_item *item, struct rte_flow *flow,
 			MLX5_FLOW_LAYER_OUTER_L4;
 	const uint32_t vlanm = MLX5_FLOW_LAYER_OUTER_VLAN;
 	const uint32_t l2m = MLX5_FLOW_LAYER_OUTER_L2;
+	const uint32_t layers = mlx5_flow_layers(flow);
 
-	if (flow->layers & vlanm)
-		return rte_flow_error_set(error, ENOTSUP,
-					  RTE_FLOW_ERROR_TYPE_ITEM,
-					  item,
-					  "L2 layers already configured");
-	else if ((flow->layers & lm) != 0)
-		return rte_flow_error_set(error, ENOTSUP,
-					  RTE_FLOW_ERROR_TYPE_ITEM,
-					  item,
-					  "L2 layer cannot follow L3/L4 layer");
-	if (!mask)
-		mask = &rte_flow_item_vlan_mask;
-	ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
-				      (const uint8_t *)&nic_mask,
-				      sizeof(struct rte_flow_item_vlan), error);
-	if (ret)
-		return ret;
+	if (!flow->expand) {
+		if (layers & vlanm)
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  item,
+						  "L2 layers already"
+						  " configured");
+		else if ((layers & lm) != 0)
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  item,
+						  "L2 layer cannot follow"
+						  " L3/L4 layer");
+		if (!mask)
+			mask = &rte_flow_item_vlan_mask;
+		ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
+					      (const uint8_t *)&nic_mask,
+					      sizeof(struct rte_flow_item_vlan),
+					      error);
+		if (ret)
+			return ret;
+	}
 	if (spec) {
 		eth.val.vlan_tag = spec->tci;
 		eth.mask.vlan_tag = mask->tci;
@@ -517,32 +777,34 @@  mlx5_flow_item_vlan(const struct rte_flow_item *item, struct rte_flow *flow,
 		eth.mask.ether_type = mask->inner_type;
 		eth.val.ether_type &= eth.mask.ether_type;
 	}
-	/*
-	 * From verbs perspective an empty VLAN is equivalent
-	 * to a packet without VLAN layer.
-	 */
-	if (!eth.mask.vlan_tag)
-		return rte_flow_error_set(error, EINVAL,
-					  RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
-					  item->spec,
-					  "VLAN cannot be empty");
-	/* Outer TPID cannot be matched. */
-	if (eth.mask.ether_type)
-		return rte_flow_error_set(error, ENOTSUP,
-					  RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
-					  item->spec,
-					  "VLAN TPID matching is not"
-					  " supported");
-	if (!(flow->layers & l2m)) {
+	if (!flow->expand) {
+		/*
+		 * From verbs perspective an empty VLAN is equivalent
+		 * to a packet without VLAN layer.
+		 */
+		if (!eth.mask.vlan_tag)
+			return rte_flow_error_set(error, EINVAL,
+						  RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
+						  item->spec,
+						  "VLAN cannot be empty");
+		/* Outer TPID cannot be matched. */
+		if (eth.mask.ether_type)
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
+						  item->spec,
+						  "VLAN TPID matching is not"
+						  " supported");
+	}
+	if (!(layers & l2m)) {
 		if (size <= flow_size)
 			mlx5_flow_spec_verbs_add(flow, &eth, size);
 	} else {
-		if (flow->verbs.attr)
-			mlx5_flow_item_vlan_update(flow->verbs.attr, &eth);
+		if (verbs->attr)
+			mlx5_flow_item_vlan_update(verbs->attr, &eth);
 		size = 0; /**< Only an update is done in eth specification. */
 	}
-	flow->layers |= MLX5_FLOW_LAYER_OUTER_L2 |
-		MLX5_FLOW_LAYER_OUTER_VLAN;
+	mlx5_flow_layers_update(flow, MLX5_FLOW_LAYER_OUTER_L2 |
+				MLX5_FLOW_LAYER_OUTER_VLAN);
 	return size;
 }
 
@@ -582,25 +844,31 @@  mlx5_flow_item_ipv4(const struct rte_flow_item *item, struct rte_flow *flow,
 		.size = size,
 	};
 	int ret;
+	const uint32_t layers = mlx5_flow_layers(flow);
 
-	if (flow->layers & MLX5_FLOW_LAYER_OUTER_L3)
-		return rte_flow_error_set(error, ENOTSUP,
-					  RTE_FLOW_ERROR_TYPE_ITEM,
-					  item,
-					  "multiple L3 layers not supported");
-	else if (flow->layers & MLX5_FLOW_LAYER_OUTER_L4)
-		return rte_flow_error_set(error, ENOTSUP,
-					  RTE_FLOW_ERROR_TYPE_ITEM,
-					  item,
-					  "L3 cannot follow an L4 layer.");
-	if (!mask)
-		mask = &rte_flow_item_ipv4_mask;
-	ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
-				      (const uint8_t *)&nic_mask,
-				      sizeof(struct rte_flow_item_ipv4), error);
-	if (ret < 0)
-		return ret;
-	flow->layers |= MLX5_FLOW_LAYER_OUTER_L3_IPV4;
+	if (!flow->expand) {
+		if (layers & MLX5_FLOW_LAYER_OUTER_L3)
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  item,
+						  "multiple L3 layers not"
+						  " supported");
+		else if (layers & MLX5_FLOW_LAYER_OUTER_L4)
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  item,
+						  "L3 cannot follow an L4"
+						  " layer");
+		if (!mask)
+			mask = &rte_flow_item_ipv4_mask;
+		ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
+					      (const uint8_t *)&nic_mask,
+					      sizeof(struct rte_flow_item_ipv4),
+					      error);
+		if (ret < 0)
+			return ret;
+	}
+	mlx5_flow_layers_update(flow, MLX5_FLOW_LAYER_OUTER_L3_IPV4);
 	if (size > flow_size)
 		return size;
 	if (spec) {
@@ -667,25 +935,31 @@  mlx5_flow_item_ipv6(const struct rte_flow_item *item, struct rte_flow *flow,
 		.size = size,
 	};
 	int ret;
+	const uint32_t layers = mlx5_flow_layers(flow);
 
-	if (flow->layers & MLX5_FLOW_LAYER_OUTER_L3)
-		return rte_flow_error_set(error, ENOTSUP,
-					  RTE_FLOW_ERROR_TYPE_ITEM,
-					  item,
-					  "multiple L3 layers not supported");
-	else if (flow->layers & MLX5_FLOW_LAYER_OUTER_L4)
-		return rte_flow_error_set(error, ENOTSUP,
-					  RTE_FLOW_ERROR_TYPE_ITEM,
-					  item,
-					  "L3 cannot follow an L4 layer.");
-	if (!mask)
-		mask = &rte_flow_item_ipv6_mask;
-	ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
-				      (const uint8_t *)&nic_mask,
-				      sizeof(struct rte_flow_item_ipv6), error);
-	if (ret < 0)
-		return ret;
-	flow->layers |= MLX5_FLOW_LAYER_OUTER_L3_IPV6;
+	if (!flow->expand) {
+		if (layers & MLX5_FLOW_LAYER_OUTER_L3)
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  item,
+						  "multiple L3 layers not"
+						  " supported");
+		else if (layers & MLX5_FLOW_LAYER_OUTER_L4)
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  item,
+						  "L3 cannot follow an L4"
+						  " layer");
+		if (!mask)
+			mask = &rte_flow_item_ipv6_mask;
+		ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
+					      (const uint8_t *)&nic_mask,
+					      sizeof(struct rte_flow_item_ipv6),
+					      error);
+		if (ret < 0)
+			return ret;
+	}
+	mlx5_flow_layers_update(flow, MLX5_FLOW_LAYER_OUTER_L3_IPV6);
 	if (size > flow_size)
 		return size;
 	if (spec) {
@@ -759,25 +1033,31 @@  mlx5_flow_item_udp(const struct rte_flow_item *item, struct rte_flow *flow,
 		.size = size,
 	};
 	int ret;
+	const uint32_t layers = mlx5_flow_layers(flow);
 
-	if (!(flow->layers & MLX5_FLOW_LAYER_OUTER_L3))
-		return rte_flow_error_set(error, ENOTSUP,
-					  RTE_FLOW_ERROR_TYPE_ITEM,
-					  item,
-					  "L3 is mandatory to filter on L4");
-	if (flow->layers & MLX5_FLOW_LAYER_OUTER_L4)
-		return rte_flow_error_set(error, ENOTSUP,
-					  RTE_FLOW_ERROR_TYPE_ITEM,
-					  item,
-					  "L4 layer is already present");
-	if (!mask)
-		mask = &rte_flow_item_udp_mask;
-	ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
-				      (const uint8_t *)&rte_flow_item_udp_mask,
-				      sizeof(struct rte_flow_item_udp), error);
-	if (ret < 0)
-		return ret;
-	flow->layers |= MLX5_FLOW_LAYER_OUTER_L4_UDP;
+	if (!flow->expand) {
+		if (!(layers & MLX5_FLOW_LAYER_OUTER_L3))
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  item,
+						  "L3 is mandatory to filter"
+						  " on L4");
+		if (layers & MLX5_FLOW_LAYER_OUTER_L4)
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  item,
+						  "L4 layer is already"
+						  " present");
+		if (!mask)
+			mask = &rte_flow_item_udp_mask;
+		ret = mlx5_flow_item_validate
+			(item, (const uint8_t *)mask,
+			 (const uint8_t *)&rte_flow_item_udp_mask,
+			 sizeof(struct rte_flow_item_udp), error);
+		if (ret < 0)
+			return ret;
+	}
+	mlx5_flow_layers_update(flow, MLX5_FLOW_LAYER_OUTER_L4_UDP);
 	if (size > flow_size)
 		return size;
 	if (spec) {
@@ -821,25 +1101,31 @@  mlx5_flow_item_tcp(const struct rte_flow_item *item, struct rte_flow *flow,
 		.size = size,
 	};
 	int ret;
+	const uint32_t layers = mlx5_flow_layers(flow);
 
-	if (!(flow->layers & MLX5_FLOW_LAYER_OUTER_L3))
-		return rte_flow_error_set(error, ENOTSUP,
-					  RTE_FLOW_ERROR_TYPE_ITEM,
-					  item,
-					  "L3 is mandatory to filter on L4");
-	if (flow->layers & MLX5_FLOW_LAYER_OUTER_L4)
-		return rte_flow_error_set(error, ENOTSUP,
-					  RTE_FLOW_ERROR_TYPE_ITEM,
-					  item,
-					  "L4 layer is already present");
-	if (!mask)
-		mask = &rte_flow_item_tcp_mask;
-	ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
-				      (const uint8_t *)&rte_flow_item_tcp_mask,
-				      sizeof(struct rte_flow_item_tcp), error);
-	if (ret < 0)
-		return ret;
-	flow->layers |= MLX5_FLOW_LAYER_OUTER_L4_TCP;
+	if (!flow->expand) {
+		if (!(layers & MLX5_FLOW_LAYER_OUTER_L3))
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  item,
+						  "L3 is mandatory to filter"
+						  " on L4");
+		if (layers & MLX5_FLOW_LAYER_OUTER_L4)
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  item,
+						  "L4 layer is already"
+						  " present");
+		if (!mask)
+			mask = &rte_flow_item_tcp_mask;
+		ret = mlx5_flow_item_validate
+			(item, (const uint8_t *)mask,
+			 (const uint8_t *)&rte_flow_item_tcp_mask,
+			 sizeof(struct rte_flow_item_tcp), error);
+		if (ret < 0)
+			return ret;
+	}
+	mlx5_flow_layers_update(flow, MLX5_FLOW_LAYER_OUTER_L4_TCP);
 	if (size > flow_size)
 		return size;
 	if (spec) {
@@ -954,18 +1240,20 @@  mlx5_flow_action_drop(const struct rte_flow_action *actions,
 			.size = size,
 	};
 
-	if (flow->fate)
-		return rte_flow_error_set(error, ENOTSUP,
-					  RTE_FLOW_ERROR_TYPE_ACTION,
-					  actions,
-					  "multiple fate actions are not"
-					  " supported");
-	if (flow->modifier & (MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK))
-		return rte_flow_error_set(error, ENOTSUP,
-					  RTE_FLOW_ERROR_TYPE_ACTION,
-					  actions,
-					  "drop is not compatible with"
-					  " flag/mark action");
+	if (!flow->expand) {
+		if (flow->fate)
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ACTION,
+						  actions,
+						  "multiple fate actions are"
+						  " not supported");
+		if (flow->modifier & (MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK))
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ACTION,
+						  actions,
+						  "drop is not compatible with"
+						  " flag/mark action");
+	}
 	if (size < flow_size)
 		mlx5_flow_spec_verbs_add(flow, &drop, size);
 	flow->fate |= MLX5_FLOW_FATE_DROP;
@@ -998,6 +1286,8 @@  mlx5_flow_action_queue(struct rte_eth_dev *dev,
 	struct priv *priv = dev->data->dev_private;
 	const struct rte_flow_action_queue *queue = actions->conf;
 
+	if (flow->expand)
+		return 0;
 	if (flow->fate)
 		return rte_flow_error_set(error, ENOTSUP,
 					  RTE_FLOW_ERROR_TYPE_ACTION,
@@ -1014,11 +1304,162 @@  mlx5_flow_action_queue(struct rte_eth_dev *dev,
 					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
 					  &queue->index,
 					  "queue is not configured");
-	flow->queue = queue->index;
+	if (flow->queue)
+		(*flow->queue)[0] = queue->index;
+	flow->rss.queue_num = 1;
 	flow->fate |= MLX5_FLOW_FATE_QUEUE;
 	return 0;
 }
 
+/**
+ * Store the Verbs hash fields and priority according to the layer and types.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param flow
+ *   Pointer to flow structure.
+ * @param types
+ *   RSS types for this flow (see ETH_RSS_*).
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_action_rss_verbs_attr(struct rte_eth_dev *dev, struct rte_flow *flow,
+				uint32_t types)
+{
+	const uint32_t layers = mlx5_flow_layers(flow);
+	uint64_t hash_fields;
+	uint32_t priority;
+
+	if ((types & ETH_RSS_NONFRAG_IPV4_TCP) &&
+	    (layers & MLX5_FLOW_LAYER_OUTER_L4_TCP)) {
+		hash_fields = IBV_RX_HASH_SRC_IPV4 |
+			IBV_RX_HASH_DST_IPV4 |
+			IBV_RX_HASH_SRC_PORT_TCP |
+			IBV_RX_HASH_DST_PORT_TCP;
+		priority = 0;
+	} else if ((types & ETH_RSS_NONFRAG_IPV4_UDP) &&
+		 (layers & MLX5_FLOW_LAYER_OUTER_L4_UDP)) {
+		hash_fields = IBV_RX_HASH_SRC_IPV4 |
+			IBV_RX_HASH_DST_IPV4 |
+			IBV_RX_HASH_SRC_PORT_UDP |
+			IBV_RX_HASH_DST_PORT_UDP;
+		priority = 0;
+	} else if ((types & (ETH_RSS_IPV4 | ETH_RSS_FRAG_IPV4)) &&
+		 (layers & MLX5_FLOW_LAYER_OUTER_L3_IPV4)) {
+		hash_fields = IBV_RX_HASH_SRC_IPV4 |
+			IBV_RX_HASH_DST_IPV4;
+		priority = 1;
+	} else if ((types & ETH_RSS_NONFRAG_IPV6_TCP) &&
+		 (layers & MLX5_FLOW_LAYER_OUTER_L4_TCP)) {
+		hash_fields = IBV_RX_HASH_SRC_IPV6 |
+			IBV_RX_HASH_DST_IPV6 |
+			IBV_RX_HASH_SRC_PORT_TCP |
+			IBV_RX_HASH_DST_PORT_TCP;
+		priority = 0;
+	} else if ((types & ETH_RSS_NONFRAG_IPV6_UDP) &&
+		 (layers & MLX5_FLOW_LAYER_OUTER_L3_IPV6)) {
+		hash_fields = IBV_RX_HASH_SRC_IPV6 |
+			IBV_RX_HASH_DST_IPV6 |
+			IBV_RX_HASH_SRC_PORT_UDP |
+			IBV_RX_HASH_DST_PORT_UDP;
+		priority = 0;
+	} else if ((types & (ETH_RSS_IPV6 | ETH_RSS_FRAG_IPV6)) &&
+		 (layers & MLX5_FLOW_LAYER_OUTER_L3_IPV6)) {
+		hash_fields = IBV_RX_HASH_SRC_IPV6 |
+			IBV_RX_HASH_DST_IPV6;
+		priority = 1;
+	} else {
+		hash_fields = 0;
+		priority = 2;
+	}
+	flow->cur_verbs->hash_fields = hash_fields;
+	flow->cur_verbs->attr->priority =
+		mlx5_flow_priority(dev, flow->attributes.priority, priority);
+	return 0;
+}
+
+/**
+ * Validate action queue provided by the user.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param actions
+ *   Pointer to flow actions array.
+ * @param flow
+ *   Pointer to the rte_flow structure.
+ * @param error
+ *   Pointer to error structure.
+ */
+static int
+mlx5_flow_action_rss(struct rte_eth_dev *dev,
+		     const struct rte_flow_action *actions,
+		     struct rte_flow *flow,
+		     struct rte_flow_error *error)
+{
+	struct priv *priv = dev->data->dev_private;
+	const struct rte_flow_action_rss *rss = actions->conf;
+	unsigned int i;
+
+	if (flow->expand)
+		return 0;
+	if (flow->fate)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ACTION,
+					  actions,
+					  "multiple fate actions are not"
+					  " supported");
+	if (rss->func != RTE_ETH_HASH_FUNCTION_DEFAULT &&
+	    rss->func != RTE_ETH_HASH_FUNCTION_TOEPLITZ)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+					  &rss->func,
+					  "RSS hash function not supported");
+	if (rss->level > 1)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+					  &rss->level,
+					  "tunnel RSS is not supported");
+	if (rss->key_len < rss_hash_default_key_len)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+					  &rss->key_len,
+					  "RSS hash key too small");
+	if (rss->key_len > rss_hash_default_key_len)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+					  &rss->key_len,
+					  "RSS hash key too large");
+	if (rss->queue_num > priv->config.ind_table_max_size)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+					  &rss->queue_num,
+					  "number of queues too large");
+	if (rss->types & MLX5_RSS_HF_MASK)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+					  &rss->types,
+					  "some RSS protocols are not"
+					  " supported");
+	for (i = 0; i != rss->queue_num; ++i) {
+		if (!(*priv->rxqs)[rss->queue[i]])
+			return rte_flow_error_set
+				(error, EINVAL,
+				 RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+				 &rss->queue[i],
+				 "queue is not configured");
+	}
+	if (flow->queue)
+		memcpy((*flow->queue), rss->queue,
+		       rss->queue_num * sizeof(uint16_t));
+	flow->rss.queue_num = rss->queue_num;
+	memcpy(flow->key, rss->key, rss_hash_default_key_len);
+	flow->rss.types = rss->types;
+	flow->fate |= MLX5_FLOW_FATE_RSS;
+	return 0;
+}
+
 /**
  * Validate action flag provided by the user.
  *
@@ -1046,43 +1487,59 @@  mlx5_flow_action_flag(const struct rte_flow_action *actions,
 		.size = size,
 		.tag_id = mlx5_flow_mark_set(MLX5_FLOW_MARK_DEFAULT),
 	};
+	struct mlx5_flow_verbs *verbs = flow->cur_verbs;
 
-	if (flow->modifier & MLX5_FLOW_MOD_FLAG)
-		return rte_flow_error_set(error, ENOTSUP,
-					  RTE_FLOW_ERROR_TYPE_ACTION,
-					  actions,
-					  "flag action already present");
-	if (flow->fate & MLX5_FLOW_FATE_DROP)
-		return rte_flow_error_set(error, ENOTSUP,
-					  RTE_FLOW_ERROR_TYPE_ACTION,
-					  actions,
-					  "flag is not compatible with drop"
-					  " action");
-	if (flow->modifier & MLX5_FLOW_MOD_MARK)
-		return 0;
+	if (!flow->expand) {
+		if (flow->modifier & MLX5_FLOW_MOD_FLAG)
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ACTION,
+						  actions,
+						  "flag action already present");
+		if (flow->fate & MLX5_FLOW_FATE_DROP)
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ACTION,
+						  actions,
+						  "flag is not compatible with"
+						  " drop action");
+	}
+	/*
+	 * The two only possible cases, a mark has already been added in the
+	 * specification, in such case, the flag is already present in
+	 * addition of the mark.
+	 * Second case, has it is not possible to have two flags, it just
+	 * needs to add it.
+	 */
+	if (verbs) {
+		verbs->modifier |= MLX5_FLOW_MOD_FLAG;
+		if (verbs->modifier & MLX5_FLOW_MOD_MARK)
+			size = 0;
+		else if (size <= flow_size)
+			mlx5_flow_spec_verbs_add(flow, &tag, size);
+	} else {
+		if (flow->modifier & MLX5_FLOW_MOD_MARK)
+			size = 0;
+	}
 	flow->modifier |= MLX5_FLOW_MOD_FLAG;
-	if (size <= flow_size)
-		mlx5_flow_spec_verbs_add(flow, &tag, size);
 	return size;
 }
 
 /**
  * Update verbs specification to modify the flag to mark.
  *
- * @param flow
- *   Pointer to the rte_flow structure.
+ * @param verbs
+ *   Pointer to the mlx5_flow_verbs structure.
  * @param mark_id
  *   Mark identifier to replace the flag.
  */
 static void
-mlx5_flow_verbs_mark_update(struct rte_flow *flow, uint32_t mark_id)
+mlx5_flow_verbs_mark_update(struct mlx5_flow_verbs *verbs, uint32_t mark_id)
 {
 	struct ibv_spec_header *hdr;
 	int i;
 
 	/* Update Verbs specification. */
-	hdr = (struct ibv_spec_header *)flow->verbs.specs;
-	for (i = 0; i != flow->verbs.attr->num_of_specs; ++i) {
+	hdr = (struct ibv_spec_header *)verbs->specs;
+	for (i = 0; i != verbs->attr->num_of_specs; ++i) {
 		if (hdr->type == IBV_FLOW_SPEC_ACTION_TAG) {
 			struct ibv_flow_spec_action_tag *t =
 				(struct ibv_flow_spec_action_tag *)hdr;
@@ -1120,38 +1577,52 @@  mlx5_flow_action_mark(const struct rte_flow_action *actions,
 		.type = IBV_FLOW_SPEC_ACTION_TAG,
 		.size = size,
 	};
+	struct mlx5_flow_verbs *verbs = flow->cur_verbs;
 
-	if (!mark)
-		return rte_flow_error_set(error, EINVAL,
-					  RTE_FLOW_ERROR_TYPE_ACTION,
-					  actions,
-					  "configuration cannot be null");
-	if (mark->id >= MLX5_FLOW_MARK_MAX)
-		return rte_flow_error_set(error, EINVAL,
-					  RTE_FLOW_ERROR_TYPE_ACTION_CONF,
-					  &mark->id,
-					  "mark must be between 0 and"
-					  " 16777199");
-	if (flow->modifier & MLX5_FLOW_MOD_MARK)
-		return rte_flow_error_set(error, ENOTSUP,
-					  RTE_FLOW_ERROR_TYPE_ACTION,
-					  actions,
-					  "mark action already present");
-	if (flow->fate & MLX5_FLOW_FATE_DROP)
-		return rte_flow_error_set(error, ENOTSUP,
-					  RTE_FLOW_ERROR_TYPE_ACTION,
-					  actions,
-					  "mark is not compatible with drop"
-					  " action");
-	if (flow->modifier & MLX5_FLOW_MOD_FLAG) {
-		mlx5_flow_verbs_mark_update(flow, mark->id);
-		size = 0; /**< Only an update is done in the specification. */
-	} else {
-		tag.tag_id = mlx5_flow_mark_set(mark->id);
-		if (size <= flow_size) {
+	if (!flow->expand) {
+		if (!mark)
+			return rte_flow_error_set(error, EINVAL,
+						  RTE_FLOW_ERROR_TYPE_ACTION,
+						  actions,
+						  "configuration cannot be"
+						  " null");
+		if (mark->id >= MLX5_FLOW_MARK_MAX)
+			return rte_flow_error_set
+				(error, EINVAL,
+				 RTE_FLOW_ERROR_TYPE_ACTION_CONF,
+				 &mark->id,
+				 "mark must be between 0 and 16777199");
+		if (flow->modifier & MLX5_FLOW_MOD_MARK)
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ACTION,
+						  actions,
+						  "mark action already"
+						  " present");
+		if (flow->fate & MLX5_FLOW_FATE_DROP)
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ACTION,
+						  actions,
+						  "mark is not compatible with"
+						  " drop action");
+	}
+	/*
+	 * The two only possible cases, a flag has already been added in the
+	 * specification, in such case, it needs to be update to add the id.
+	 * Second case, has it is not possible to have two mark, it just
+	 * needs to add it.
+	 */
+	if (verbs) {
+		verbs->modifier |= MLX5_FLOW_MOD_MARK;
+		if (verbs->modifier & MLX5_FLOW_MOD_FLAG) {
+			mlx5_flow_verbs_mark_update(verbs, mark->id);
+			size = 0;
+		} else if (size <= flow_size) {
 			tag.tag_id = mlx5_flow_mark_set(mark->id);
 			mlx5_flow_spec_verbs_add(flow, &tag, size);
 		}
+	} else {
+		if (flow->modifier & MLX5_FLOW_MOD_FLAG)
+			size = 0;
 	}
 	flow->modifier |= MLX5_FLOW_MOD_MARK;
 	return size;
@@ -1185,6 +1656,15 @@  mlx5_flow_actions(struct rte_eth_dev *dev,
 	int remain = flow_size;
 	int ret = 0;
 
+	/*
+	 * FLAG/MARK are the only actions having a specification in Verbs and
+	 * not making part of the packet fate.  Due to this specificity and to
+	 * avoid extra variable, their bit in the flow->modifier bit-field are
+	 * disabled here to compute the exact necessary memory those action
+	 * needs.
+	 */
+	flow->modifier &= ~(MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK);
+	/* Process the actions. */
 	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
 		switch (actions->type) {
 		case RTE_FLOW_ACTION_TYPE_VOID:
@@ -1204,6 +1684,9 @@  mlx5_flow_actions(struct rte_eth_dev *dev,
 		case RTE_FLOW_ACTION_TYPE_QUEUE:
 			ret = mlx5_flow_action_queue(dev, actions, flow, error);
 			break;
+		case RTE_FLOW_ACTION_TYPE_RSS:
+			ret = mlx5_flow_action_rss(dev, actions, flow, error);
+			break;
 		default:
 			return rte_flow_error_set(error, ENOTSUP,
 						  RTE_FLOW_ERROR_TYPE_ACTION,
@@ -1257,27 +1740,92 @@  mlx5_flow_merge(struct rte_eth_dev *dev, struct rte_flow *flow,
 		struct rte_flow_error *error)
 {
 	struct rte_flow local_flow = { .layers = 0, };
-	size_t size = sizeof(*flow) + sizeof(struct ibv_flow_attr);
+	size_t size = sizeof(*flow);
 	int remain = (flow_size > size) ? flow_size - size : 0;
+	struct rte_flow_expand_rss *buf;
 	int ret;
+	uint32_t i;
 
 	if (!remain)
 		flow = &local_flow;
 	ret = mlx5_flow_attributes(dev, attr, flow, error);
 	if (ret < 0)
 		return ret;
-	ret = mlx5_flow_items(items, flow, remain, error);
-	if (ret < 0)
-		return ret;
-	size += ret;
-	remain = (flow_size > size) ? flow_size - size : 0;
-	ret = mlx5_flow_actions(dev, actions, flow, remain, error);
+	ret = mlx5_flow_actions(dev, actions, &local_flow, 0, error);
 	if (ret < 0)
 		return ret;
-	size += ret;
+	ret = rte_flow_expand_rss(NULL, 0, items, local_flow.rss.types,
+				  mlx5_support_expansion,
+				  local_flow.rss.level < 2 ?
+				  MLX5_EXPANSION_ROOT : MLX5_EXPANSION_ROOT2);
+	assert(ret > 0);
+	buf = rte_calloc(__func__, 1, ret, 0);
+	if (!buf) {
+		rte_flow_error_set(error, ENOMEM,
+				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+				   NULL,
+				   "not enough memory to expand the RSS flow");
+		goto error;
+	}
+	ret = rte_flow_expand_rss(buf, ret, items, local_flow.rss.types,
+				  mlx5_support_expansion,
+				  local_flow.rss.level < 2 ?
+				  MLX5_EXPANSION_ROOT : MLX5_EXPANSION_ROOT2);
+	assert(ret > 0);
+	size += RTE_ALIGN_CEIL(local_flow.rss.queue_num * sizeof(uint16_t),
+			       sizeof(void *));
 	if (size <= flow_size)
-		flow->verbs.attr->priority = flow->attributes.priority;
+		flow->queue = (void *)(flow + 1);
+	LIST_INIT(&flow->verbs);
+	flow->layers = 0;
+	flow->modifier = 0;
+	flow->fate = 0;
+	for (i = 0; i != buf->entries; ++i) {
+		size_t off = size;
+
+		size += sizeof(struct ibv_flow_attr) +
+			sizeof(struct mlx5_flow_verbs);
+		remain = (flow_size > size) ? flow_size - size : 0;
+		if (remain) {
+			flow->cur_verbs = (void *)((uintptr_t)flow + off);
+			flow->cur_verbs->attr = (void *)(flow->cur_verbs + 1);
+			flow->cur_verbs->specs =
+				(void *)(flow->cur_verbs->attr + 1);
+		}
+		ret = mlx5_flow_items
+			((const struct rte_flow_item *)buf->patterns[i],
+			 flow, remain, error);
+		if (ret < 0)
+			goto error;
+		size += ret;
+		if (remain > ret)
+			remain -= ret;
+		else
+			remain = 0;
+		ret = mlx5_flow_actions(dev, actions, flow, remain, error);
+		if (ret < 0)
+			goto error;
+		size += ret;
+		if (remain > ret)
+			remain -= ret;
+		else
+			remain = 0;
+		if (size <= flow_size) {
+			flow->cur_verbs->attr->priority =
+				flow->attributes.priority;
+			ret = mlx5_flow_action_rss_verbs_attr(dev, flow,
+							      flow->rss.types);
+			if (ret < 0)
+				goto error;
+			LIST_INSERT_HEAD(&flow->verbs, flow->cur_verbs, next);
+		}
+		flow->expand = !!(buf->entries > 1);
+	}
+	rte_free(buf);
 	return size;
+error:
+	rte_free(buf);
+	return ret;
 }
 
 /**
@@ -1292,9 +1840,13 @@  static void
 mlx5_flow_rxq_mark(struct rte_eth_dev *dev, struct rte_flow *flow)
 {
 	struct priv *priv = dev->data->dev_private;
+	const uint32_t mask = MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK;
+	uint32_t i;
 
-	(*priv->rxqs)[flow->queue]->mark |=
-		flow->modifier & (MLX5_FLOW_MOD_FLAG | MLX5_FLOW_MOD_MARK);
+	if (!(flow->modifier & mask))
+		return;
+	for (i = 0; i != flow->rss.queue_num; ++i)
+		(*priv->rxqs)[(*flow->queue)[i]]->mark = 1;
 }
 
 /**
@@ -1328,18 +1880,20 @@  mlx5_flow_validate(struct rte_eth_dev *dev,
 static void
 mlx5_flow_fate_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
 {
-	if (flow->fate & MLX5_FLOW_FATE_DROP) {
-		if (flow->verbs.flow) {
-			claim_zero(mlx5_glue->destroy_flow(flow->verbs.flow));
-			flow->verbs.flow = NULL;
+	struct mlx5_flow_verbs *verbs;
+
+	LIST_FOREACH(verbs, &flow->verbs, next) {
+		if (verbs->flow) {
+			claim_zero(mlx5_glue->destroy_flow(verbs->flow));
+			verbs->flow = NULL;
+		}
+		if (verbs->hrxq) {
+			if (flow->fate & MLX5_FLOW_FATE_DROP)
+				mlx5_hrxq_drop_release(dev, verbs->hrxq);
+			else
+				mlx5_hrxq_release(dev, verbs->hrxq);
+			verbs->hrxq = NULL;
 		}
-	}
-	if (flow->verbs.hrxq) {
-		if (flow->fate & MLX5_FLOW_FATE_DROP)
-			mlx5_hrxq_drop_release(dev, flow->verbs.hrxq);
-		else if (flow->fate & MLX5_FLOW_FATE_QUEUE)
-			mlx5_hrxq_release(dev, flow->verbs.hrxq);
-		flow->verbs.hrxq = NULL;
 	}
 }
 
@@ -1360,46 +1914,68 @@  static int
 mlx5_flow_fate_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
 		     struct rte_flow_error *error)
 {
-	if (flow->fate & MLX5_FLOW_FATE_DROP) {
-		flow->verbs.hrxq = mlx5_hrxq_drop_new(dev);
-		if (!flow->verbs.hrxq)
-			return rte_flow_error_set
-				(error, errno,
-				 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-				 NULL,
-				 "cannot allocate Drop queue");
-	} else if (flow->fate & MLX5_FLOW_FATE_QUEUE) {
-		struct mlx5_hrxq *hrxq;
-
-		hrxq = mlx5_hrxq_get(dev, rss_hash_default_key,
-				     rss_hash_default_key_len, 0,
-				     &flow->queue, 1, 0, 0);
-		if (!hrxq)
-			hrxq = mlx5_hrxq_new(dev, rss_hash_default_key,
-					     rss_hash_default_key_len, 0,
-					     &flow->queue, 1, 0, 0);
-		if (!hrxq)
-			return rte_flow_error_set(error, rte_errno,
-					RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-					NULL,
-					"cannot create flow");
-		flow->verbs.hrxq = hrxq;
-	}
-	flow->verbs.flow =
-		mlx5_glue->create_flow(flow->verbs.hrxq->qp, flow->verbs.attr);
-	if (!flow->verbs.flow) {
-		if (flow->fate & MLX5_FLOW_FATE_DROP)
-			mlx5_hrxq_drop_release(dev, flow->verbs.hrxq);
-		else
-			mlx5_hrxq_release(dev, flow->verbs.hrxq);
-		flow->verbs.hrxq = NULL;
-		return rte_flow_error_set(error, errno,
-					  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-					  NULL,
-					  "kernel module refuses to create"
-					  " flow");
+	struct mlx5_flow_verbs *verbs;
+	int err;
+
+	LIST_FOREACH(verbs, &flow->verbs, next) {
+		if (flow->fate & MLX5_FLOW_FATE_DROP) {
+			verbs->hrxq = mlx5_hrxq_drop_new(dev);
+			if (!verbs->hrxq) {
+				rte_flow_error_set
+					(error, errno,
+					 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+					 NULL,
+					 "cannot get drop hash queue");
+				goto error;
+			}
+		} else {
+			struct mlx5_hrxq *hrxq;
+
+			hrxq = mlx5_hrxq_get(dev, flow->key,
+					     rss_hash_default_key_len,
+					     verbs->hash_fields,
+					     (*flow->queue),
+					     flow->rss.queue_num, 0, 0);
+			if (!hrxq)
+				hrxq = mlx5_hrxq_new(dev, flow->key,
+						     rss_hash_default_key_len,
+						     verbs->hash_fields,
+						     (*flow->queue),
+						     flow->rss.queue_num, 0, 0);
+			if (!hrxq) {
+				rte_flow_error_set
+					(error, rte_errno,
+					 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+					 NULL,
+					 "cannot get hash queue");
+				goto error;
+			}
+			verbs->hrxq = hrxq;
+		}
+		verbs->flow =
+			mlx5_glue->create_flow(verbs->hrxq->qp, verbs->attr);
+		if (!verbs->flow) {
+			rte_flow_error_set(error, errno,
+					   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+					   NULL,
+					   "hardware refuses to create flow");
+			goto error;
+		}
 	}
 	return 0;
+error:
+	err = rte_errno; /* Save rte_errno before cleanup. */
+	LIST_FOREACH(verbs, &flow->verbs, next) {
+		if (verbs->hrxq) {
+			if (flow->fate & MLX5_FLOW_FATE_DROP)
+				mlx5_hrxq_drop_release(dev, verbs->hrxq);
+			else
+				mlx5_hrxq_release(dev, verbs->hrxq);
+			verbs->hrxq = NULL;
+		}
+	}
+	rte_errno = err; /* Restore rte_errno. */
+	return -rte_errno;
 }
 
 /**
@@ -1429,42 +2005,43 @@  mlx5_flow_list_create(struct rte_eth_dev *dev,
 		      const struct rte_flow_action actions[],
 		      struct rte_flow_error *error)
 {
-	struct rte_flow *flow;
-	size_t size;
+	struct rte_flow *flow = NULL;
+	size_t size = 0;
 	int ret;
 
-	ret = mlx5_flow_merge(dev, NULL, 0, attr, items, actions, error);
+	ret = mlx5_flow_merge(dev, flow, size, attr, items, actions, error);
 	if (ret < 0)
 		return NULL;
 	size = ret;
-	flow = rte_zmalloc(__func__, size, 0);
+	flow = rte_calloc(__func__, 1, size, 0);
 	if (!flow) {
 		rte_flow_error_set(error, ENOMEM,
 				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
 				   NULL,
-				   "cannot allocate memory");
+				   "not enough memory to create flow");
 		return NULL;
 	}
-	flow->verbs.attr = (struct ibv_flow_attr *)(flow + 1);
-	flow->verbs.specs = (uint8_t *)(flow->verbs.attr + 1);
 	ret = mlx5_flow_merge(dev, flow, size, attr, items, actions, error);
-	if (ret < 0)
-		goto error;
+	if (ret < 0) {
+		rte_free(flow);
+		return NULL;
+	}
 	assert((size_t)ret == size);
 	if (dev->data->dev_started) {
 		ret = mlx5_flow_fate_apply(dev, flow, error);
-		if (ret < 0)
-			goto error;
+		if (ret < 0) {
+			ret = rte_errno; /* Save rte_errno before cleanup. */
+			if (flow) {
+				mlx5_flow_fate_remove(dev, flow);
+				rte_free(flow);
+			}
+			rte_errno = ret; /* Restore rte_errno. */
+			return NULL;
+		}
 	}
 	mlx5_flow_rxq_mark(dev, flow);
 	TAILQ_INSERT_TAIL(list, flow, next);
 	return flow;
-error:
-	ret = rte_errno; /* Save rte_errno before cleanup. */
-	mlx5_flow_fate_remove(dev, flow);
-	rte_free(flow);
-	rte_errno = ret; /* Restore rte_errno. */
-	return NULL;
 }
 
 /**
@@ -1502,7 +2079,7 @@  mlx5_flow_list_destroy(struct rte_eth_dev *dev, struct mlx5_flows *list,
 	struct priv *priv = dev->data->dev_private;
 	struct rte_flow *rflow;
 	const uint32_t mask = MLX5_FLOW_MOD_FLAG & MLX5_FLOW_MOD_MARK;
-	int mark = 0;
+	unsigned int i;
 
 	mlx5_flow_fate_remove(dev, flow);
 	TAILQ_REMOVE(list, flow, next);
@@ -1512,18 +2089,28 @@  mlx5_flow_list_destroy(struct rte_eth_dev *dev, struct mlx5_flows *list,
 	}
 	/*
 	 * When a flow is removed and this flow has a flag/mark modifier, all
-	 * flows needs to be parse to verify if the Rx queue use by the flow
+	 * flows needs to be parse to verify if the Rx queues use by the flow
 	 * still need to track the flag/mark request.
 	 */
-	TAILQ_FOREACH(rflow, &priv->flows, next) {
-		if (!(rflow->modifier & mask))
-			continue;
-		if (flow->queue == rflow->queue) {
-			mark = 1;
-			break;
+	for (i = 0; i != flow->rss.queue_num; ++i) {
+		int mark = 0;
+
+		TAILQ_FOREACH(rflow, &priv->flows, next) {
+			unsigned int j;
+
+			if (!(rflow->modifier & mask))
+				continue;
+			for (j = 0; j != rflow->rss.queue_num; ++j) {
+				if ((*flow->queue)[i] == (*rflow->queue)[j]) {
+					mark = 1;
+					break;
+				}
+			}
+			if (mark)
+				break;
 		}
+		(*priv->rxqs)[i]->mark = !!mark;
 	}
-	(*priv->rxqs)[flow->queue]->mark = !!mark;
 	rte_free(flow);
 }
 
@@ -1654,7 +2241,7 @@  mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev,
 	struct priv *priv = dev->data->dev_private;
 	const struct rte_flow_attr attr = {
 		.ingress = 1,
-		.priority = priv->config.flow_prio - 1,
+		.priority = MLX5_FLOW_PRIO_RSVD,
 	};
 	struct rte_flow_item items[] = {
 		{