[v4] net/mlx5: support metadata as flow rule criteria

Message ID 1539777217-64116-1-git-send-email-dekelp@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Shahaf Shuler
Headers
Series [v4] net/mlx5: support metadata as flow rule criteria |

Checks

Context Check Description
ci/Intel-compilation success Compilation OK
ci/checkpatch success coding style OK

Commit Message

Dekel Peled Oct. 17, 2018, 11:53 a.m. UTC
  As described in series starting at [1], it adds option to set
metadata value as match pattern when creating a new flow rule.

This patch adds metadata support in mlx5 driver, in two parts:
- Add the validation and setting of metadata value in matcher,
  when creating a new flow rule.
- Add the passing of metadata value from mbuf to wqe when
  indicated by ol_flag, in different burst functions.

[1] "ethdev: support metadata as flow rule criteria"
    http://mails.dpdk.org/archives/dev/2018-October/115469.html

---
v4:
- Rebase.
- Apply code review comments.
v3:
- Update meta item validation.
v2:
- Split the support of egress rules to a different patch.
---
	
Signed-off-by: Dekel Peled <dekelp@mellanox.com>
---
 drivers/net/mlx5/mlx5_flow.c          |   2 +-
 drivers/net/mlx5/mlx5_flow.h          |   8 +++
 drivers/net/mlx5/mlx5_flow_dv.c       | 109 ++++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_prm.h           |   2 +-
 drivers/net/mlx5/mlx5_rxtx.c          |  33 ++++++++--
 drivers/net/mlx5/mlx5_rxtx_vec.c      |  38 +++++++++---
 drivers/net/mlx5/mlx5_rxtx_vec.h      |   1 +
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h |   9 ++-
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h  |  10 ++--
 drivers/net/mlx5/mlx5_txq.c           |   6 ++
 10 files changed, 192 insertions(+), 26 deletions(-)
  

Comments

Yongseok Koh Oct. 18, 2018, 8 a.m. UTC | #1
On Wed, Oct 17, 2018 at 02:53:37PM +0300, Dekel Peled wrote:
> As described in series starting at [1], it adds option to set
> metadata value as match pattern when creating a new flow rule.
> 
> This patch adds metadata support in mlx5 driver, in two parts:
> - Add the validation and setting of metadata value in matcher,
>   when creating a new flow rule.
> - Add the passing of metadata value from mbuf to wqe when
>   indicated by ol_flag, in different burst functions.
> 
> [1] "ethdev: support metadata as flow rule criteria"
>     http://mails.dpdk.org/archives/dev/2018-October/115469.html
> 
> ---
> v4:
> - Rebase.
> - Apply code review comments.
> v3:
> - Update meta item validation.
> v2:
> - Split the support of egress rules to a different patch.
> ---
> 	
> Signed-off-by: Dekel Peled <dekelp@mellanox.com>
> ---
>  drivers/net/mlx5/mlx5_flow.c          |   2 +-
>  drivers/net/mlx5/mlx5_flow.h          |   8 +++
>  drivers/net/mlx5/mlx5_flow_dv.c       | 109 ++++++++++++++++++++++++++++++++++
>  drivers/net/mlx5/mlx5_prm.h           |   2 +-
>  drivers/net/mlx5/mlx5_rxtx.c          |  33 ++++++++--
>  drivers/net/mlx5/mlx5_rxtx_vec.c      |  38 +++++++++---
>  drivers/net/mlx5/mlx5_rxtx_vec.h      |   1 +
>  drivers/net/mlx5/mlx5_rxtx_vec_neon.h |   9 ++-
>  drivers/net/mlx5/mlx5_rxtx_vec_sse.h  |  10 ++--
>  drivers/net/mlx5/mlx5_txq.c           |   6 ++
>  10 files changed, 192 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
> index bd70fce..15262f6 100644
> --- a/drivers/net/mlx5/mlx5_flow.c
> +++ b/drivers/net/mlx5/mlx5_flow.c
> @@ -418,7 +418,7 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
>   * @return
>   *   0 on success, a negative errno value otherwise and rte_errno is set.
>   */
> -static int
> +int
>  mlx5_flow_item_acceptable(const struct rte_flow_item *item,
>  			  const uint8_t *mask,
>  			  const uint8_t *nic_mask,
> diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
> index 094f666..834a6ed 100644
> --- a/drivers/net/mlx5/mlx5_flow.h
> +++ b/drivers/net/mlx5/mlx5_flow.h
> @@ -43,6 +43,9 @@
>  #define MLX5_FLOW_LAYER_GRE (1u << 14)
>  #define MLX5_FLOW_LAYER_MPLS (1u << 15)
>  
> +/* General pattern items bits. */
> +#define MLX5_FLOW_ITEM_METADATA (1u << 16)
> +
>  /* Outer Masks. */
>  #define MLX5_FLOW_LAYER_OUTER_L3 \
>  	(MLX5_FLOW_LAYER_OUTER_L3_IPV4 | MLX5_FLOW_LAYER_OUTER_L3_IPV6)
> @@ -307,6 +310,11 @@ int mlx5_flow_validate_action_rss(const struct rte_flow_action *action,
>  int mlx5_flow_validate_attributes(struct rte_eth_dev *dev,
>  				  const struct rte_flow_attr *attributes,
>  				  struct rte_flow_error *error);
> +int mlx5_flow_item_acceptable(const struct rte_flow_item *item,
> +			      const uint8_t *mask,
> +			      const uint8_t *nic_mask,
> +			      unsigned int size,
> +			      struct rte_flow_error *error);
>  int mlx5_flow_validate_item_eth(const struct rte_flow_item *item,
>  				uint64_t item_flags,
>  				struct rte_flow_error *error);
> diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
> index a013201..bfddfab 100644
> --- a/drivers/net/mlx5/mlx5_flow_dv.c
> +++ b/drivers/net/mlx5/mlx5_flow_dv.c
> @@ -36,6 +36,69 @@
>  #ifdef HAVE_IBV_FLOW_DV_SUPPORT
>  
>  /**
> + * Validate META item.
> + *
> + * @param[in] dev
> + *   Pointer to the rte_eth_dev structure.
> + * @param[in] item
> + *   Item specification.
> + * @param[in] attr
> + *   Attributes of flow that includes this item.
> + * @param[out] error
> + *   Pointer to error structure.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +flow_dv_validate_item_meta(struct rte_eth_dev *dev,
> +			   const struct rte_flow_item *item,
> +			   const struct rte_flow_attr *attr,
> +			   struct rte_flow_error *error)
> +{
> +	const struct rte_flow_item_meta *spec = item->spec;
> +	const struct rte_flow_item_meta *mask = item->mask;
> +

No blank line.

> +	const struct rte_flow_item_meta nic_mask = {
> +		.data = RTE_BE32(UINT32_MAX)
> +	};
> +

Ditto.

> +	int ret;
> +	uint64_t offloads = dev->data->dev_conf.txmode.offloads;
> +
> +	if (!(offloads & DEV_TX_OFFLOAD_MATCH_METADATA))
> +		return rte_flow_error_set(error, EPERM,
> +					  RTE_FLOW_ERROR_TYPE_ITEM,
> +					  NULL,
> +					  "match on metadata offload "
> +					  "configuration is off for this port");
> +	if (!spec)
> +		return rte_flow_error_set(error, EINVAL,
> +					  RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
> +					  item->spec,
> +					  "data cannot be empty");
> +	if (!spec->data)
> +		return rte_flow_error_set(error, EINVAL,
> +					  RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
> +					  NULL,
> +					  "data cannot be zero");
> +	if (!mask)
> +		mask = &rte_flow_item_meta_mask;
> +	ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask,
> +					(const uint8_t *)&nic_mask,
> +					sizeof(struct rte_flow_item_meta),
> +					error);
> +	if (ret < 0)
> +		return ret;
> +	if (attr->ingress)
> +		return rte_flow_error_set(error, ENOTSUP,
> +					  RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
> +					  NULL,
> +					  "pattern not supported for ingress");
> +	return 0;
> +}
> +
> +/**
>   * Verify the @p attributes will be correctly understood by the NIC and store
>   * them in the @p flow if everything is correct.
>   *
> @@ -214,6 +277,13 @@
>  				return ret;
>  			item_flags |= MLX5_FLOW_LAYER_MPLS;
>  			break;
> +		case RTE_FLOW_ITEM_TYPE_META:
> +			ret = flow_dv_validate_item_meta(dev, items, attr,
> +							 error);
> +			if (ret < 0)
> +				return ret;
> +			item_flags |= MLX5_FLOW_ITEM_METADATA;
> +			break;
>  		default:
>  			return rte_flow_error_set(error, ENOTSUP,
>  						  RTE_FLOW_ERROR_TYPE_ITEM,
> @@ -855,6 +925,42 @@
>  }
>  
>  /**
> + * Add META item to matcher
> + *
> + * @param[in, out] matcher
> + *   Flow matcher.
> + * @param[in, out] key
> + *   Flow matcher value.
> + * @param[in] item
> + *   Flow pattern to translate.
> + * @param[in] inner
> + *   Item is inner pattern.
> + */
> +static void
> +flow_dv_translate_item_meta(void *matcher, void *key,
> +				const struct rte_flow_item *item)
> +{
> +	const struct rte_flow_item_meta *meta_m;
> +	const struct rte_flow_item_meta *meta_v;
> +
> +	void *misc2_m =
> +		MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters_2);
> +	void *misc2_v =
> +		MLX5_ADDR_OF(fte_match_param, key, misc_parameters_2);
> +
> +	meta_m = (const void *)item->mask;
> +	if (!meta_m)
> +		meta_m = &rte_flow_item_meta_mask;
> +	meta_v = (const void *)item->spec;
> +	if (meta_v) {
> +		MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_a,
> +			RTE_BE32(meta_m->data));

Nope. RTE_BE32() is for builtin constant, not for a variable.
You should use rte_cpu_to_be_32() instead.

> +		MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_a,
> +			RTE_BE32(meta_v->data));

Same here.

> +	}
> +}
> +
> +/**
>   * Update the matcher and the value based the selected item.
>   *
>   * @param[in, out] matcher
> @@ -940,6 +1046,9 @@
>  		flow_dv_translate_item_vxlan(tmatcher->mask.buf, key, item,
>  					     inner);
>  		break;
> +	case RTE_FLOW_ITEM_TYPE_META:
> +		flow_dv_translate_item_meta(tmatcher->mask.buf, key, item);
> +		break;
>  	default:
>  		break;
>  	}
> diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
> index 69296a0..29742b1 100644
> --- a/drivers/net/mlx5/mlx5_prm.h
> +++ b/drivers/net/mlx5/mlx5_prm.h
> @@ -159,7 +159,7 @@ struct mlx5_wqe_eth_seg_small {
>  	uint8_t	cs_flags;
>  	uint8_t	rsvd1;
>  	uint16_t mss;
> -	uint32_t rsvd2;
> +	uint32_t flow_table_metadata;
>  	uint16_t inline_hdr_sz;
>  	uint8_t inline_hdr[2];
>  } __rte_aligned(MLX5_WQE_DWORD_SIZE);
> diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
> index 558e6b6..5b4d2fd 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -523,6 +523,7 @@
>  		uint8_t tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG);
>  		uint32_t swp_offsets = 0;
>  		uint8_t swp_types = 0;
> +		uint32_t metadata;
>  		uint16_t tso_segsz = 0;
>  #ifdef MLX5_PMD_SOFT_COUNTERS
>  		uint32_t total_length = 0;
> @@ -566,6 +567,10 @@
>  		cs_flags = txq_ol_cksum_to_cs(buf);
>  		txq_mbuf_to_swp(txq, buf, (uint8_t *)&swp_offsets, &swp_types);
>  		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
> +		/* Copy metadata from mbuf if valid */
> +		metadata = buf->ol_flags & PKT_TX_METADATA ?
> +						buf->tx_metadata : 0;

Indentation.

> +

No blank line.

>  		/* Replace the Ethernet type by the VLAN if necessary. */
>  		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
>  			uint32_t vlan = rte_cpu_to_be_32(0x81000000 |
> @@ -781,7 +786,7 @@
>  				swp_offsets,
>  				cs_flags | (swp_types << 8) |
>  				(rte_cpu_to_be_16(tso_segsz) << 16),
> -				0,
> +				rte_cpu_to_be_32(metadata),
>  				(ehdr << 16) | rte_cpu_to_be_16(tso_header_sz),
>  			};
>  		} else {
> @@ -795,7 +800,7 @@
>  			wqe->eseg = (rte_v128u32_t){
>  				swp_offsets,
>  				cs_flags | (swp_types << 8),
> -				0,
> +				rte_cpu_to_be_32(metadata),
>  				(ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz),
>  			};
>  		}
> @@ -861,7 +866,7 @@
>  	mpw->wqe->eseg.inline_hdr_sz = 0;
>  	mpw->wqe->eseg.rsvd0 = 0;
>  	mpw->wqe->eseg.rsvd1 = 0;
> -	mpw->wqe->eseg.rsvd2 = 0;
> +	mpw->wqe->eseg.flow_table_metadata = 0;
>  	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
>  					     (txq->wqe_ci << 8) |
>  					     MLX5_OPCODE_TSO);
> @@ -948,6 +953,7 @@
>  		uint32_t length;
>  		unsigned int segs_n = buf->nb_segs;
>  		uint32_t cs_flags;
> +		uint32_t metadata;
>  
>  		/*
>  		 * Make sure there is enough room to store this packet and
> @@ -964,6 +970,9 @@
>  		max_elts -= segs_n;
>  		--pkts_n;
>  		cs_flags = txq_ol_cksum_to_cs(buf);
> +		/* Copy metadata from mbuf if valid */
> +		metadata = buf->ol_flags & PKT_TX_METADATA ?
> +						buf->tx_metadata : 0;

Indentation.
And doesn't this need to be converted to big-endian? I think it does.

>  		/* Retrieve packet information. */
>  		length = PKT_LEN(buf);
>  		assert(length);
> @@ -971,6 +980,7 @@
>  		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
>  		    ((mpw.len != length) ||
>  		     (segs_n != 1) ||
> +		     (mpw.wqe->eseg.flow_table_metadata != metadata) ||
>  		     (mpw.wqe->eseg.cs_flags != cs_flags)))
>  			mlx5_mpw_close(txq, &mpw);
>  		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
> @@ -984,6 +994,7 @@
>  			max_wqe -= 2;
>  			mlx5_mpw_new(txq, &mpw, length);
>  			mpw.wqe->eseg.cs_flags = cs_flags;
> +			mpw.wqe->eseg.flow_table_metadata = metadata;
>  		}
>  		/* Multi-segment packets must be alone in their MPW. */
>  		assert((segs_n == 1) || (mpw.pkts_n == 0));
> @@ -1082,7 +1093,7 @@
>  	mpw->wqe->eseg.cs_flags = 0;
>  	mpw->wqe->eseg.rsvd0 = 0;
>  	mpw->wqe->eseg.rsvd1 = 0;
> -	mpw->wqe->eseg.rsvd2 = 0;
> +	mpw->wqe->eseg.flow_table_metadata = 0;
>  	inl = (struct mlx5_wqe_inl_small *)
>  		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
>  	mpw->data.raw = (uint8_t *)&inl->raw;
> @@ -1172,6 +1183,7 @@
>  		uint32_t length;
>  		unsigned int segs_n = buf->nb_segs;
>  		uint8_t cs_flags;
> +		uint32_t metadata;
>  
>  		/*
>  		 * Make sure there is enough room to store this packet and
> @@ -1193,18 +1205,23 @@
>  		 */
>  		max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
>  		cs_flags = txq_ol_cksum_to_cs(buf);
> +		/* Copy metadata from mbuf if valid */
> +		metadata = buf->ol_flags & PKT_TX_METADATA ?
> +						buf->tx_metadata : 0;

Indentation.
And no need to change to big-endian?

>  		/* Retrieve packet information. */
>  		length = PKT_LEN(buf);
>  		/* Start new session if packet differs. */
>  		if (mpw.state == MLX5_MPW_STATE_OPENED) {
>  			if ((mpw.len != length) ||
>  			    (segs_n != 1) ||
> +			    (mpw.wqe->eseg.flow_table_metadata != metadata) ||
>  			    (mpw.wqe->eseg.cs_flags != cs_flags))
>  				mlx5_mpw_close(txq, &mpw);
>  		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
>  			if ((mpw.len != length) ||
>  			    (segs_n != 1) ||
>  			    (length > inline_room) ||
> +			    (mpw.wqe->eseg.flow_table_metadata != metadata) ||
>  			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
>  				mlx5_mpw_inline_close(txq, &mpw);
>  				inline_room =
> @@ -1224,12 +1241,14 @@
>  				max_wqe -= 2;
>  				mlx5_mpw_new(txq, &mpw, length);
>  				mpw.wqe->eseg.cs_flags = cs_flags;
> +				mpw.wqe->eseg.flow_table_metadata = metadata;
>  			} else {
>  				if (unlikely(max_wqe < wqe_inl_n))
>  					break;
>  				max_wqe -= wqe_inl_n;
>  				mlx5_mpw_inline_new(txq, &mpw, length);
>  				mpw.wqe->eseg.cs_flags = cs_flags;
> +				mpw.wqe->eseg.flow_table_metadata = metadata;
>  			}
>  		}
>  		/* Multi-segment packets must be alone in their MPW. */
> @@ -1461,6 +1480,7 @@
>  		unsigned int do_inline = 0; /* Whether inline is possible. */
>  		uint32_t length;
>  		uint8_t cs_flags;
> +		uint32_t metadata;
>  
>  		/* Multi-segmented packet is handled in slow-path outside. */
>  		assert(NB_SEGS(buf) == 1);
> @@ -1468,6 +1488,9 @@
>  		if (max_elts - j == 0)
>  			break;
>  		cs_flags = txq_ol_cksum_to_cs(buf);
> +		/* Copy metadata from mbuf if valid */
> +		metadata = buf->ol_flags & PKT_TX_METADATA ?
> +						buf->tx_metadata : 0;

Indentation.
And no need to change to big-endian?

>  		/* Retrieve packet information. */
>  		length = PKT_LEN(buf);
>  		/* Start new session if:
> @@ -1482,6 +1505,7 @@
>  			    (length <= txq->inline_max_packet_sz &&
>  			     inl_pad + sizeof(inl_hdr) + length >
>  			     mpw_room) ||
> +			     (mpw.wqe->eseg.flow_table_metadata != metadata) ||
>  			    (mpw.wqe->eseg.cs_flags != cs_flags))
>  				max_wqe -= mlx5_empw_close(txq, &mpw);
>  		}
> @@ -1505,6 +1529,7 @@
>  				    sizeof(inl_hdr) + length <= mpw_room &&
>  				    !txq->mpw_hdr_dseg;
>  			mpw.wqe->eseg.cs_flags = cs_flags;
> +			mpw.wqe->eseg.flow_table_metadata = metadata;
>  		} else {
>  			/* Evaluate whether the next packet can be inlined.
>  			 * Inlininig is possible when:
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c
> index 0a4aed8..16a8608 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec.c
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
> @@ -41,6 +41,8 @@
>  
>  /**
>   * Count the number of packets having same ol_flags and calculate cs_flags.
> + * If PKT_TX_METADATA is set in ol_flags, packets must have same metadata
> + * as well.

Packets can have different metadata; we just want to count the number of
consecutive packets having the same metadata. Please correct the comment.

>   *
>   * @param pkts
>   *   Pointer to array of packets.
> @@ -48,26 +50,41 @@
>   *   Number of packets.
>   * @param cs_flags
>   *   Pointer of flags to be returned.
> + * @param metadata
> + *   Pointer of metadata to be returned.
> + * @param txq_offloads
> + *   Offloads enabled on Tx queue
>   *
>   * @return
> - *   Number of packets having same ol_flags.
> + *   Number of packets having same ol_flags and metadata, if relevant.
>   */
>  static inline unsigned int
> -txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t *cs_flags)
> +txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t *cs_flags,
> +		 uint32_t *metadata, const uint64_t txq_offloads)
>  {
>  	unsigned int pos;
>  	const uint64_t ol_mask =
>  		PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM |
>  		PKT_TX_UDP_CKSUM | PKT_TX_TUNNEL_GRE |
> -		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM;
> +		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM | PKT_TX_METADATA;

PKT_TX_METADATA shouldn't be added here. Since this mask is only for checksum
flags, you might rather want to rename it, e.g., cksum_ol_mask.

>  
>  	if (!pkts_n)
>  		return 0;
>  	/* Count the number of packets having same ol_flags. */

This comment has to be corrected and moved.

> -	for (pos = 1; pos < pkts_n; ++pos)
> -		if ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask)
> +	for (pos = 1; pos < pkts_n; ++pos) {
> +		if ((txq_offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP) &&
> +			((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask))

Indentation.

>  			break;
> +		/* If the metadata ol_flag is set,
> +		 *  metadata must be same in all packets.
> +		 */

Please correct the comment. The first line of a multi-line comment should be empty.
Also, it can't say 'must'. We are not enforcing it, just counting the number of
packets having the same metadata, as I mentioned above.

> +		if ((txq_offloads & DEV_TX_OFFLOAD_MATCH_METADATA) &&
> +			(pkts[pos]->ol_flags & PKT_TX_METADATA) &&
> +			pkts[0]->tx_metadata != pkts[pos]->tx_metadata)

Disagree. What if pkts[0] doesn't have PKT_TX_METADATA set while pkts[1] does?
And, indentation.

> +			break;
> +	}
>  	*cs_flags = txq_ol_cksum_to_cs(pkts[0]);
> +	*metadata = rte_cpu_to_be_32(pkts[0]->tx_metadata);

Same here. You should check if pkts[0] has metadata first.

>  	return pos;

Here's my suggestion for the whole func.

	unsigned int pos;
	const uint64_t cksum_ol_mask =
		PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM |
		PKT_TX_UDP_CKSUM | PKT_TX_TUNNEL_GRE |
		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM;
	const uint32_t p0_metadata;

	if (!pkts_n)
		return 0;
	p0_metadata = pkts[0]->ol_flags & PKT_TX_METADATA ?
		      pkts[0]->tx_metadata : 0;
	/* Count the number of packets having same offload parameters. */
	for (pos = 1; pos < pkts_n; ++pos) {
		/* Check if packet can have same checksum flags. */
		if ((txq_offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP) &&
		    ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & cksum_ol_mask))
			break;
		/* Check if packet has same metadata. */
		if (txq_offloads & DEV_TX_OFFLOAD_MATCH_METADATA) {
			const uint32_t p1_metadata =
				pkts[pos]->ol_flags & PKT_TX_METADATA ?
				pkts[pos]->tx_metadata : 0;

			if (p1_metadata != p0_metadata)
				break;
		}
	}
	*cs_flags = txq_ol_cksum_to_cs(pkts[0]);
	*metadata = rte_cpu_to_be_32(p0_metadata);
	return pos;
>  }
>  
> @@ -96,7 +113,7 @@
>  		uint16_t ret;
>  
>  		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
> -		ret = txq_burst_v(txq, &pkts[nb_tx], n, 0);
> +		ret = txq_burst_v(txq, &pkts[nb_tx], n, 0, 0);
>  		nb_tx += ret;
>  		if (!ret)
>  			break;
> @@ -127,6 +144,7 @@
>  		uint8_t cs_flags = 0;
>  		uint16_t n;
>  		uint16_t ret;
> +		uint32_t metadata = 0;

Let's use rte_be32_t instead.

>  
>  		/* Transmit multi-seg packets in the head of pkts list. */
>  		if ((txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS) &&
> @@ -137,9 +155,11 @@
>  		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
>  		if (txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
>  			n = txq_count_contig_single_seg(&pkts[nb_tx], n);
> -		if (txq->offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP)
> -			n = txq_calc_offload(&pkts[nb_tx], n, &cs_flags);
> -		ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags);
> +		if (txq->offloads & (MLX5_VEC_TX_CKSUM_OFFLOAD_CAP |
> +				DEV_TX_OFFLOAD_MATCH_METADATA))

Indentation.

> +			n = txq_calc_offload(&pkts[nb_tx], n,
> +					&cs_flags, &metadata, txq->offloads);

Indentation.

> +		ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags, metadata);
>  		nb_tx += ret;
>  		if (!ret)
>  			break;
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h
> index fb884f9..fda7004 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec.h
> @@ -22,6 +22,7 @@
>  /* HW offload capabilities of vectorized Tx. */
>  #define MLX5_VEC_TX_OFFLOAD_CAP \
>  	(MLX5_VEC_TX_CKSUM_OFFLOAD_CAP | \
> +	 DEV_TX_OFFLOAD_MATCH_METADATA | \
>  	 DEV_TX_OFFLOAD_MULTI_SEGS)
>  
>  /*
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> index b37b738..a8a4d7b 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> @@ -201,13 +201,15 @@
>   *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
>   * @param cs_flags
>   *   Checksum offload flags to be written in the descriptor.
> + * @param metadata
> + *   Metadata value to be written in the descriptor.
>   *
>   * @return
>   *   Number of packets successfully transmitted (<= pkts_n).
>   */
>  static inline uint16_t
>  txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
> -	    uint8_t cs_flags)
> +	    uint8_t cs_flags, uint32_t metadata)

Let's use rte_be32_t instead.

>  {
>  	struct rte_mbuf **elts;
>  	uint16_t elts_head = txq->elts_head;
> @@ -294,10 +296,7 @@
>  	vst1q_u8((void *)t_wqe, ctrl);
>  	/* Fill ESEG in the header. */
>  	vst1q_u8((void *)(t_wqe + 1),
> -		 ((uint8x16_t) { 0, 0, 0, 0,
> -				 cs_flags, 0, 0, 0,
> -				 0, 0, 0, 0,
> -				 0, 0, 0, 0 }));
> +		 ((uint32x4_t) { 0, cs_flags, metadata, 0 }));
>  #ifdef MLX5_PMD_SOFT_COUNTERS
>  	txq->stats.opackets += pkts_n;
>  #endif
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> index 54b3783..31aae4a 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> @@ -202,13 +202,15 @@
>   *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
>   * @param cs_flags
>   *   Checksum offload flags to be written in the descriptor.
> + * @param metadata
> + *   Metadata value to be written in the descriptor.
>   *
>   * @return
>   *   Number of packets successfully transmitted (<= pkts_n).
>   */
>  static inline uint16_t
>  txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
> -	    uint8_t cs_flags)
> +	    uint8_t cs_flags, uint32_t metadata)

Let's use rte_be32_t instead.

>  {
>  	struct rte_mbuf **elts;
>  	uint16_t elts_head = txq->elts_head;
> @@ -292,11 +294,7 @@
>  	ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
>  	_mm_store_si128(t_wqe, ctrl);
>  	/* Fill ESEG in the header. */
> -	_mm_store_si128(t_wqe + 1,
> -			_mm_set_epi8(0, 0, 0, 0,
> -				     0, 0, 0, 0,
> -				     0, 0, 0, cs_flags,
> -				     0, 0, 0, 0));
> +	_mm_store_si128(t_wqe + 1, _mm_set_epi32(0, metadata, cs_flags, 0));
>  #ifdef MLX5_PMD_SOFT_COUNTERS
>  	txq->stats.opackets += pkts_n;
>  #endif
> diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
> index f9bc473..7263fb1 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -128,6 +128,12 @@
>  			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
>  				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
>  	}
> +

Please no blank line.

> +#ifdef HAVE_IBV_FLOW_DV_SUPPORT
> +	if (config->dv_flow_en)
> +		offloads |= DEV_TX_OFFLOAD_MATCH_METADATA;
> +#endif
> +

Same here.

>  	return offloads;
>  }
>  
> -- 
> 1.8.3.1
>
  
Dekel Peled Oct. 21, 2018, 1:44 p.m. UTC | #2
Thanks, PSB.

> -----Original Message-----
> From: Yongseok Koh
> Sent: Thursday, October 18, 2018 11:01 AM
> To: Dekel Peled <dekelp@mellanox.com>
> Cc: Shahaf Shuler <shahafs@mellanox.com>; dev@dpdk.org; Ori Kam
> <orika@mellanox.com>
> Subject: Re: [PATCH v4] net/mlx5: support metadata as flow rule criteria
> 
> On Wed, Oct 17, 2018 at 02:53:37PM +0300, Dekel Peled wrote:
> > As described in series starting at [1], it adds option to set metadata
> > value as match pattern when creating a new flow rule.
> >
> > This patch adds metadata support in mlx5 driver, in two parts:
> > - Add the validation and setting of metadata value in matcher,
> >   when creating a new flow rule.
> > - Add the passing of metadata value from mbuf to wqe when
> >   indicated by ol_flag, in different burst functions.
> >
> > [1] "ethdev: support metadata as flow rule criteria"
> >     http://mails.dpdk.org/archives/dev/2018-October/115469.html
> >
> > ---
> > v4:
> > - Rebase.
> > - Apply code review comments.
> > v3:
> > - Update meta item validation.
> > v2:
> > - Split the support of egress rules to a different patch.
> > ---
> >
> > Signed-off-by: Dekel Peled <dekelp@mellanox.com>
> > ---
> >  drivers/net/mlx5/mlx5_flow.c          |   2 +-
> >  drivers/net/mlx5/mlx5_flow.h          |   8 +++
> >  drivers/net/mlx5/mlx5_flow_dv.c       | 109
> ++++++++++++++++++++++++++++++++++
> >  drivers/net/mlx5/mlx5_prm.h           |   2 +-
> >  drivers/net/mlx5/mlx5_rxtx.c          |  33 ++++++++--
> >  drivers/net/mlx5/mlx5_rxtx_vec.c      |  38 +++++++++---
> >  drivers/net/mlx5/mlx5_rxtx_vec.h      |   1 +
> >  drivers/net/mlx5/mlx5_rxtx_vec_neon.h |   9 ++-
> >  drivers/net/mlx5/mlx5_rxtx_vec_sse.h  |  10 ++--
> >  drivers/net/mlx5/mlx5_txq.c           |   6 ++
> >  10 files changed, 192 insertions(+), 26 deletions(-)
> >
> > diff --git a/drivers/net/mlx5/mlx5_flow.c
> > b/drivers/net/mlx5/mlx5_flow.c index bd70fce..15262f6 100644
> > --- a/drivers/net/mlx5/mlx5_flow.c
> > +++ b/drivers/net/mlx5/mlx5_flow.c
> > @@ -418,7 +418,7 @@ uint32_t mlx5_flow_adjust_priority(struct
> rte_eth_dev *dev, int32_t priority,
> >   * @return
> >   *   0 on success, a negative errno value otherwise and rte_errno is set.
> >   */
> > -static int
> > +int
> >  mlx5_flow_item_acceptable(const struct rte_flow_item *item,
> >  			  const uint8_t *mask,
> >  			  const uint8_t *nic_mask,
> > diff --git a/drivers/net/mlx5/mlx5_flow.h
> > b/drivers/net/mlx5/mlx5_flow.h index 094f666..834a6ed 100644
> > --- a/drivers/net/mlx5/mlx5_flow.h
> > +++ b/drivers/net/mlx5/mlx5_flow.h
> > @@ -43,6 +43,9 @@
> >  #define MLX5_FLOW_LAYER_GRE (1u << 14)  #define
> MLX5_FLOW_LAYER_MPLS
> > (1u << 15)
> >
> > +/* General pattern items bits. */
> > +#define MLX5_FLOW_ITEM_METADATA (1u << 16)
> > +
> >  /* Outer Masks. */
> >  #define MLX5_FLOW_LAYER_OUTER_L3 \
> >  	(MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
> MLX5_FLOW_LAYER_OUTER_L3_IPV6) @@
> > -307,6 +310,11 @@ int mlx5_flow_validate_action_rss(const struct
> > rte_flow_action *action,  int mlx5_flow_validate_attributes(struct
> rte_eth_dev *dev,
> >  				  const struct rte_flow_attr *attributes,
> >  				  struct rte_flow_error *error);
> > +int mlx5_flow_item_acceptable(const struct rte_flow_item *item,
> > +			      const uint8_t *mask,
> > +			      const uint8_t *nic_mask,
> > +			      unsigned int size,
> > +			      struct rte_flow_error *error);
> >  int mlx5_flow_validate_item_eth(const struct rte_flow_item *item,
> >  				uint64_t item_flags,
> >  				struct rte_flow_error *error);
> > diff --git a/drivers/net/mlx5/mlx5_flow_dv.c
> > b/drivers/net/mlx5/mlx5_flow_dv.c index a013201..bfddfab 100644
> > --- a/drivers/net/mlx5/mlx5_flow_dv.c
> > +++ b/drivers/net/mlx5/mlx5_flow_dv.c
> > @@ -36,6 +36,69 @@
> >  #ifdef HAVE_IBV_FLOW_DV_SUPPORT
> >
> >  /**
> > + * Validate META item.
> > + *
> > + * @param[in] dev
> > + *   Pointer to the rte_eth_dev structure.
> > + * @param[in] item
> > + *   Item specification.
> > + * @param[in] attr
> > + *   Attributes of flow that includes this item.
> > + * @param[out] error
> > + *   Pointer to error structure.
> > + *
> > + * @return
> > + *   0 on success, a negative errno value otherwise and rte_errno is set.
> > + */
> > +static int
> > +flow_dv_validate_item_meta(struct rte_eth_dev *dev,
> > +			   const struct rte_flow_item *item,
> > +			   const struct rte_flow_attr *attr,
> > +			   struct rte_flow_error *error)
> > +{
> > +	const struct rte_flow_item_meta *spec = item->spec;
> > +	const struct rte_flow_item_meta *mask = item->mask;
> > +
> 
> No blank line.

Removed.

> 
> > +	const struct rte_flow_item_meta nic_mask = {
> > +		.data = RTE_BE32(UINT32_MAX)
> > +	};
> > +
> 
> Ditto.

Removed.

> 
> > +	int ret;
> > +	uint64_t offloads = dev->data->dev_conf.txmode.offloads;
> > +
> > +	if (!(offloads & DEV_TX_OFFLOAD_MATCH_METADATA))
> > +		return rte_flow_error_set(error, EPERM,
> > +					  RTE_FLOW_ERROR_TYPE_ITEM,
> > +					  NULL,
> > +					  "match on metadata offload "
> > +					  "configuration is off for this port");
> > +	if (!spec)
> > +		return rte_flow_error_set(error, EINVAL,
> > +
> RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
> > +					  item->spec,
> > +					  "data cannot be empty");
> > +	if (!spec->data)
> > +		return rte_flow_error_set(error, EINVAL,
> > +
> RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
> > +					  NULL,
> > +					  "data cannot be zero");
> > +	if (!mask)
> > +		mask = &rte_flow_item_meta_mask;
> > +	ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask,
> > +					(const uint8_t *)&nic_mask,
> > +					sizeof(struct rte_flow_item_meta),
> > +					error);
> > +	if (ret < 0)
> > +		return ret;
> > +	if (attr->ingress)
> > +		return rte_flow_error_set(error, ENOTSUP,
> > +
> RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
> > +					  NULL,
> > +					  "pattern not supported for
> ingress");
> > +	return 0;
> > +}
> > +
> > +/**
> >   * Verify the @p attributes will be correctly understood by the NIC and
> store
> >   * them in the @p flow if everything is correct.
> >   *
> > @@ -214,6 +277,13 @@
> >  				return ret;
> >  			item_flags |= MLX5_FLOW_LAYER_MPLS;
> >  			break;
> > +		case RTE_FLOW_ITEM_TYPE_META:
> > +			ret = flow_dv_validate_item_meta(dev, items, attr,
> > +							 error);
> > +			if (ret < 0)
> > +				return ret;
> > +			item_flags |= MLX5_FLOW_ITEM_METADATA;
> > +			break;
> >  		default:
> >  			return rte_flow_error_set(error, ENOTSUP,
> >
> RTE_FLOW_ERROR_TYPE_ITEM,
> > @@ -855,6 +925,42 @@
> >  }
> >
> >  /**
> > + * Add META item to matcher
> > + *
> > + * @param[in, out] matcher
> > + *   Flow matcher.
> > + * @param[in, out] key
> > + *   Flow matcher value.
> > + * @param[in] item
> > + *   Flow pattern to translate.
> > + * @param[in] inner
> > + *   Item is inner pattern.
> > + */
> > +static void
> > +flow_dv_translate_item_meta(void *matcher, void *key,
> > +				const struct rte_flow_item *item) {
> > +	const struct rte_flow_item_meta *meta_m;
> > +	const struct rte_flow_item_meta *meta_v;
> > +
> > +	void *misc2_m =
> > +		MLX5_ADDR_OF(fte_match_param, matcher,
> misc_parameters_2);
> > +	void *misc2_v =
> > +		MLX5_ADDR_OF(fte_match_param, key,
> misc_parameters_2);
> > +
> > +	meta_m = (const void *)item->mask;
> > +	if (!meta_m)
> > +		meta_m = &rte_flow_item_meta_mask;
> > +	meta_v = (const void *)item->spec;
> > +	if (meta_v) {
> > +		MLX5_SET(fte_match_set_misc2, misc2_m,
> metadata_reg_a,
> > +			RTE_BE32(meta_m->data));
> 
> Nope. RTE_BE32() is for builtin constant, not for a variable.
> You should use rte_cpu_to_be_32() instead.

Replaced.

> 
> > +		MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_a,
> > +			RTE_BE32(meta_v->data));
> 
> Same here.

Replaced.

> 
> > +	}
> > +}
> > +
> > +/**
> >   * Update the matcher and the value based the selected item.
> >   *
> >   * @param[in, out] matcher
> > @@ -940,6 +1046,9 @@
> >  		flow_dv_translate_item_vxlan(tmatcher->mask.buf, key,
> item,
> >  					     inner);
> >  		break;
> > +	case RTE_FLOW_ITEM_TYPE_META:
> > +		flow_dv_translate_item_meta(tmatcher->mask.buf, key,
> item);
> > +		break;
> >  	default:
> >  		break;
> >  	}
> > diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
> > index 69296a0..29742b1 100644
> > --- a/drivers/net/mlx5/mlx5_prm.h
> > +++ b/drivers/net/mlx5/mlx5_prm.h
> > @@ -159,7 +159,7 @@ struct mlx5_wqe_eth_seg_small {
> >  	uint8_t	cs_flags;
> >  	uint8_t	rsvd1;
> >  	uint16_t mss;
> > -	uint32_t rsvd2;
> > +	uint32_t flow_table_metadata;
> >  	uint16_t inline_hdr_sz;
> >  	uint8_t inline_hdr[2];
> >  } __rte_aligned(MLX5_WQE_DWORD_SIZE);
> > diff --git a/drivers/net/mlx5/mlx5_rxtx.c
> > b/drivers/net/mlx5/mlx5_rxtx.c index 558e6b6..5b4d2fd 100644
> > --- a/drivers/net/mlx5/mlx5_rxtx.c
> > +++ b/drivers/net/mlx5/mlx5_rxtx.c
> > @@ -523,6 +523,7 @@
> >  		uint8_t tso = txq->tso_en && (buf->ol_flags &
> PKT_TX_TCP_SEG);
> >  		uint32_t swp_offsets = 0;
> >  		uint8_t swp_types = 0;
> > +		uint32_t metadata;
> >  		uint16_t tso_segsz = 0;
> >  #ifdef MLX5_PMD_SOFT_COUNTERS
> >  		uint32_t total_length = 0;
> > @@ -566,6 +567,10 @@
> >  		cs_flags = txq_ol_cksum_to_cs(buf);
> >  		txq_mbuf_to_swp(txq, buf, (uint8_t *)&swp_offsets,
> &swp_types);
> >  		raw = ((uint8_t *)(uintptr_t)wqe) + 2 *
> MLX5_WQE_DWORD_SIZE;
> > +		/* Copy metadata from mbuf if valid */
> > +		metadata = buf->ol_flags & PKT_TX_METADATA ?
> > +						buf->tx_metadata : 0;
> 
> Indentation.

Changed.

> 
> > +
> 
> No blank line.

Removed.

> 
> >  		/* Replace the Ethernet type by the VLAN if necessary. */
> >  		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
> >  			uint32_t vlan = rte_cpu_to_be_32(0x81000000 |
> > @@ -781,7 +786,7 @@
> >  				swp_offsets,
> >  				cs_flags | (swp_types << 8) |
> >  				(rte_cpu_to_be_16(tso_segsz) << 16),
> > -				0,
> > +				rte_cpu_to_be_32(metadata),
> >  				(ehdr << 16) |
> rte_cpu_to_be_16(tso_header_sz),
> >  			};
> >  		} else {
> > @@ -795,7 +800,7 @@
> >  			wqe->eseg = (rte_v128u32_t){
> >  				swp_offsets,
> >  				cs_flags | (swp_types << 8),
> > -				0,
> > +				rte_cpu_to_be_32(metadata),
> >  				(ehdr << 16) |
> rte_cpu_to_be_16(pkt_inline_sz),
> >  			};
> >  		}
> > @@ -861,7 +866,7 @@
> >  	mpw->wqe->eseg.inline_hdr_sz = 0;
> >  	mpw->wqe->eseg.rsvd0 = 0;
> >  	mpw->wqe->eseg.rsvd1 = 0;
> > -	mpw->wqe->eseg.rsvd2 = 0;
> > +	mpw->wqe->eseg.flow_table_metadata = 0;
> >  	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW <<
> 24) |
> >  					     (txq->wqe_ci << 8) |
> >  					     MLX5_OPCODE_TSO);
> > @@ -948,6 +953,7 @@
> >  		uint32_t length;
> >  		unsigned int segs_n = buf->nb_segs;
> >  		uint32_t cs_flags;
> > +		uint32_t metadata;
> >
> >  		/*
> >  		 * Make sure there is enough room to store this packet and
> @@
> > -964,6 +970,9 @@
> >  		max_elts -= segs_n;
> >  		--pkts_n;
> >  		cs_flags = txq_ol_cksum_to_cs(buf);
> > +		/* Copy metadata from mbuf if valid */
> > +		metadata = buf->ol_flags & PKT_TX_METADATA ?
> > +						buf->tx_metadata : 0;
> 
> Indentation.

Changed.

> And isn't a conversion to big-endian needed here? I think it is.

Metadata is written in the mbuf by the application as big-endian.

> 
> >  		/* Retrieve packet information. */
> >  		length = PKT_LEN(buf);
> >  		assert(length);
> > @@ -971,6 +980,7 @@
> >  		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
> >  		    ((mpw.len != length) ||
> >  		     (segs_n != 1) ||
> > +		     (mpw.wqe->eseg.flow_table_metadata != metadata) ||
> >  		     (mpw.wqe->eseg.cs_flags != cs_flags)))
> >  			mlx5_mpw_close(txq, &mpw);
> >  		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
> > @@ -984,6 +994,7 @@
> >  			max_wqe -= 2;
> >  			mlx5_mpw_new(txq, &mpw, length);
> >  			mpw.wqe->eseg.cs_flags = cs_flags;
> > +			mpw.wqe->eseg.flow_table_metadata = metadata;
> >  		}
> >  		/* Multi-segment packets must be alone in their MPW. */
> >  		assert((segs_n == 1) || (mpw.pkts_n == 0));
> > @@ -1082,7 +1093,7 @@
> >  	mpw->wqe->eseg.cs_flags = 0;
> >  	mpw->wqe->eseg.rsvd0 = 0;
> >  	mpw->wqe->eseg.rsvd1 = 0;
> > -	mpw->wqe->eseg.rsvd2 = 0;
> > +	mpw->wqe->eseg.flow_table_metadata = 0;
> >  	inl = (struct mlx5_wqe_inl_small *)
> >  		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
> >  	mpw->data.raw = (uint8_t *)&inl->raw;
> > @@ -1172,6 +1183,7 @@
> >  		uint32_t length;
> >  		unsigned int segs_n = buf->nb_segs;
> >  		uint8_t cs_flags;
> > +		uint32_t metadata;
> >
> >  		/*
> >  		 * Make sure there is enough room to store this packet and
> @@
> > -1193,18 +1205,23 @@
> >  		 */
> >  		max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq-
> >wqe_pi);
> >  		cs_flags = txq_ol_cksum_to_cs(buf);
> > +		/* Copy metadata from mbuf if valid */
> > +		metadata = buf->ol_flags & PKT_TX_METADATA ?
> > +						buf->tx_metadata : 0;
> 
> Indentation.

Changed.

> And no need to change to big-endian?

Metadata is written in the mbuf by the application as big-endian.

> 
> >  		/* Retrieve packet information. */
> >  		length = PKT_LEN(buf);
> >  		/* Start new session if packet differs. */
> >  		if (mpw.state == MLX5_MPW_STATE_OPENED) {
> >  			if ((mpw.len != length) ||
> >  			    (segs_n != 1) ||
> > +			    (mpw.wqe->eseg.flow_table_metadata !=
> metadata) ||
> >  			    (mpw.wqe->eseg.cs_flags != cs_flags))
> >  				mlx5_mpw_close(txq, &mpw);
> >  		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
> >  			if ((mpw.len != length) ||
> >  			    (segs_n != 1) ||
> >  			    (length > inline_room) ||
> > +			    (mpw.wqe->eseg.flow_table_metadata !=
> metadata) ||
> >  			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
> >  				mlx5_mpw_inline_close(txq, &mpw);
> >  				inline_room =
> > @@ -1224,12 +1241,14 @@
> >  				max_wqe -= 2;
> >  				mlx5_mpw_new(txq, &mpw, length);
> >  				mpw.wqe->eseg.cs_flags = cs_flags;
> > +				mpw.wqe->eseg.flow_table_metadata =
> metadata;
> >  			} else {
> >  				if (unlikely(max_wqe < wqe_inl_n))
> >  					break;
> >  				max_wqe -= wqe_inl_n;
> >  				mlx5_mpw_inline_new(txq, &mpw, length);
> >  				mpw.wqe->eseg.cs_flags = cs_flags;
> > +				mpw.wqe->eseg.flow_table_metadata =
> metadata;
> >  			}
> >  		}
> >  		/* Multi-segment packets must be alone in their MPW. */
> > @@ -1461,6 +1480,7 @@
> >  		unsigned int do_inline = 0; /* Whether inline is possible. */
> >  		uint32_t length;
> >  		uint8_t cs_flags;
> > +		uint32_t metadata;
> >
> >  		/* Multi-segmented packet is handled in slow-path outside.
> */
> >  		assert(NB_SEGS(buf) == 1);
> > @@ -1468,6 +1488,9 @@
> >  		if (max_elts - j == 0)
> >  			break;
> >  		cs_flags = txq_ol_cksum_to_cs(buf);
> > +		/* Copy metadata from mbuf if valid */
> > +		metadata = buf->ol_flags & PKT_TX_METADATA ?
> > +						buf->tx_metadata : 0;
> 
> Indentation.

Changed.

> And no need to change to big-endian?

Metadata is written in the mbuf by the application as big-endian.

> 
> >  		/* Retrieve packet information. */
> >  		length = PKT_LEN(buf);
> >  		/* Start new session if:
> > @@ -1482,6 +1505,7 @@
> >  			    (length <= txq->inline_max_packet_sz &&
> >  			     inl_pad + sizeof(inl_hdr) + length >
> >  			     mpw_room) ||
> > +			     (mpw.wqe->eseg.flow_table_metadata !=
> metadata) ||
> >  			    (mpw.wqe->eseg.cs_flags != cs_flags))
> >  				max_wqe -= mlx5_empw_close(txq, &mpw);
> >  		}
> > @@ -1505,6 +1529,7 @@
> >  				    sizeof(inl_hdr) + length <= mpw_room &&
> >  				    !txq->mpw_hdr_dseg;
> >  			mpw.wqe->eseg.cs_flags = cs_flags;
> > +			mpw.wqe->eseg.flow_table_metadata = metadata;
> >  		} else {
> >  			/* Evaluate whether the next packet can be inlined.
> >  			 * Inlininig is possible when:
> > diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c
> > b/drivers/net/mlx5/mlx5_rxtx_vec.c
> > index 0a4aed8..16a8608 100644
> > --- a/drivers/net/mlx5/mlx5_rxtx_vec.c
> > +++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
> > @@ -41,6 +41,8 @@
> >
> >  /**
> >   * Count the number of packets having same ol_flags and calculate
> cs_flags.
> > + * If PKT_TX_METADATA is set in ol_flags, packets must have same
> > + metadata
> > + * as well.
> 
> Packets can have different metadata but we just want to count the number
> of packets having same data. Please correct the comment.

Corrected.

> 
> >   *
> >   * @param pkts
> >   *   Pointer to array of packets.
> > @@ -48,26 +50,41 @@
> >   *   Number of packets.
> >   * @param cs_flags
> >   *   Pointer of flags to be returned.
> > + * @param metadata
> > + *   Pointer of metadata to be returned.
> > + * @param txq_offloads
> > + *   Offloads enabled on Tx queue
> >   *
> >   * @return
> > - *   Number of packets having same ol_flags.
> > + *   Number of packets having same ol_flags and metadata, if relevant.
> >   */
> >  static inline unsigned int
> > -txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t
> > *cs_flags)
> > +txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t
> *cs_flags,
> > +		 uint32_t *metadata, const uint64_t txq_offloads)
> >  {
> >  	unsigned int pos;
> >  	const uint64_t ol_mask =
> >  		PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM |
> >  		PKT_TX_UDP_CKSUM | PKT_TX_TUNNEL_GRE |
> > -		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM;
> > +		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM |
> PKT_TX_METADATA;
> 
> Shouldn't add PKT_TX_METADATA. As it is for cksum, you might rather want
> to change the name, e.g., cksum_ol_mask.
> 
> >
> >  	if (!pkts_n)
> >  		return 0;
> >  	/* Count the number of packets having same ol_flags. */
> 
> This comment has to be corrected and moved.
> 
> > -	for (pos = 1; pos < pkts_n; ++pos)
> > -		if ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask)
> > +	for (pos = 1; pos < pkts_n; ++pos) {
> > +		if ((txq_offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP)
> &&
> > +			((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask))
> 
> Indentation.
> 
> >  			break;
> > +		/* If the metadata ol_flag is set,
> > +		 *  metadata must be same in all packets.
> > +		 */
> 
> Correct comment. First line should be empty for multi-line comment.
> And it can't be 'must'. We are not forcing it but just counting the number of
> packets having same metadata like I mentioned above.
> 
> > +		if ((txq_offloads & DEV_TX_OFFLOAD_MATCH_METADATA)
> &&
> > +			(pkts[pos]->ol_flags & PKT_TX_METADATA) &&
> > +			pkts[0]->tx_metadata != pkts[pos]->tx_metadata)
> 
> > Disagree. What if pkts[0] doesn't have PKT_TX_METADATA while pkts[1] has
> > it?
> And, indentation.
> 
> > +			break;
> > +	}
> >  	*cs_flags = txq_ol_cksum_to_cs(pkts[0]);
> > +	*metadata = rte_cpu_to_be_32(pkts[0]->tx_metadata);
> 
> Same here. You should check if pkts[0] has metadata first.
> 
> >  	return pos;
> 
> Here's my suggestion for the whole func.
> 
> 	unsigned int pos;
> 	const uint64_t cksum_ol_mask =
> 		PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM |
> 		PKT_TX_UDP_CKSUM | PKT_TX_TUNNEL_GRE |
> 		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM;
> 	const uint32_t p0_metadata;
> 
> 	if (!pkts_n)
> 		return 0;
> 	p0_metadata = pkts[0]->ol_flags & PKT_TX_METADATA ?
> 		      pkts[0]->tx_metadata : 0;
> 	/* Count the number of packets having same offload parameters. */
> 	for (pos = 1; pos < pkts_n; ++pos) {
> 		/* Check if packet can have same checksum flags. */
> 		if ((txq_offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP)
> &&
> 		    ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) &
> cksum_ol_mask))
> 			break;
> 		/* Check if packet has same metadata. */
> 		if (txq_offloads & DEV_TX_OFFLOAD_MATCH_METADATA) {
> 			const uint32_t p1_metadata =
> 				pkts[pos]->ol_flags & PKT_TX_METADATA ?
> 				pkts[pos]->tx_metadata : 0;
> 
> 			if (p1_metadata != p0_metadata)
> 				break;
> 		}
> 	}
> 	*cs_flags = txq_ol_cksum_to_cs(pkts[0]);
> 	*metadata = rte_cpu_to_be_32(p0_metadata);
> 	return pos;

Modified per your suggestion.

> >  }
> >
> > @@ -96,7 +113,7 @@
> >  		uint16_t ret;
> >
> >  		n = RTE_MIN((uint16_t)(pkts_n - nb_tx),
> MLX5_VPMD_TX_MAX_BURST);
> > -		ret = txq_burst_v(txq, &pkts[nb_tx], n, 0);
> > +		ret = txq_burst_v(txq, &pkts[nb_tx], n, 0, 0);
> >  		nb_tx += ret;
> >  		if (!ret)
> >  			break;
> > @@ -127,6 +144,7 @@
> >  		uint8_t cs_flags = 0;
> >  		uint16_t n;
> >  		uint16_t ret;
> > +		uint32_t metadata = 0;
> 
> Let's use rte_be32_t instead.

Agree.

> 
> >
> >  		/* Transmit multi-seg packets in the head of pkts list. */
> >  		if ((txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS) &&
> @@ -137,9
> > +155,11 @@
> >  		n = RTE_MIN((uint16_t)(pkts_n - nb_tx),
> MLX5_VPMD_TX_MAX_BURST);
> >  		if (txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
> >  			n = txq_count_contig_single_seg(&pkts[nb_tx], n);
> > -		if (txq->offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP)
> > -			n = txq_calc_offload(&pkts[nb_tx], n, &cs_flags);
> > -		ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags);
> > +		if (txq->offloads & (MLX5_VEC_TX_CKSUM_OFFLOAD_CAP |
> > +				DEV_TX_OFFLOAD_MATCH_METADATA))
> 
> Indentation.

Changed.

> 
> > +			n = txq_calc_offload(&pkts[nb_tx], n,
> > +					&cs_flags, &metadata, txq-
> >offloads);
> 
> Indentation.

Changed.

> 
> > +		ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags, metadata);
> >  		nb_tx += ret;
> >  		if (!ret)
> >  			break;
> > diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h
> > b/drivers/net/mlx5/mlx5_rxtx_vec.h
> > index fb884f9..fda7004 100644
> > --- a/drivers/net/mlx5/mlx5_rxtx_vec.h
> > +++ b/drivers/net/mlx5/mlx5_rxtx_vec.h
> > @@ -22,6 +22,7 @@
> >  /* HW offload capabilities of vectorized Tx. */
> >  #define MLX5_VEC_TX_OFFLOAD_CAP \
> >  	(MLX5_VEC_TX_CKSUM_OFFLOAD_CAP | \
> > +	 DEV_TX_OFFLOAD_MATCH_METADATA | \
> >  	 DEV_TX_OFFLOAD_MULTI_SEGS)
> >
> >  /*
> > diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> > b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> > index b37b738..a8a4d7b 100644
> > --- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> > +++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> > @@ -201,13 +201,15 @@
> >   *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
> >   * @param cs_flags
> >   *   Checksum offload flags to be written in the descriptor.
> > + * @param metadata
> > + *   Metadata value to be written in the descriptor.
> >   *
> >   * @return
> >   *   Number of packets successfully transmitted (<= pkts_n).
> >   */
> >  static inline uint16_t
> >  txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t
> pkts_n,
> > -	    uint8_t cs_flags)
> > +	    uint8_t cs_flags, uint32_t metadata)
> 
> Let's use rte_be32_t instead.

Agree.

> 
> >  {
> >  	struct rte_mbuf **elts;
> >  	uint16_t elts_head = txq->elts_head;
> > @@ -294,10 +296,7 @@
> >  	vst1q_u8((void *)t_wqe, ctrl);
> >  	/* Fill ESEG in the header. */
> >  	vst1q_u8((void *)(t_wqe + 1),
> > -		 ((uint8x16_t) { 0, 0, 0, 0,
> > -				 cs_flags, 0, 0, 0,
> > -				 0, 0, 0, 0,
> > -				 0, 0, 0, 0 }));
> > +		 ((uint32x4_t) { 0, cs_flags, metadata, 0 }));
> >  #ifdef MLX5_PMD_SOFT_COUNTERS
> >  	txq->stats.opackets += pkts_n;
> >  #endif
> > diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> > b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> > index 54b3783..31aae4a 100644
> > --- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> > +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> > @@ -202,13 +202,15 @@
> >   *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
> >   * @param cs_flags
> >   *   Checksum offload flags to be written in the descriptor.
> > + * @param metadata
> > + *   Metadata value to be written in the descriptor.
> >   *
> >   * @return
> >   *   Number of packets successfully transmitted (<= pkts_n).
> >   */
> >  static inline uint16_t
> >  txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t
> pkts_n,
> > -	    uint8_t cs_flags)
> > +	    uint8_t cs_flags, uint32_t metadata)
> 
> Let's use rte_be32_t instead.

Agree.

> 
> >  {
> >  	struct rte_mbuf **elts;
> >  	uint16_t elts_head = txq->elts_head; @@ -292,11 +294,7 @@
> >  	ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
> >  	_mm_store_si128(t_wqe, ctrl);
> >  	/* Fill ESEG in the header. */
> > -	_mm_store_si128(t_wqe + 1,
> > -			_mm_set_epi8(0, 0, 0, 0,
> > -				     0, 0, 0, 0,
> > -				     0, 0, 0, cs_flags,
> > -				     0, 0, 0, 0));
> > +	_mm_store_si128(t_wqe + 1, _mm_set_epi32(0, metadata, cs_flags,
> 0));
> >  #ifdef MLX5_PMD_SOFT_COUNTERS
> >  	txq->stats.opackets += pkts_n;
> >  #endif
> > diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
> > index f9bc473..7263fb1 100644
> > --- a/drivers/net/mlx5/mlx5_txq.c
> > +++ b/drivers/net/mlx5/mlx5_txq.c
> > @@ -128,6 +128,12 @@
> >  			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
> >  				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
> >  	}
> > +
> 
> Please no blank line.

Removed.

> 
> > +#ifdef HAVE_IBV_FLOW_DV_SUPPORT
> > +	if (config->dv_flow_en)
> > +		offloads |= DEV_TX_OFFLOAD_MATCH_METADATA;
> > +#endif
> > +
> 
> Same here.

Removed.

> 
> >  	return offloads;
> >  }
> >
> > --
> > 1.8.3.1
> >
  

Patch

diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index bd70fce..15262f6 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -418,7 +418,7 @@  uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
  * @return
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
-static int
+int
 mlx5_flow_item_acceptable(const struct rte_flow_item *item,
 			  const uint8_t *mask,
 			  const uint8_t *nic_mask,
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index 094f666..834a6ed 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -43,6 +43,9 @@ 
 #define MLX5_FLOW_LAYER_GRE (1u << 14)
 #define MLX5_FLOW_LAYER_MPLS (1u << 15)
 
+/* General pattern items bits. */
+#define MLX5_FLOW_ITEM_METADATA (1u << 16)
+
 /* Outer Masks. */
 #define MLX5_FLOW_LAYER_OUTER_L3 \
 	(MLX5_FLOW_LAYER_OUTER_L3_IPV4 | MLX5_FLOW_LAYER_OUTER_L3_IPV6)
@@ -307,6 +310,11 @@  int mlx5_flow_validate_action_rss(const struct rte_flow_action *action,
 int mlx5_flow_validate_attributes(struct rte_eth_dev *dev,
 				  const struct rte_flow_attr *attributes,
 				  struct rte_flow_error *error);
+int mlx5_flow_item_acceptable(const struct rte_flow_item *item,
+			      const uint8_t *mask,
+			      const uint8_t *nic_mask,
+			      unsigned int size,
+			      struct rte_flow_error *error);
 int mlx5_flow_validate_item_eth(const struct rte_flow_item *item,
 				uint64_t item_flags,
 				struct rte_flow_error *error);
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index a013201..bfddfab 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -36,6 +36,69 @@ 
 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
 
 /**
+ * Validate META item.
+ *
+ * @param[in] dev
+ *   Pointer to the rte_eth_dev structure.
+ * @param[in] item
+ *   Item specification.
+ * @param[in] attr
+ *   Attributes of flow that includes this item.
+ * @param[out] error
+ *   Pointer to error structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+flow_dv_validate_item_meta(struct rte_eth_dev *dev,
+			   const struct rte_flow_item *item,
+			   const struct rte_flow_attr *attr,
+			   struct rte_flow_error *error)
+{
+	const struct rte_flow_item_meta *spec = item->spec;
+	const struct rte_flow_item_meta *mask = item->mask;
+
+	const struct rte_flow_item_meta nic_mask = {
+		.data = RTE_BE32(UINT32_MAX)
+	};
+
+	int ret;
+	uint64_t offloads = dev->data->dev_conf.txmode.offloads;
+
+	if (!(offloads & DEV_TX_OFFLOAD_MATCH_METADATA))
+		return rte_flow_error_set(error, EPERM,
+					  RTE_FLOW_ERROR_TYPE_ITEM,
+					  NULL,
+					  "match on metadata offload "
+					  "configuration is off for this port");
+	if (!spec)
+		return rte_flow_error_set(error, EINVAL,
+					  RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
+					  item->spec,
+					  "data cannot be empty");
+	if (!spec->data)
+		return rte_flow_error_set(error, EINVAL,
+					  RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
+					  NULL,
+					  "data cannot be zero");
+	if (!mask)
+		mask = &rte_flow_item_meta_mask;
+	ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask,
+					(const uint8_t *)&nic_mask,
+					sizeof(struct rte_flow_item_meta),
+					error);
+	if (ret < 0)
+		return ret;
+	if (attr->ingress)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
+					  NULL,
+					  "pattern not supported for ingress");
+	return 0;
+}
+
+/**
  * Verify the @p attributes will be correctly understood by the NIC and store
  * them in the @p flow if everything is correct.
  *
@@ -214,6 +277,13 @@ 
 				return ret;
 			item_flags |= MLX5_FLOW_LAYER_MPLS;
 			break;
+		case RTE_FLOW_ITEM_TYPE_META:
+			ret = flow_dv_validate_item_meta(dev, items, attr,
+							 error);
+			if (ret < 0)
+				return ret;
+			item_flags |= MLX5_FLOW_ITEM_METADATA;
+			break;
 		default:
 			return rte_flow_error_set(error, ENOTSUP,
 						  RTE_FLOW_ERROR_TYPE_ITEM,
@@ -855,6 +925,42 @@ 
 }
 
 /**
+ * Add META item to matcher
+ *
+ * @param[in, out] matcher
+ *   Flow matcher.
+ * @param[in, out] key
+ *   Flow matcher value.
+ * @param[in] item
+ *   Flow pattern to translate.
+ * @param[in] inner
+ *   Item is inner pattern.
+ */
+static void
+flow_dv_translate_item_meta(void *matcher, void *key,
+				const struct rte_flow_item *item)
+{
+	const struct rte_flow_item_meta *meta_m;
+	const struct rte_flow_item_meta *meta_v;
+
+	void *misc2_m =
+		MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters_2);
+	void *misc2_v =
+		MLX5_ADDR_OF(fte_match_param, key, misc_parameters_2);
+
+	meta_m = (const void *)item->mask;
+	if (!meta_m)
+		meta_m = &rte_flow_item_meta_mask;
+	meta_v = (const void *)item->spec;
+	if (meta_v) {
+		MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_a,
+			RTE_BE32(meta_m->data));
+		MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_a,
+			RTE_BE32(meta_v->data));
+	}
+}
+
+/**
  * Update the matcher and the value based the selected item.
  *
  * @param[in, out] matcher
@@ -940,6 +1046,9 @@ 
 		flow_dv_translate_item_vxlan(tmatcher->mask.buf, key, item,
 					     inner);
 		break;
+	case RTE_FLOW_ITEM_TYPE_META:
+		flow_dv_translate_item_meta(tmatcher->mask.buf, key, item);
+		break;
 	default:
 		break;
 	}
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 69296a0..29742b1 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -159,7 +159,7 @@  struct mlx5_wqe_eth_seg_small {
 	uint8_t	cs_flags;
 	uint8_t	rsvd1;
 	uint16_t mss;
-	uint32_t rsvd2;
+	uint32_t flow_table_metadata;
 	uint16_t inline_hdr_sz;
 	uint8_t inline_hdr[2];
 } __rte_aligned(MLX5_WQE_DWORD_SIZE);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 558e6b6..5b4d2fd 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -523,6 +523,7 @@ 
 		uint8_t tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG);
 		uint32_t swp_offsets = 0;
 		uint8_t swp_types = 0;
+		uint32_t metadata;
 		uint16_t tso_segsz = 0;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		uint32_t total_length = 0;
@@ -566,6 +567,10 @@ 
 		cs_flags = txq_ol_cksum_to_cs(buf);
 		txq_mbuf_to_swp(txq, buf, (uint8_t *)&swp_offsets, &swp_types);
 		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
+		/* Copy metadata from mbuf if valid */
+		metadata = buf->ol_flags & PKT_TX_METADATA ?
+						buf->tx_metadata : 0;
+
 		/* Replace the Ethernet type by the VLAN if necessary. */
 		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
 			uint32_t vlan = rte_cpu_to_be_32(0x81000000 |
@@ -781,7 +786,7 @@ 
 				swp_offsets,
 				cs_flags | (swp_types << 8) |
 				(rte_cpu_to_be_16(tso_segsz) << 16),
-				0,
+				rte_cpu_to_be_32(metadata),
 				(ehdr << 16) | rte_cpu_to_be_16(tso_header_sz),
 			};
 		} else {
@@ -795,7 +800,7 @@ 
 			wqe->eseg = (rte_v128u32_t){
 				swp_offsets,
 				cs_flags | (swp_types << 8),
-				0,
+				rte_cpu_to_be_32(metadata),
 				(ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz),
 			};
 		}
@@ -861,7 +866,7 @@ 
 	mpw->wqe->eseg.inline_hdr_sz = 0;
 	mpw->wqe->eseg.rsvd0 = 0;
 	mpw->wqe->eseg.rsvd1 = 0;
-	mpw->wqe->eseg.rsvd2 = 0;
+	mpw->wqe->eseg.flow_table_metadata = 0;
 	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
 					     (txq->wqe_ci << 8) |
 					     MLX5_OPCODE_TSO);
@@ -948,6 +953,7 @@ 
 		uint32_t length;
 		unsigned int segs_n = buf->nb_segs;
 		uint32_t cs_flags;
+		uint32_t metadata;
 
 		/*
 		 * Make sure there is enough room to store this packet and
@@ -964,6 +970,9 @@ 
 		max_elts -= segs_n;
 		--pkts_n;
 		cs_flags = txq_ol_cksum_to_cs(buf);
+		/* Copy metadata from mbuf if valid */
+		metadata = buf->ol_flags & PKT_TX_METADATA ?
+						buf->tx_metadata : 0;
 		/* Retrieve packet information. */
 		length = PKT_LEN(buf);
 		assert(length);
@@ -971,6 +980,7 @@ 
 		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
 		    ((mpw.len != length) ||
 		     (segs_n != 1) ||
+		     (mpw.wqe->eseg.flow_table_metadata != metadata) ||
 		     (mpw.wqe->eseg.cs_flags != cs_flags)))
 			mlx5_mpw_close(txq, &mpw);
 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
@@ -984,6 +994,7 @@ 
 			max_wqe -= 2;
 			mlx5_mpw_new(txq, &mpw, length);
 			mpw.wqe->eseg.cs_flags = cs_flags;
+			mpw.wqe->eseg.flow_table_metadata = metadata;
 		}
 		/* Multi-segment packets must be alone in their MPW. */
 		assert((segs_n == 1) || (mpw.pkts_n == 0));
@@ -1082,7 +1093,7 @@ 
 	mpw->wqe->eseg.cs_flags = 0;
 	mpw->wqe->eseg.rsvd0 = 0;
 	mpw->wqe->eseg.rsvd1 = 0;
-	mpw->wqe->eseg.rsvd2 = 0;
+	mpw->wqe->eseg.flow_table_metadata = 0;
 	inl = (struct mlx5_wqe_inl_small *)
 		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
 	mpw->data.raw = (uint8_t *)&inl->raw;
@@ -1172,6 +1183,7 @@ 
 		uint32_t length;
 		unsigned int segs_n = buf->nb_segs;
 		uint8_t cs_flags;
+		uint32_t metadata;
 
 		/*
 		 * Make sure there is enough room to store this packet and
@@ -1193,18 +1205,23 @@ 
 		 */
 		max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
 		cs_flags = txq_ol_cksum_to_cs(buf);
+		/* Copy metadata from mbuf if valid */
+		metadata = buf->ol_flags & PKT_TX_METADATA ?
+						buf->tx_metadata : 0;
 		/* Retrieve packet information. */
 		length = PKT_LEN(buf);
 		/* Start new session if packet differs. */
 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
 			if ((mpw.len != length) ||
 			    (segs_n != 1) ||
+			    (mpw.wqe->eseg.flow_table_metadata != metadata) ||
 			    (mpw.wqe->eseg.cs_flags != cs_flags))
 				mlx5_mpw_close(txq, &mpw);
 		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
 			if ((mpw.len != length) ||
 			    (segs_n != 1) ||
 			    (length > inline_room) ||
+			    (mpw.wqe->eseg.flow_table_metadata != metadata) ||
 			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
 				mlx5_mpw_inline_close(txq, &mpw);
 				inline_room =
@@ -1224,12 +1241,14 @@ 
 				max_wqe -= 2;
 				mlx5_mpw_new(txq, &mpw, length);
 				mpw.wqe->eseg.cs_flags = cs_flags;
+				mpw.wqe->eseg.flow_table_metadata = metadata;
 			} else {
 				if (unlikely(max_wqe < wqe_inl_n))
 					break;
 				max_wqe -= wqe_inl_n;
 				mlx5_mpw_inline_new(txq, &mpw, length);
 				mpw.wqe->eseg.cs_flags = cs_flags;
+				mpw.wqe->eseg.flow_table_metadata = metadata;
 			}
 		}
 		/* Multi-segment packets must be alone in their MPW. */
@@ -1461,6 +1480,7 @@ 
 		unsigned int do_inline = 0; /* Whether inline is possible. */
 		uint32_t length;
 		uint8_t cs_flags;
+		uint32_t metadata;
 
 		/* Multi-segmented packet is handled in slow-path outside. */
 		assert(NB_SEGS(buf) == 1);
@@ -1468,6 +1488,9 @@ 
 		if (max_elts - j == 0)
 			break;
 		cs_flags = txq_ol_cksum_to_cs(buf);
+		/* Copy metadata from mbuf if valid */
+		metadata = buf->ol_flags & PKT_TX_METADATA ?
+						buf->tx_metadata : 0;
 		/* Retrieve packet information. */
 		length = PKT_LEN(buf);
 		/* Start new session if:
@@ -1482,6 +1505,7 @@ 
 			    (length <= txq->inline_max_packet_sz &&
 			     inl_pad + sizeof(inl_hdr) + length >
 			     mpw_room) ||
+			     (mpw.wqe->eseg.flow_table_metadata != metadata) ||
 			    (mpw.wqe->eseg.cs_flags != cs_flags))
 				max_wqe -= mlx5_empw_close(txq, &mpw);
 		}
@@ -1505,6 +1529,7 @@ 
 				    sizeof(inl_hdr) + length <= mpw_room &&
 				    !txq->mpw_hdr_dseg;
 			mpw.wqe->eseg.cs_flags = cs_flags;
+			mpw.wqe->eseg.flow_table_metadata = metadata;
 		} else {
 			/* Evaluate whether the next packet can be inlined.
 			 * Inlininig is possible when:
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c
index 0a4aed8..16a8608 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.c
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
@@ -41,6 +41,8 @@ 
 
 /**
  * Count the number of packets having same ol_flags and calculate cs_flags.
+ * If PKT_TX_METADATA is set in ol_flags, packets must have same metadata
+ * as well.
  *
  * @param pkts
  *   Pointer to array of packets.
@@ -48,26 +50,41 @@ 
  *   Number of packets.
  * @param cs_flags
  *   Pointer of flags to be returned.
+ * @param metadata
+ *   Pointer of metadata to be returned.
+ * @param txq_offloads
+ *   Offloads enabled on Tx queue
  *
  * @return
- *   Number of packets having same ol_flags.
+ *   Number of packets having same ol_flags and metadata, if relevant.
  */
 static inline unsigned int
-txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t *cs_flags)
+txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t *cs_flags,
+		 uint32_t *metadata, const uint64_t txq_offloads)
 {
 	unsigned int pos;
 	const uint64_t ol_mask =
 		PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM |
 		PKT_TX_UDP_CKSUM | PKT_TX_TUNNEL_GRE |
-		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM;
+		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM | PKT_TX_METADATA;
 
 	if (!pkts_n)
 		return 0;
 	/* Count the number of packets having same ol_flags. */
-	for (pos = 1; pos < pkts_n; ++pos)
-		if ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask)
+	for (pos = 1; pos < pkts_n; ++pos) {
+		if ((txq_offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP) &&
+			((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask))
 			break;
+		/* If the metadata ol_flag is set,
+		 *  metadata must be same in all packets.
+		 */
+		if ((txq_offloads & DEV_TX_OFFLOAD_MATCH_METADATA) &&
+			(pkts[pos]->ol_flags & PKT_TX_METADATA) &&
+			pkts[0]->tx_metadata != pkts[pos]->tx_metadata)
+			break;
+	}
 	*cs_flags = txq_ol_cksum_to_cs(pkts[0]);
+	*metadata = rte_cpu_to_be_32(pkts[0]->tx_metadata);
 	return pos;
 }
 
@@ -96,7 +113,7 @@ 
 		uint16_t ret;
 
 		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
-		ret = txq_burst_v(txq, &pkts[nb_tx], n, 0);
+		ret = txq_burst_v(txq, &pkts[nb_tx], n, 0, 0);
 		nb_tx += ret;
 		if (!ret)
 			break;
@@ -127,6 +144,7 @@ 
 		uint8_t cs_flags = 0;
 		uint16_t n;
 		uint16_t ret;
+		uint32_t metadata = 0;
 
 		/* Transmit multi-seg packets in the head of pkts list. */
 		if ((txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS) &&
@@ -137,9 +155,11 @@ 
 		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
 		if (txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
 			n = txq_count_contig_single_seg(&pkts[nb_tx], n);
-		if (txq->offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP)
-			n = txq_calc_offload(&pkts[nb_tx], n, &cs_flags);
-		ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags);
+		if (txq->offloads & (MLX5_VEC_TX_CKSUM_OFFLOAD_CAP |
+				DEV_TX_OFFLOAD_MATCH_METADATA))
+			n = txq_calc_offload(&pkts[nb_tx], n,
+					&cs_flags, &metadata, txq->offloads);
+		ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags, metadata);
 		nb_tx += ret;
 		if (!ret)
 			break;
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h
index fb884f9..fda7004 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.h
@@ -22,6 +22,7 @@ 
 /* HW offload capabilities of vectorized Tx. */
 #define MLX5_VEC_TX_OFFLOAD_CAP \
 	(MLX5_VEC_TX_CKSUM_OFFLOAD_CAP | \
+	 DEV_TX_OFFLOAD_MATCH_METADATA | \
 	 DEV_TX_OFFLOAD_MULTI_SEGS)
 
 /*
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index b37b738..a8a4d7b 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -201,13 +201,15 @@ 
  *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
  * @param cs_flags
  *   Checksum offload flags to be written in the descriptor.
+ * @param metadata
+ *   Metadata value (big-endian) to be written in the descriptor.
  *
  * @return
  *   Number of packets successfully transmitted (<= pkts_n).
  */
 static inline uint16_t
 txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
-	    uint8_t cs_flags)
+	    uint8_t cs_flags, uint32_t metadata)
 {
 	struct rte_mbuf **elts;
 	uint16_t elts_head = txq->elts_head;
@@ -294,10 +296,7 @@ 
 	vst1q_u8((void *)t_wqe, ctrl);
 	/* Fill ESEG in the header. */
 	vst1q_u8((void *)(t_wqe + 1),
-		 ((uint8x16_t) { 0, 0, 0, 0,
-				 cs_flags, 0, 0, 0,
-				 0, 0, 0, 0,
-				 0, 0, 0, 0 }));
+		 ((uint32x4_t) { 0, cs_flags, metadata, 0 }));
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	txq->stats.opackets += pkts_n;
 #endif
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index 54b3783..31aae4a 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -202,13 +202,15 @@ 
  *   Number of packets to be sent (<= MLX5_VPMD_TX_MAX_BURST).
  * @param cs_flags
  *   Checksum offload flags to be written in the descriptor.
+ * @param metadata
+ *   Metadata value (big-endian) to be written in the descriptor.
  *
  * @return
  *   Number of packets successfully transmitted (<= pkts_n).
  */
 static inline uint16_t
 txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
-	    uint8_t cs_flags)
+	    uint8_t cs_flags, uint32_t metadata)
 {
 	struct rte_mbuf **elts;
 	uint16_t elts_head = txq->elts_head;
@@ -292,11 +294,7 @@ 
 	ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
 	_mm_store_si128(t_wqe, ctrl);
 	/* Fill ESEG in the header. */
-	_mm_store_si128(t_wqe + 1,
-			_mm_set_epi8(0, 0, 0, 0,
-				     0, 0, 0, 0,
-				     0, 0, 0, cs_flags,
-				     0, 0, 0, 0));
+	_mm_store_si128(t_wqe + 1, _mm_set_epi32(0, metadata, cs_flags, 0));
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	txq->stats.opackets += pkts_n;
 #endif
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index f9bc473..7263fb1 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -128,6 +128,12 @@ 
 			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
 				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
 	}
+
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+	if (config->dv_flow_en)
+		offloads |= DEV_TX_OFFLOAD_MATCH_METADATA;
+#endif
+
 	return offloads;
 }