[v2] net/mlx5: support metadata as flow rule criteria

Message ID 1538057935-34663-1-git-send-email-dekelp@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Shahaf Shuler
Headers
Series [v2] net/mlx5: support metadata as flow rule criteria |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Dekel Peled Sept. 27, 2018, 2:18 p.m. UTC
  As described in series starting at [1], it adds option to set
metadata value as match pattern when creating a new flow rule.

This patch adds metadata support in mlx5 driver, in two parts:
- Add the setting of metadata value in matcher when creating
  a new flow rule.
- Add the passing of metadata value from mbuf to wqe when
  indicated by ol_flag, in different burst functions.

[1] "ethdev: support metadata as flow rule criteria"
    http://mails.dpdk.org/archives/dev/2018-September/113270.html
	
Signed-off-by: Dekel Peled <dekelp@mellanox.com>
---
V2:
Split the support of egress rules to a different patch.
---
 drivers/net/mlx5/mlx5_flow.c          |  2 +-
 drivers/net/mlx5/mlx5_flow.h          |  8 +++
 drivers/net/mlx5/mlx5_flow_dv.c       | 96 +++++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_prm.h           |  2 +-
 drivers/net/mlx5/mlx5_rxtx.c          | 35 +++++++++++--
 drivers/net/mlx5/mlx5_rxtx_vec.c      | 31 ++++++++---
 drivers/net/mlx5/mlx5_rxtx_vec.h      |  1 +
 drivers/net/mlx5/mlx5_rxtx_vec_neon.h |  4 +-
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h  |  4 +-
 drivers/net/mlx5/mlx5_txq.c           |  6 +++
 10 files changed, 174 insertions(+), 15 deletions(-)
  

Comments

Yongseok Koh Sept. 29, 2018, 9:09 a.m. UTC | #1
On Thu, Sep 27, 2018 at 05:18:55PM +0300, Dekel Peled wrote:
> As described in series starting at [1], it adds option to set
> metadata value as match pattern when creating a new flow rule.
> 
> This patch adds metadata support in mlx5 driver, in two parts:
> - Add the setting of metadata value in matcher when creating
>   a new flow rule.
> - Add the passing of metadata value from mbuf to wqe when
>   indicated by ol_flag, in different burst functions.
> 
> [1] "ethdev: support metadata as flow rule criteria"
>     http://mails.dpdk.org/archives/dev/2018-September/113270.html
> 	
> Signed-off-by: Dekel Peled <dekelp@mellanox.com>
> ---
> V2:
> Split the support of egress rules to a different patch.
> ---
>  drivers/net/mlx5/mlx5_flow.c          |  2 +-
>  drivers/net/mlx5/mlx5_flow.h          |  8 +++
>  drivers/net/mlx5/mlx5_flow_dv.c       | 96 +++++++++++++++++++++++++++++++++++
>  drivers/net/mlx5/mlx5_prm.h           |  2 +-
>  drivers/net/mlx5/mlx5_rxtx.c          | 35 +++++++++++--
>  drivers/net/mlx5/mlx5_rxtx_vec.c      | 31 ++++++++---
>  drivers/net/mlx5/mlx5_rxtx_vec.h      |  1 +
>  drivers/net/mlx5/mlx5_rxtx_vec_neon.h |  4 +-
>  drivers/net/mlx5/mlx5_rxtx_vec_sse.h  |  4 +-
>  drivers/net/mlx5/mlx5_txq.c           |  6 +++
>  10 files changed, 174 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
> index 8007bf1..9581691 100644
> --- a/drivers/net/mlx5/mlx5_flow.c
> +++ b/drivers/net/mlx5/mlx5_flow.c
> @@ -417,7 +417,7 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
>   * @return
>   *   0 on success, a negative errno value otherwise and rte_errno is set.
>   */
> -static int
> +int
>  mlx5_flow_item_acceptable(const struct rte_flow_item *item,
>  			  const uint8_t *mask,
>  			  const uint8_t *nic_mask,
> diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
> index 10d700a..d91ae17 100644
> --- a/drivers/net/mlx5/mlx5_flow.h
> +++ b/drivers/net/mlx5/mlx5_flow.h
> @@ -42,6 +42,9 @@
>  #define MLX5_FLOW_LAYER_GRE (1u << 14)
>  #define MLX5_FLOW_LAYER_MPLS (1u << 15)
>  
> +/* General pattern items bits. */
> +#define MLX5_FLOW_ITEM_METADATA (1u << 16)
> +
>  /* Outer Masks. */
>  #define MLX5_FLOW_LAYER_OUTER_L3 \
>  	(MLX5_FLOW_LAYER_OUTER_L3_IPV4 | MLX5_FLOW_LAYER_OUTER_L3_IPV6)
> @@ -299,6 +302,11 @@ int mlx5_flow_validate_action_rss(const struct rte_flow_action *action,
>  int mlx5_flow_validate_attributes(struct rte_eth_dev *dev,
>  				  const struct rte_flow_attr *attributes,
>  				  struct rte_flow_error *error);
> +int mlx5_flow_item_acceptable(const struct rte_flow_item *item,
> +			      const uint8_t *mask,
> +			      const uint8_t *nic_mask,
> +			      unsigned int size,
> +			      struct rte_flow_error *error);
>  int mlx5_flow_validate_item_eth(const struct rte_flow_item *item,
>  				uint64_t item_flags,
>  				struct rte_flow_error *error);
> diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
> index cf663cd..2439f5e 100644
> --- a/drivers/net/mlx5/mlx5_flow_dv.c
> +++ b/drivers/net/mlx5/mlx5_flow_dv.c
> @@ -36,6 +36,55 @@
>  #ifdef HAVE_IBV_FLOW_DV_SUPPORT
>  
>  /**
> + * Validate META item.
> + *
> + * @param[in] item
> + *   Item specification.
> + * @param[in] attr
> + *   Attributes of flow that includes this item.
> + * @param[out] error
> + *   Pointer to error structure.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +mlx5_flow_validate_item_meta(const struct rte_flow_item *item,

Naming rule here is starting from flow_dv_*() for static funcs. For global
funcs, it should start from mlx5_, so it should be mlx5_flow_dv_*().

Or, you can put it in the mlx5_flow.c as a common helper function although it is
used only by DV.

I prefer the latter.

> +			const struct rte_flow_attr *attr,
> +			struct rte_flow_error *error)
> +{
> +	const struct rte_flow_item_meta *spec = item->spec;
> +	const struct rte_flow_item_meta *mask = item->mask;
> +
> +	const struct rte_flow_item_meta nic_mask = {
> +		.data = RTE_BE32(UINT32_MAX)
> +	};
> +
> +	int ret;
> +
> +	if (!spec)
> +		return rte_flow_error_set(error, EINVAL,
> +					  RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
> +					  item->spec,
> +					  "data cannot be empty");
> +	if (!mask)
> +		mask = &rte_flow_item_meta_mask;
> +	ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask,
> +					(const uint8_t *)&nic_mask,
> +					sizeof(struct rte_flow_item_meta),
> +					error);
> +	if (ret < 0)
> +		return ret;
> +

No blank line is allowed.

> +	if (attr->ingress)
> +		return rte_flow_error_set(error, ENOTSUP,
> +					  RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
> +					  NULL,
> +					  "pattern not supported for ingress");

If it is only supported with egress flow, then please group this patch with the
other one - "net/mlx5: allow flow rule with attribute egress", and have it
applied after that one by specifying [1/2] and [2/2].

> +	return 0;
> +}
> +
> +/**
>   * Verify the @p attributes will be correctly understood by the NIC and store
>   * them in the @p flow if everything is correct.
>   *
> @@ -216,6 +265,12 @@
>  				return ret;
>  			item_flags |= MLX5_FLOW_LAYER_MPLS;
>  			break;
> +		case RTE_FLOW_ITEM_TYPE_META:
> +			ret = mlx5_flow_validate_item_meta(items, attr, error);
> +			if (ret < 0)
> +				return ret;
> +			item_flags |= MLX5_FLOW_ITEM_METADATA;
> +			break;
>  		default:
>  			return rte_flow_error_set(error, ENOTSUP,
>  						  RTE_FLOW_ERROR_TYPE_ITEM,
> @@ -853,6 +908,43 @@
>  }
>  
>  /**
> + * Add META item to matcher
> + *
> + * @param[in, out] matcher
> + *   Flow matcher.
> + * @param[in, out] key
> + *   Flow matcher value.
> + * @param[in] item
> + *   Flow pattern to translate.
> + * @param[in] inner
> + *   Item is inner pattern.
> + */
> +static void
> +flow_dv_translate_item_meta(void *matcher, void *key,
> +				const struct rte_flow_item *item)
> +{
> +	const struct rte_flow_item_meta *metam;
> +	const struct rte_flow_item_meta *metav;

Ori changed naming like meta_m and meta_v.

> +	void *misc2_m =
> +		MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters_2);
> +	void *misc2_v =
> +		MLX5_ADDR_OF(fte_match_param, key, misc_parameters_2);
> +
> +	metam = (const void *)item->mask;
> +	if (!metam)
> +		metam = &rte_flow_item_meta_mask;
> +

No blank line is allowed except for the one after variable declaration. Please
remove such blank lines throughout the whole patch.

> +	metav = (const void *)item->spec;
> +	if (metav) {
> +		MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_a,
> +			RTE_BE32(metam->data));
> +		MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_a,
> +			RTE_BE32(metav->data));
> +	}
> +}
> +
> +/**
>   * Update the matcher and the value based the selected item.
>   *
>   * @param[in, out] matcher
> @@ -938,6 +1030,10 @@
>  		flow_dv_translate_item_vxlan(tmatcher->mask.buf, key, item,
>  					     inner);
>  		break;
> +	case RTE_FLOW_ITEM_TYPE_META:
> +		flow_dv_translate_item_meta(tmatcher->mask.buf, key, item);
> +		break;
> +
>  	default:
>  		break;
>  	}
> diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
> index 4e2f9f4..a905397 100644
> --- a/drivers/net/mlx5/mlx5_prm.h
> +++ b/drivers/net/mlx5/mlx5_prm.h
> @@ -159,7 +159,7 @@ struct mlx5_wqe_eth_seg_small {
>  	uint8_t	cs_flags;
>  	uint8_t	rsvd1;
>  	uint16_t mss;
> -	uint32_t rsvd2;
> +	uint32_t flow_table_metadata;
>  	uint16_t inline_hdr_sz;
>  	uint8_t inline_hdr[2];
>  } __rte_aligned(MLX5_WQE_DWORD_SIZE);
> diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
> index 558e6b6..d596e4e 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -523,6 +523,7 @@
>  		uint8_t tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG);
>  		uint32_t swp_offsets = 0;
>  		uint8_t swp_types = 0;
> +		uint32_t metadata = 0;
>  		uint16_t tso_segsz = 0;
>  #ifdef MLX5_PMD_SOFT_COUNTERS
>  		uint32_t total_length = 0;
> @@ -566,6 +567,9 @@
>  		cs_flags = txq_ol_cksum_to_cs(buf);
>  		txq_mbuf_to_swp(txq, buf, (uint8_t *)&swp_offsets, &swp_types);
>  		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
> +		/* Copy metadata from mbuf if valid */
> +		if (buf->ol_flags & PKT_TX_METADATA)
> +			metadata = buf->hash.fdir.hi;

I saw someone suggest adding a union field to name it properly. I also agree
with the idea. It is quite confusing to get metadata from the hash field.

>  		/* Replace the Ethernet type by the VLAN if necessary. */
>  		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
>  			uint32_t vlan = rte_cpu_to_be_32(0x81000000 |
> @@ -781,7 +785,7 @@
>  				swp_offsets,
>  				cs_flags | (swp_types << 8) |
>  				(rte_cpu_to_be_16(tso_segsz) << 16),
> -				0,
> +				rte_cpu_to_be_32(metadata),
>  				(ehdr << 16) | rte_cpu_to_be_16(tso_header_sz),
>  			};
>  		} else {
> @@ -795,7 +799,7 @@
>  			wqe->eseg = (rte_v128u32_t){
>  				swp_offsets,
>  				cs_flags | (swp_types << 8),
> -				0,
> +				rte_cpu_to_be_32(metadata),
>  				(ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz),
>  			};
>  		}
> @@ -861,7 +865,7 @@
>  	mpw->wqe->eseg.inline_hdr_sz = 0;
>  	mpw->wqe->eseg.rsvd0 = 0;
>  	mpw->wqe->eseg.rsvd1 = 0;
> -	mpw->wqe->eseg.rsvd2 = 0;
> +	mpw->wqe->eseg.flow_table_metadata = 0;
>  	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
>  					     (txq->wqe_ci << 8) |
>  					     MLX5_OPCODE_TSO);
> @@ -971,6 +975,8 @@
>  		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
>  		    ((mpw.len != length) ||
>  		     (segs_n != 1) ||
> +		     (mpw.wqe->eseg.flow_table_metadata !=
> +			rte_cpu_to_be_32(buf->hash.fdir.hi)) ||
>  		     (mpw.wqe->eseg.cs_flags != cs_flags)))
>  			mlx5_mpw_close(txq, &mpw);
>  		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
> @@ -984,6 +990,8 @@
>  			max_wqe -= 2;
>  			mlx5_mpw_new(txq, &mpw, length);
>  			mpw.wqe->eseg.cs_flags = cs_flags;
> +			mpw.wqe->eseg.flow_table_metadata =
> +				rte_cpu_to_be_32(buf->hash.fdir.hi);
>  		}
>  		/* Multi-segment packets must be alone in their MPW. */
>  		assert((segs_n == 1) || (mpw.pkts_n == 0));
> @@ -1082,7 +1090,7 @@
>  	mpw->wqe->eseg.cs_flags = 0;
>  	mpw->wqe->eseg.rsvd0 = 0;
>  	mpw->wqe->eseg.rsvd1 = 0;
> -	mpw->wqe->eseg.rsvd2 = 0;
> +	mpw->wqe->eseg.flow_table_metadata = 0;
>  	inl = (struct mlx5_wqe_inl_small *)
>  		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
>  	mpw->data.raw = (uint8_t *)&inl->raw;
> @@ -1199,12 +1207,16 @@
>  		if (mpw.state == MLX5_MPW_STATE_OPENED) {
>  			if ((mpw.len != length) ||
>  			    (segs_n != 1) ||
> +			    (mpw.wqe->eseg.flow_table_metadata !=
> +				rte_cpu_to_be_32(buf->hash.fdir.hi)) ||
>  			    (mpw.wqe->eseg.cs_flags != cs_flags))
>  				mlx5_mpw_close(txq, &mpw);
>  		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
>  			if ((mpw.len != length) ||
>  			    (segs_n != 1) ||
>  			    (length > inline_room) ||
> +			    (mpw.wqe->eseg.flow_table_metadata !=
> +				rte_cpu_to_be_32(buf->hash.fdir.hi)) ||

Isn't this mbuf field valid only if there's PKT_TX_METADATA? Without the flag,
it could be garbage, right? And I think the value in the eseg might not be
metadata of the previous packet if the packet didn't have the flag. It would be
better to define metadata and get it early in this loop like cs_flags. E.g.

		metadata = buf->ol_flags & PKT_TX_METADATA ?
			   rte_cpu_to_be_32(buf->hash.fdir.hi) : 0;
		cs_flags = txq_ol_cksum_to_cs(buf);

Same for the rest of similar funcs.

>  			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
>  				mlx5_mpw_inline_close(txq, &mpw);
>  				inline_room =
> @@ -1224,12 +1236,21 @@
>  				max_wqe -= 2;
>  				mlx5_mpw_new(txq, &mpw, length);
>  				mpw.wqe->eseg.cs_flags = cs_flags;
> +				/* Copy metadata from mbuf if valid */
> +				if (buf->ol_flags & PKT_TX_METADATA)
> +					mpw.wqe->eseg.flow_table_metadata =
> +					rte_cpu_to_be_32(buf->hash.fdir.hi);
> +

Yes, this can be allowed but the following is preferred.
					mpw.wqe->eseg.flow_table_metadata =
						rte_cpu_to_be_32
						(buf->hash.fdir.hi);
>  			} else {
>  				if (unlikely(max_wqe < wqe_inl_n))
>  					break;
>  				max_wqe -= wqe_inl_n;
>  				mlx5_mpw_inline_new(txq, &mpw, length);
>  				mpw.wqe->eseg.cs_flags = cs_flags;
> +				/* Copy metadata from mbuf if valid */
> +				if (buf->ol_flags & PKT_TX_METADATA)
> +					mpw.wqe->eseg.flow_table_metadata =
> +					rte_cpu_to_be_32(buf->hash.fdir.hi);

Same here.

>  			}
>  		}
>  		/* Multi-segment packets must be alone in their MPW. */
> @@ -1482,6 +1503,8 @@
>  			    (length <= txq->inline_max_packet_sz &&
>  			     inl_pad + sizeof(inl_hdr) + length >
>  			     mpw_room) ||
> +			     (mpw.wqe->eseg.flow_table_metadata !=
> +				rte_cpu_to_be_32(buf->hash.fdir.hi)) ||
>  			    (mpw.wqe->eseg.cs_flags != cs_flags))
>  				max_wqe -= mlx5_empw_close(txq, &mpw);
>  		}
> @@ -1505,6 +1528,10 @@
>  				    sizeof(inl_hdr) + length <= mpw_room &&
>  				    !txq->mpw_hdr_dseg;
>  			mpw.wqe->eseg.cs_flags = cs_flags;
> +			/* Copy metadata from mbuf if valid */
> +			if (buf->ol_flags & PKT_TX_METADATA)
> +				mpw.wqe->eseg.flow_table_metadata =
> +					rte_cpu_to_be_32(buf->hash.fdir.hi);
>  		} else {
>  			/* Evaluate whether the next packet can be inlined.
>  			 * Inlininig is possible when:
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c
> index 0a4aed8..132943a 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec.c
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
> @@ -41,6 +41,8 @@
>  
>  /**
>   * Count the number of packets having same ol_flags and calculate cs_flags.
> + * If PKT_TX_METADATA is set in ol_flags, packets must have same metadata
> + * as well.
>   *
>   * @param pkts
>   *   Pointer to array of packets.
> @@ -48,25 +50,38 @@
>   *   Number of packets.
>   * @param cs_flags
>   *   Pointer of flags to be returned.
> + * @param txq_offloads
> + *   Offloads enabled on Tx queue
>   *
>   * @return
> - *   Number of packets having same ol_flags.
> + *   Number of packets having same ol_flags and metadata, if relevant.
>   */
>  static inline unsigned int
> -txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t *cs_flags)
> +txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t *cs_flags,
> +		const uint64_t txq_offloads)
>  {
>  	unsigned int pos;
>  	const uint64_t ol_mask =
>  		PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM |
>  		PKT_TX_UDP_CKSUM | PKT_TX_TUNNEL_GRE |
> -		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM;
> +		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM | PKT_TX_METADATA;
>  
>  	if (!pkts_n)
>  		return 0;
>  	/* Count the number of packets having same ol_flags. */
> -	for (pos = 1; pos < pkts_n; ++pos)
> -		if ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask)
> +	for (pos = 1; pos < pkts_n; ++pos) {
> +		if ((txq_offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP) &&
> +			((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask))
>  			break;
> +		/* If the metadata ol_flag is set,
> +		 *  metadata must be same in all packets.
> +		 */
> +		if ((txq_offloads & DEV_TX_OFFLOAD_MATCH_METADATA) &&
> +			(pkts[pos]->ol_flags & PKT_TX_METADATA) &&
> +			pkts[0]->hash.fdir.hi != pkts[pos]->hash.fdir.hi)
> +			break;
> +	}
> +
>  	*cs_flags = txq_ol_cksum_to_cs(pkts[0]);
>  	return pos;
>  }
> @@ -137,8 +152,10 @@
>  		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
>  		if (txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
>  			n = txq_count_contig_single_seg(&pkts[nb_tx], n);
> -		if (txq->offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP)
> -			n = txq_calc_offload(&pkts[nb_tx], n, &cs_flags);
> +		if (txq->offloads & (MLX5_VEC_TX_CKSUM_OFFLOAD_CAP |
> +				DEV_TX_OFFLOAD_MATCH_METADATA))

How about writing a separate func - txq_count_contig_same_metadata()? And it
would be better to rename txq_calc_offload() to txq_count_contig_same_csflag().
Then, there could be less redundant if-clause in the funcs.

> +			n = txq_calc_offload(&pkts[nb_tx], n,
> +					&cs_flags, txq->offloads);
>  		ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags);
>  		nb_tx += ret;
>  		if (!ret)
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h
> index fb884f9..fda7004 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec.h
> @@ -22,6 +22,7 @@
>  /* HW offload capabilities of vectorized Tx. */
>  #define MLX5_VEC_TX_OFFLOAD_CAP \
>  	(MLX5_VEC_TX_CKSUM_OFFLOAD_CAP | \
> +	 DEV_TX_OFFLOAD_MATCH_METADATA | \
>  	 DEV_TX_OFFLOAD_MULTI_SEGS)
>  
>  /*
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> index b37b738..20c9427 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> @@ -237,6 +237,7 @@
>  	uint8x16_t *t_wqe;
>  	uint8_t *dseg;
>  	uint8x16_t ctrl;
> +	uint32_t md; /* metadata */
>  
>  	/* Make sure all packets can fit into a single WQE. */
>  	assert(elts_n > pkts_n);
> @@ -293,10 +294,11 @@
>  	ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m);
>  	vst1q_u8((void *)t_wqe, ctrl);
>  	/* Fill ESEG in the header. */
> +	md = pkts[0]->hash.fdir.hi;
>  	vst1q_u8((void *)(t_wqe + 1),
>  		 ((uint8x16_t) { 0, 0, 0, 0,
>  				 cs_flags, 0, 0, 0,
> -				 0, 0, 0, 0,
> +				 md >> 24, md >> 16, md >> 8, md,
>  				 0, 0, 0, 0 }));

I know the compiler would do a good job of optimization, but as this is a very
performance-critical path, let's optimize it by adding metadata as an argument
to txq_burst_v().

static inline uint16_t
txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
	    uint8_t cs_flags, rte_be32_t metadata)
{
...
	vst1q_u32((void *)(t_wqe + 1),
		 ((uint32x4_t) { 0, cs_flags, metadata, 0 }));
...
}

mlx5_tx_burst_raw_vec() should call txq_burst_v(..., 0, 0). As 0 is a builtin
constant and txq_burst_v() is inline, it wouldn't cause any performance drop.

In mlx5_tx_burst_vec(), 
	ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags, metadata);

, where metadata is obtained from txq_count_contig_same_metadata() and should be
big-endian.

>  #ifdef MLX5_PMD_SOFT_COUNTERS
>  	txq->stats.opackets += pkts_n;
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> index 54b3783..7c8535c 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> @@ -236,6 +236,7 @@
>  			      0,  1,  2,  3  /* bswap32 */);
>  	__m128i *t_wqe, *dseg;
>  	__m128i ctrl;
> +	uint32_t md; /* metadata */
>  
>  	/* Make sure all packets can fit into a single WQE. */
>  	assert(elts_n > pkts_n);
> @@ -292,9 +293,10 @@
>  	ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
>  	_mm_store_si128(t_wqe, ctrl);
>  	/* Fill ESEG in the header. */
> +	md = pkts[0]->hash.fdir.hi;
>  	_mm_store_si128(t_wqe + 1,
>  			_mm_set_epi8(0, 0, 0, 0,
> -				     0, 0, 0, 0,
> +				     md, md >> 8, md >> 16, md >> 24,
>  				     0, 0, 0, cs_flags,
>  				     0, 0, 0, 0));

If you take my proposal above, this should be:
	_mm_store_si128(t_wqe + 1, _mm_set_epi32(0, metadata, cs_flags, 0));

>  #ifdef MLX5_PMD_SOFT_COUNTERS
> diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
> index f9bc473..7263fb1 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -128,6 +128,12 @@
>  			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
>  				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
>  	}
> +
> +#ifdef HAVE_IBV_FLOW_DV_SUPPORT
> +	if (config->dv_flow_en)
> +		offloads |= DEV_TX_OFFLOAD_MATCH_METADATA;
> +#endif
> +
>  	return offloads;
>  }
>  
> -- 
> 1.8.3.1
>
  
Dekel Peled Oct. 3, 2018, 5:22 a.m. UTC | #2
Thanks, PSB.

> -----Original Message-----
> From: Yongseok Koh
> Sent: Saturday, September 29, 2018 12:09 PM
> To: Dekel Peled <dekelp@mellanox.com>
> Cc: dev@dpdk.org; Shahaf Shuler <shahafs@mellanox.com>; Ori Kam
> <orika@mellanox.com>
> Subject: Re: [PATCH v2] net/mlx5: support metadata as flow rule criteria
> 
> On Thu, Sep 27, 2018 at 05:18:55PM +0300, Dekel Peled wrote:
> > As described in series starting at [1], it adds option to set metadata
> > value as match pattern when creating a new flow rule.
> >
> > This patch adds metadata support in mlx5 driver, in two parts:
> > - Add the setting of metadata value in matcher when creating
> >   a new flow rule.
> > - Add the passing of metadata value from mbuf to wqe when
> >   indicated by ol_flag, in different burst functions.
> >
> > [1] "ethdev: support metadata as flow rule criteria"
> >     http://mails.dpdk.org/archives/dev/2018-September/113270.html
> >
> > Signed-off-by: Dekel Peled <dekelp@mellanox.com>
> > ---
> > V2:
> > Split the support of egress rules to a different patch.
> > ---
> >  drivers/net/mlx5/mlx5_flow.c          |  2 +-
> >  drivers/net/mlx5/mlx5_flow.h          |  8 +++
> >  drivers/net/mlx5/mlx5_flow_dv.c       | 96
> +++++++++++++++++++++++++++++++++++
> >  drivers/net/mlx5/mlx5_prm.h           |  2 +-
> >  drivers/net/mlx5/mlx5_rxtx.c          | 35 +++++++++++--
> >  drivers/net/mlx5/mlx5_rxtx_vec.c      | 31 ++++++++---
> >  drivers/net/mlx5/mlx5_rxtx_vec.h      |  1 +
> >  drivers/net/mlx5/mlx5_rxtx_vec_neon.h |  4 +-
> > drivers/net/mlx5/mlx5_rxtx_vec_sse.h  |  4 +-
> >  drivers/net/mlx5/mlx5_txq.c           |  6 +++
> >  10 files changed, 174 insertions(+), 15 deletions(-)
> >
> > diff --git a/drivers/net/mlx5/mlx5_flow.c
> > b/drivers/net/mlx5/mlx5_flow.c index 8007bf1..9581691 100644
> > --- a/drivers/net/mlx5/mlx5_flow.c
> > +++ b/drivers/net/mlx5/mlx5_flow.c
> > @@ -417,7 +417,7 @@ uint32_t mlx5_flow_adjust_priority(struct
> rte_eth_dev *dev, int32_t priority,
> >   * @return
> >   *   0 on success, a negative errno value otherwise and rte_errno is set.
> >   */
> > -static int
> > +int
> >  mlx5_flow_item_acceptable(const struct rte_flow_item *item,
> >  			  const uint8_t *mask,
> >  			  const uint8_t *nic_mask,
> > diff --git a/drivers/net/mlx5/mlx5_flow.h
> > b/drivers/net/mlx5/mlx5_flow.h index 10d700a..d91ae17 100644
> > --- a/drivers/net/mlx5/mlx5_flow.h
> > +++ b/drivers/net/mlx5/mlx5_flow.h
> > @@ -42,6 +42,9 @@
> >  #define MLX5_FLOW_LAYER_GRE (1u << 14)  #define
> MLX5_FLOW_LAYER_MPLS
> > (1u << 15)
> >
> > +/* General pattern items bits. */
> > +#define MLX5_FLOW_ITEM_METADATA (1u << 16)
> > +
> >  /* Outer Masks. */
> >  #define MLX5_FLOW_LAYER_OUTER_L3 \
> >  	(MLX5_FLOW_LAYER_OUTER_L3_IPV4 |
> MLX5_FLOW_LAYER_OUTER_L3_IPV6) @@
> > -299,6 +302,11 @@ int mlx5_flow_validate_action_rss(const struct
> > rte_flow_action *action,  int mlx5_flow_validate_attributes(struct
> rte_eth_dev *dev,
> >  				  const struct rte_flow_attr *attributes,
> >  				  struct rte_flow_error *error);
> > +int mlx5_flow_item_acceptable(const struct rte_flow_item *item,
> > +			      const uint8_t *mask,
> > +			      const uint8_t *nic_mask,
> > +			      unsigned int size,
> > +			      struct rte_flow_error *error);
> >  int mlx5_flow_validate_item_eth(const struct rte_flow_item *item,
> >  				uint64_t item_flags,
> >  				struct rte_flow_error *error);
> > diff --git a/drivers/net/mlx5/mlx5_flow_dv.c
> > b/drivers/net/mlx5/mlx5_flow_dv.c index cf663cd..2439f5e 100644
> > --- a/drivers/net/mlx5/mlx5_flow_dv.c
> > +++ b/drivers/net/mlx5/mlx5_flow_dv.c
> > @@ -36,6 +36,55 @@
> >  #ifdef HAVE_IBV_FLOW_DV_SUPPORT
> >
> >  /**
> > + * Validate META item.
> > + *
> > + * @param[in] item
> > + *   Item specification.
> > + * @param[in] attr
> > + *   Attributes of flow that includes this item.
> > + * @param[out] error
> > + *   Pointer to error structure.
> > + *
> > + * @return
> > + *   0 on success, a negative errno value otherwise and rte_errno is set.
> > + */
> > +static int
> > +mlx5_flow_validate_item_meta(const struct rte_flow_item *item,
> 
> Naming rule here is starting from flow_dv_*() for static funcs. For global
> funcs, it should start from mlx5_, so it should be mlx5_flow_dv_*().

Renamed function to flow_dv_validate_item_meta.
Left it as static in mlx5_flow_dv.c, since it is relevant for DV only.

> 
> Or, you can put it in the mlx5_flow.c as a common helper function although it
> is used only by DV.
> 
> I prefer the latter.
> 
> > +			const struct rte_flow_attr *attr,
> > +			struct rte_flow_error *error)
> > +{
> > +	const struct rte_flow_item_meta *spec = item->spec;
> > +	const struct rte_flow_item_meta *mask = item->mask;
> > +
> > +	const struct rte_flow_item_meta nic_mask = {
> > +		.data = RTE_BE32(UINT32_MAX)
> > +	};
> > +
> > +	int ret;
> > +
> > +	if (!spec)
> > +		return rte_flow_error_set(error, EINVAL,
> > +
> RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
> > +					  item->spec,
> > +					  "data cannot be empty");
> > +	if (!mask)
> > +		mask = &rte_flow_item_meta_mask;
> > +	ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask,
> > +					(const uint8_t *)&nic_mask,
> > +					sizeof(struct rte_flow_item_meta),
> > +					error);
> > +	if (ret < 0)
> > +		return ret;
> > +
> 
> No blank line is allowed.
Blank line removed.

> 
> > +	if (attr->ingress)
> > +		return rte_flow_error_set(error, ENOTSUP,
> > +
> RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
> > +					  NULL,
> > +					  "pattern not supported for
> ingress");
> 
> If it is only supported with egress flow, then please group this patch with the
> other one - "net/mlx5: allow flow rule with attribute egress", and have it
> applied after that one by specifying [1/2] and [2/2].

I will update the commit log with the relevant data as done 
in http://mails.dpdk.org/archives/dev/2018-September/113280.html

> 
> > +	return 0;
> > +}
> > +
> > +/**
> >   * Verify the @p attributes will be correctly understood by the NIC and
> store
> >   * them in the @p flow if everything is correct.
> >   *
> > @@ -216,6 +265,12 @@
> >  				return ret;
> >  			item_flags |= MLX5_FLOW_LAYER_MPLS;
> >  			break;
> > +		case RTE_FLOW_ITEM_TYPE_META:
> > +			ret = mlx5_flow_validate_item_meta(items, attr,
> error);
> > +			if (ret < 0)
> > +				return ret;
> > +			item_flags |= MLX5_FLOW_ITEM_METADATA;
> > +			break;
> >  		default:
> >  			return rte_flow_error_set(error, ENOTSUP,
> >
> RTE_FLOW_ERROR_TYPE_ITEM,
> > @@ -853,6 +908,43 @@
> >  }
> >
> >  /**
> > + * Add META item to matcher
> > + *
> > + * @param[in, out] matcher
> > + *   Flow matcher.
> > + * @param[in, out] key
> > + *   Flow matcher value.
> > + * @param[in] item
> > + *   Flow pattern to translate.
> > + * @param[in] inner
> > + *   Item is inner pattern.
> > + */
> > +static void
> > +flow_dv_translate_item_meta(void *matcher, void *key,
> > +				const struct rte_flow_item *item) {
> > +	const struct rte_flow_item_meta *metam;
> > +	const struct rte_flow_item_meta *metav;
> 
> Ori changed naming like meta_m and meta_v.

Renamed.

> 
> > +	void *misc2_m =
> > +		MLX5_ADDR_OF(fte_match_param, matcher,
> misc_parameters_2);
> > +	void *misc2_v =
> > +		MLX5_ADDR_OF(fte_match_param, key,
> misc_parameters_2);
> > +
> > +	metam = (const void *)item->mask;
> > +	if (!metam)
> > +		metam = &rte_flow_item_meta_mask;
> > +
> 
> No blank line is allowed except for the one after variable declaration. Please
> remove such blank lines throughout the whole patch.
Done.

> 
> > +	metav = (const void *)item->spec;
> > +	if (metav) {
> > +		MLX5_SET(fte_match_set_misc2, misc2_m,
> metadata_reg_a,
> > +			RTE_BE32(metam->data));
> > +		MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_a,
> > +			RTE_BE32(metav->data));
> > +	}
> > +}
> > +
> > +/**
> >   * Update the matcher and the value based the selected item.
> >   *
> >   * @param[in, out] matcher
> > @@ -938,6 +1030,10 @@
> >  		flow_dv_translate_item_vxlan(tmatcher->mask.buf, key,
> item,
> >  					     inner);
> >  		break;
> > +	case RTE_FLOW_ITEM_TYPE_META:
> > +		flow_dv_translate_item_meta(tmatcher->mask.buf, key,
> item);
> > +		break;
> > +
> >  	default:
> >  		break;
> >  	}
> > diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
> > index 4e2f9f4..a905397 100644
> > --- a/drivers/net/mlx5/mlx5_prm.h
> > +++ b/drivers/net/mlx5/mlx5_prm.h
> > @@ -159,7 +159,7 @@ struct mlx5_wqe_eth_seg_small {
> >  	uint8_t	cs_flags;
> >  	uint8_t	rsvd1;
> >  	uint16_t mss;
> > -	uint32_t rsvd2;
> > +	uint32_t flow_table_metadata;
> >  	uint16_t inline_hdr_sz;
> >  	uint8_t inline_hdr[2];
> >  } __rte_aligned(MLX5_WQE_DWORD_SIZE);
> > diff --git a/drivers/net/mlx5/mlx5_rxtx.c
> > b/drivers/net/mlx5/mlx5_rxtx.c index 558e6b6..d596e4e 100644
> > --- a/drivers/net/mlx5/mlx5_rxtx.c
> > +++ b/drivers/net/mlx5/mlx5_rxtx.c
> > @@ -523,6 +523,7 @@
> >  		uint8_t tso = txq->tso_en && (buf->ol_flags &
> PKT_TX_TCP_SEG);
> >  		uint32_t swp_offsets = 0;
> >  		uint8_t swp_types = 0;
> > +		uint32_t metadata = 0;
> >  		uint16_t tso_segsz = 0;
> >  #ifdef MLX5_PMD_SOFT_COUNTERS
> >  		uint32_t total_length = 0;
> > @@ -566,6 +567,9 @@
> >  		cs_flags = txq_ol_cksum_to_cs(buf);
> >  		txq_mbuf_to_swp(txq, buf, (uint8_t *)&swp_offsets,
> &swp_types);
> >  		raw = ((uint8_t *)(uintptr_t)wqe) + 2 *
> MLX5_WQE_DWORD_SIZE;
> > +		/* Copy metadata from mbuf if valid */
> > +		if (buf->ol_flags & PKT_TX_METADATA)
> > +			metadata = buf->hash.fdir.hi;
> 
> I saw someone suggest adding a union field to name it properly. I also
> agree with the idea. It is quite confusing to get metadata from the hash field.

Done, added field tx_metadata in union mbuf.hash.

> 
> >  		/* Replace the Ethernet type by the VLAN if necessary. */
> >  		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
> >  			uint32_t vlan = rte_cpu_to_be_32(0x81000000 | @@
> -781,7 +785,7 @@
> >  				swp_offsets,
> >  				cs_flags | (swp_types << 8) |
> >  				(rte_cpu_to_be_16(tso_segsz) << 16),
> > -				0,
> > +				rte_cpu_to_be_32(metadata),
> >  				(ehdr << 16) |
> rte_cpu_to_be_16(tso_header_sz),
> >  			};
> >  		} else {
> > @@ -795,7 +799,7 @@
> >  			wqe->eseg = (rte_v128u32_t){
> >  				swp_offsets,
> >  				cs_flags | (swp_types << 8),
> > -				0,
> > +				rte_cpu_to_be_32(metadata),
> >  				(ehdr << 16) |
> rte_cpu_to_be_16(pkt_inline_sz),
> >  			};
> >  		}
> > @@ -861,7 +865,7 @@
> >  	mpw->wqe->eseg.inline_hdr_sz = 0;
> >  	mpw->wqe->eseg.rsvd0 = 0;
> >  	mpw->wqe->eseg.rsvd1 = 0;
> > -	mpw->wqe->eseg.rsvd2 = 0;
> > +	mpw->wqe->eseg.flow_table_metadata = 0;
> >  	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW
> << 24) |
> >  					     (txq->wqe_ci << 8) |
> >  					     MLX5_OPCODE_TSO);
> > @@ -971,6 +975,8 @@
> >  		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
> >  		    ((mpw.len != length) ||
> >  		     (segs_n != 1) ||
> > +		     (mpw.wqe->eseg.flow_table_metadata !=
> > +			rte_cpu_to_be_32(buf->hash.fdir.hi)) ||
> >  		     (mpw.wqe->eseg.cs_flags != cs_flags)))
> >  			mlx5_mpw_close(txq, &mpw);
> >  		if (mpw.state == MLX5_MPW_STATE_CLOSED) { @@ -984,6
> +990,8 @@
> >  			max_wqe -= 2;
> >  			mlx5_mpw_new(txq, &mpw, length);
> >  			mpw.wqe->eseg.cs_flags = cs_flags;
> > +			mpw.wqe->eseg.flow_table_metadata =
> > +				rte_cpu_to_be_32(buf->hash.fdir.hi);
> >  		}
> >  		/* Multi-segment packets must be alone in their MPW. */
> >  		assert((segs_n == 1) || (mpw.pkts_n == 0)); @@ -1082,7
> +1090,7 @@
> >  	mpw->wqe->eseg.cs_flags = 0;
> >  	mpw->wqe->eseg.rsvd0 = 0;
> >  	mpw->wqe->eseg.rsvd1 = 0;
> > -	mpw->wqe->eseg.rsvd2 = 0;
> > +	mpw->wqe->eseg.flow_table_metadata = 0;
> >  	inl = (struct mlx5_wqe_inl_small *)
> >  		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
> >  	mpw->data.raw = (uint8_t *)&inl->raw; @@ -1199,12 +1207,16 @@
> >  		if (mpw.state == MLX5_MPW_STATE_OPENED) {
> >  			if ((mpw.len != length) ||
> >  			    (segs_n != 1) ||
> > +			    (mpw.wqe->eseg.flow_table_metadata !=
> > +				rte_cpu_to_be_32(buf->hash.fdir.hi)) ||
> >  			    (mpw.wqe->eseg.cs_flags != cs_flags))
> >  				mlx5_mpw_close(txq, &mpw);
> >  		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
> >  			if ((mpw.len != length) ||
> >  			    (segs_n != 1) ||
> >  			    (length > inline_room) ||
> > +			    (mpw.wqe->eseg.flow_table_metadata !=
> > +				rte_cpu_to_be_32(buf->hash.fdir.hi)) ||
> 
> Isn't this mbuf field vaild only if there's PKT_TX_METADATA? Without the
> flag, it could be a garbage, right? And I think the value in the eseg might not
> be metadata of previous packet if the packet didn't have the flag. It would be
> better to define metadata and get it early in this loop like cs_flags. E.g.
> 
> 		metadata = buf->ol_flags & PKT_TX_METADATA ?
> 			   rte_cpu_to_be_32(buf->hash.fdir.hi) : 0;
> 		cs_flags = txq_ol_cksum_to_cs(buf);
> 
> Same for the rest of similar funcs.

Done.

> 
> >  			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
> >  				mlx5_mpw_inline_close(txq, &mpw);
> >  				inline_room =
> > @@ -1224,12 +1236,21 @@
> >  				max_wqe -= 2;
> >  				mlx5_mpw_new(txq, &mpw, length);
> >  				mpw.wqe->eseg.cs_flags = cs_flags;
> > +				/* Copy metadata from mbuf if valid */
> > +				if (buf->ol_flags & PKT_TX_METADATA)
> > +					mpw.wqe-
> >eseg.flow_table_metadata =
> > +					rte_cpu_to_be_32(buf-
> >hash.fdir.hi);
> > +
> 
> Yes, this can be allowed but the following is preferred.
OK.

> 					mpw.wqe-
> >eseg.flow_table_metadata =
> 						rte_cpu_to_be_32
> 						(buf->hash.fdir.hi);
> >  			} else {
> >  				if (unlikely(max_wqe < wqe_inl_n))
> >  					break;
> >  				max_wqe -= wqe_inl_n;
> >  				mlx5_mpw_inline_new(txq, &mpw, length);
> >  				mpw.wqe->eseg.cs_flags = cs_flags;
> > +				/* Copy metadata from mbuf if valid */
> > +				if (buf->ol_flags & PKT_TX_METADATA)
> > +					mpw.wqe-
> >eseg.flow_table_metadata =
> > +					rte_cpu_to_be_32(buf-
> >hash.fdir.hi);
> 
> Same here.
> 
> >  			}
> >  		}
> >  		/* Multi-segment packets must be alone in their MPW. */
> @@ -1482,6
> > +1503,8 @@
> >  			    (length <= txq->inline_max_packet_sz &&
> >  			     inl_pad + sizeof(inl_hdr) + length >
> >  			     mpw_room) ||
> > +			     (mpw.wqe->eseg.flow_table_metadata !=
> > +				rte_cpu_to_be_32(buf->hash.fdir.hi)) ||
> >  			    (mpw.wqe->eseg.cs_flags != cs_flags))
> >  				max_wqe -= mlx5_empw_close(txq, &mpw);
> >  		}
> > @@ -1505,6 +1528,10 @@
> >  				    sizeof(inl_hdr) + length <= mpw_room &&
> >  				    !txq->mpw_hdr_dseg;
> >  			mpw.wqe->eseg.cs_flags = cs_flags;
> > +			/* Copy metadata from mbuf if valid */
> > +			if (buf->ol_flags & PKT_TX_METADATA)
> > +				mpw.wqe->eseg.flow_table_metadata =
> > +					rte_cpu_to_be_32(buf-
> >hash.fdir.hi);
> >  		} else {
> >  			/* Evaluate whether the next packet can be inlined.
> >  			 * Inlininig is possible when:
> > diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c
> > b/drivers/net/mlx5/mlx5_rxtx_vec.c
> > index 0a4aed8..132943a 100644
> > --- a/drivers/net/mlx5/mlx5_rxtx_vec.c
> > +++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
> > @@ -41,6 +41,8 @@
> >
> >  /**
> >   * Count the number of packets having same ol_flags and calculate
> cs_flags.
> > + * If PKT_TX_METADATA is set in ol_flags, packets must have same
> > + metadata
> > + * as well.
> >   *
> >   * @param pkts
> >   *   Pointer to array of packets.
> > @@ -48,25 +50,38 @@
> >   *   Number of packets.
> >   * @param cs_flags
> >   *   Pointer of flags to be returned.
> > + * @param txq_offloads
> > + *   Offloads enabled on Tx queue
> >   *
> >   * @return
> > - *   Number of packets having same ol_flags.
> > + *   Number of packets having same ol_flags and metadata, if relevant.
> >   */
> >  static inline unsigned int
> > -txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t
> > *cs_flags)
> > +txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t
> *cs_flags,
> > +		const uint64_t txq_offloads)
> >  {
> >  	unsigned int pos;
> >  	const uint64_t ol_mask =
> >  		PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM |
> >  		PKT_TX_UDP_CKSUM | PKT_TX_TUNNEL_GRE |
> > -		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM;
> > +		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM |
> PKT_TX_METADATA;
> >
> >  	if (!pkts_n)
> >  		return 0;
> >  	/* Count the number of packets having same ol_flags. */
> > -	for (pos = 1; pos < pkts_n; ++pos)
> > -		if ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask)
> > +	for (pos = 1; pos < pkts_n; ++pos) {
> > +		if ((txq_offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP)
> &&
> > +			((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) &
> ol_mask))
> >  			break;
> > +		/* If the metadata ol_flag is set,
> > +		 *  metadata must be same in all packets.
> > +		 */
> > +		if ((txq_offloads & DEV_TX_OFFLOAD_MATCH_METADATA)
> &&
> > +			(pkts[pos]->ol_flags & PKT_TX_METADATA) &&
> > +			pkts[0]->hash.fdir.hi != pkts[pos]->hash.fdir.hi)
> > +			break;
> > +	}
> > +
> >  	*cs_flags = txq_ol_cksum_to_cs(pkts[0]);
> >  	return pos;
> >  }
> > @@ -137,8 +152,10 @@
> >  		n = RTE_MIN((uint16_t)(pkts_n - nb_tx),
> MLX5_VPMD_TX_MAX_BURST);
> >  		if (txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
> >  			n = txq_count_contig_single_seg(&pkts[nb_tx], n);
> > -		if (txq->offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP)
> > -			n = txq_calc_offload(&pkts[nb_tx], n, &cs_flags);
> > +		if (txq->offloads & (MLX5_VEC_TX_CKSUM_OFFLOAD_CAP |
> > +				DEV_TX_OFFLOAD_MATCH_METADATA))
> 
> How about writing a separate func - txq_count_contig_same_metadata()?
> And it would be better to rename txq_calc_offload() to
> txq_count_contig_same_csflag().
> Then, there could be less redundant if-clause in the funcs.

It was considered during implementation, but we decided to handle all the related logic
in the same function.

> 
> > +			n = txq_calc_offload(&pkts[nb_tx], n,
> > +					&cs_flags, txq->offloads);
> >  		ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags);
> >  		nb_tx += ret;
> >  		if (!ret)
> > diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h
> > b/drivers/net/mlx5/mlx5_rxtx_vec.h
> > index fb884f9..fda7004 100644
> > --- a/drivers/net/mlx5/mlx5_rxtx_vec.h
> > +++ b/drivers/net/mlx5/mlx5_rxtx_vec.h
> > @@ -22,6 +22,7 @@
> >  /* HW offload capabilities of vectorized Tx. */  #define
> > MLX5_VEC_TX_OFFLOAD_CAP \
> >  	(MLX5_VEC_TX_CKSUM_OFFLOAD_CAP | \
> > +	 DEV_TX_OFFLOAD_MATCH_METADATA | \
> >  	 DEV_TX_OFFLOAD_MULTI_SEGS)
> >
> >  /*
> > diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> > b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> > index b37b738..20c9427 100644
> > --- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> > +++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
> > @@ -237,6 +237,7 @@
> >  	uint8x16_t *t_wqe;
> >  	uint8_t *dseg;
> >  	uint8x16_t ctrl;
> > +	uint32_t md; /* metadata */
> >
> >  	/* Make sure all packets can fit into a single WQE. */
> >  	assert(elts_n > pkts_n);
> > @@ -293,10 +294,11 @@
> >  	ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m);
> >  	vst1q_u8((void *)t_wqe, ctrl);
> >  	/* Fill ESEG in the header. */
> > +	md = pkts[0]->hash.fdir.hi;
> >  	vst1q_u8((void *)(t_wqe + 1),
> >  		 ((uint8x16_t) { 0, 0, 0, 0,
> >  				 cs_flags, 0, 0, 0,
> > -				 0, 0, 0, 0,
> > +				 md >> 24, md >> 16, md >> 8, md,
> >  				 0, 0, 0, 0 }));
> 
> I know compiler would be a good optimization but as it is very performance
> critical path, let's optimize it by adding metadata as an argument for
> txq_burst_v().
> 
> static inline uint16_t
> txq_burst_v(struct mlx5_txq_data *txq, struct rte_mbuf **pkts, uint16_t
> pkts_n,
> 	    uint8_t cs_flags, rte_be32_t metadata) { ...
> 	vst1q_u32((void *)(t_wqe + 1),
> 		 ((uint32x4_t) { 0, cs_flags, metadata, 0 })); ...
> }
> 
> mlx5_tx_burst_raw_vec() should call txq_burst_v(..., 0, 0). As 0 is a builtin
> constant and txq_burst_v() is inline, it would get any performance drop.
> 
> In mlx5_tx_burst_vec(),
> 	ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags, metadata);
> 
> , where metadata is gotten from txq_count_contig_same_metadata() and
> should be big-endian.

Done.

> 
> >  #ifdef MLX5_PMD_SOFT_COUNTERS
> >  	txq->stats.opackets += pkts_n;
> > diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> > b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> > index 54b3783..7c8535c 100644
> > --- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> > +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
> > @@ -236,6 +236,7 @@
> >  			      0,  1,  2,  3  /* bswap32 */);
> >  	__m128i *t_wqe, *dseg;
> >  	__m128i ctrl;
> > +	uint32_t md; /* metadata */
> >
> >  	/* Make sure all packets can fit into a single WQE. */
> >  	assert(elts_n > pkts_n);
> > @@ -292,9 +293,10 @@
> >  	ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
> >  	_mm_store_si128(t_wqe, ctrl);
> >  	/* Fill ESEG in the header. */
> > +	md = pkts[0]->hash.fdir.hi;
> >  	_mm_store_si128(t_wqe + 1,
> >  			_mm_set_epi8(0, 0, 0, 0,
> > -				     0, 0, 0, 0,
> > +				     md, md >> 8, md >> 16, md >> 24,
> >  				     0, 0, 0, cs_flags,
> >  				     0, 0, 0, 0));
> 
> If you take my proposal above, this should be:
> 	_mm_store_si128(t_wqe + 1, _mm_set_epi32(0, metadata, cs_flags,
> 0));

Done.

> 
> >  #ifdef MLX5_PMD_SOFT_COUNTERS
> > diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
> > index f9bc473..7263fb1 100644
> > --- a/drivers/net/mlx5/mlx5_txq.c
> > +++ b/drivers/net/mlx5/mlx5_txq.c
> > @@ -128,6 +128,12 @@
> >  			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
> >  				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
> >  	}
> > +
> > +#ifdef HAVE_IBV_FLOW_DV_SUPPORT
> > +	if (config->dv_flow_en)
> > +		offloads |= DEV_TX_OFFLOAD_MATCH_METADATA; #endif
> > +
> >  	return offloads;
> >  }
> >
> > --
> > 1.8.3.1
> >
  
Yongseok Koh Oct. 3, 2018, 7:22 a.m. UTC | #3
> On Oct 2, 2018, at 10:22 PM, Dekel Peled <dekelp@mellanox.com> wrote:
> 
>>> @@ -137,8 +152,10 @@
>>> 		n = RTE_MIN((uint16_t)(pkts_n - nb_tx),
>> MLX5_VPMD_TX_MAX_BURST);
>>> 		if (txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
>>> 			n = txq_count_contig_single_seg(&pkts[nb_tx], n);
>>> -		if (txq->offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP)
>>> -			n = txq_calc_offload(&pkts[nb_tx], n, &cs_flags);
>>> +		if (txq->offloads & (MLX5_VEC_TX_CKSUM_OFFLOAD_CAP |
>>> +				DEV_TX_OFFLOAD_MATCH_METADATA))
>> 
>> How about writing a separate func - txq_count_contig_same_metadata()?
>> And it would be better to rename txq_calc_offload() to
>> txq_count_contig_same_csflag().
>> Then, there could be less redundant if-clause in the funcs.
> 
> It was considered during implementation but decided to handle all related logic
> In same function.

But it doesn't look efficient. Note that this is a performance-critical datapath.

	if (A) {
		for (n)
			do_a();
	}
	if (B) {
		for (n)
			do_b();
	}

vs.

	if (A or B) {
		for (n) {
			if (A)
				do_a();
			if (B)
				do_b();
		}
	}

In the worst case, conditions A and B will be checked n times each, whereas in the
first case each is checked only once.

Thanks,
Yongseok
  

Patch

diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 8007bf1..9581691 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -417,7 +417,7 @@  uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
  * @return
  *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
-static int
+int
 mlx5_flow_item_acceptable(const struct rte_flow_item *item,
 			  const uint8_t *mask,
 			  const uint8_t *nic_mask,
diff --git a/drivers/net/mlx5/mlx5_flow.h b/drivers/net/mlx5/mlx5_flow.h
index 10d700a..d91ae17 100644
--- a/drivers/net/mlx5/mlx5_flow.h
+++ b/drivers/net/mlx5/mlx5_flow.h
@@ -42,6 +42,9 @@ 
 #define MLX5_FLOW_LAYER_GRE (1u << 14)
 #define MLX5_FLOW_LAYER_MPLS (1u << 15)
 
+/* General pattern items bits. */
+#define MLX5_FLOW_ITEM_METADATA (1u << 16)
+
 /* Outer Masks. */
 #define MLX5_FLOW_LAYER_OUTER_L3 \
 	(MLX5_FLOW_LAYER_OUTER_L3_IPV4 | MLX5_FLOW_LAYER_OUTER_L3_IPV6)
@@ -299,6 +302,11 @@  int mlx5_flow_validate_action_rss(const struct rte_flow_action *action,
 int mlx5_flow_validate_attributes(struct rte_eth_dev *dev,
 				  const struct rte_flow_attr *attributes,
 				  struct rte_flow_error *error);
+int mlx5_flow_item_acceptable(const struct rte_flow_item *item,
+			      const uint8_t *mask,
+			      const uint8_t *nic_mask,
+			      unsigned int size,
+			      struct rte_flow_error *error);
 int mlx5_flow_validate_item_eth(const struct rte_flow_item *item,
 				uint64_t item_flags,
 				struct rte_flow_error *error);
diff --git a/drivers/net/mlx5/mlx5_flow_dv.c b/drivers/net/mlx5/mlx5_flow_dv.c
index cf663cd..2439f5e 100644
--- a/drivers/net/mlx5/mlx5_flow_dv.c
+++ b/drivers/net/mlx5/mlx5_flow_dv.c
@@ -36,6 +36,55 @@ 
 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
 
 /**
+ * Validate META item.
+ *
+ * @param[in] item
+ *   Item specification.
+ * @param[in] attr
+ *   Attributes of flow that includes this item.
+ * @param[out] error
+ *   Pointer to error structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_validate_item_meta(const struct rte_flow_item *item,
+			const struct rte_flow_attr *attr,
+			struct rte_flow_error *error)
+{
+	const struct rte_flow_item_meta *spec = item->spec;
+	const struct rte_flow_item_meta *mask = item->mask;
+
+	const struct rte_flow_item_meta nic_mask = {
+		.data = RTE_BE32(UINT32_MAX)
+	};
+
+	int ret;
+
+	if (!spec)
+		return rte_flow_error_set(error, EINVAL,
+					  RTE_FLOW_ERROR_TYPE_ITEM_SPEC,
+					  item->spec,
+					  "data cannot be empty");
+	if (!mask)
+		mask = &rte_flow_item_meta_mask;
+	ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask,
+					(const uint8_t *)&nic_mask,
+					sizeof(struct rte_flow_item_meta),
+					error);
+	if (ret < 0)
+		return ret;
+
+	if (attr->ingress)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
+					  NULL,
+					  "pattern not supported for ingress");
+	return 0;
+}
+
+/**
  * Verify the @p attributes will be correctly understood by the NIC and store
  * them in the @p flow if everything is correct.
  *
@@ -216,6 +265,12 @@ 
 				return ret;
 			item_flags |= MLX5_FLOW_LAYER_MPLS;
 			break;
+		case RTE_FLOW_ITEM_TYPE_META:
+			ret = mlx5_flow_validate_item_meta(items, attr, error);
+			if (ret < 0)
+				return ret;
+			item_flags |= MLX5_FLOW_ITEM_METADATA;
+			break;
 		default:
 			return rte_flow_error_set(error, ENOTSUP,
 						  RTE_FLOW_ERROR_TYPE_ITEM,
@@ -853,6 +908,43 @@ 
 }
 
 /**
+ * Add META item to matcher
+ *
+ * @param[in, out] matcher
+ *   Flow matcher.
+ * @param[in, out] key
+ *   Flow matcher value.
+ * @param[in] item
+ *   Flow pattern to translate.
+ * @param[in] inner
+ *   Item is inner pattern.
+ */
+static void
+flow_dv_translate_item_meta(void *matcher, void *key,
+				const struct rte_flow_item *item)
+{
+	const struct rte_flow_item_meta *metam;
+	const struct rte_flow_item_meta *metav;
+
+	void *misc2_m =
+		MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters_2);
+	void *misc2_v =
+		MLX5_ADDR_OF(fte_match_param, key, misc_parameters_2);
+
+	metam = (const void *)item->mask;
+	if (!metam)
+		metam = &rte_flow_item_meta_mask;
+
+	metav = (const void *)item->spec;
+	if (metav) {
+		MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_a,
+			RTE_BE32(metam->data));
+		MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_a,
+			RTE_BE32(metav->data));
+	}
+}
+
+/**
  * Update the matcher and the value based the selected item.
  *
  * @param[in, out] matcher
@@ -938,6 +1030,10 @@ 
 		flow_dv_translate_item_vxlan(tmatcher->mask.buf, key, item,
 					     inner);
 		break;
+	case RTE_FLOW_ITEM_TYPE_META:
+		flow_dv_translate_item_meta(tmatcher->mask.buf, key, item);
+		break;
+
 	default:
 		break;
 	}
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 4e2f9f4..a905397 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -159,7 +159,7 @@  struct mlx5_wqe_eth_seg_small {
 	uint8_t	cs_flags;
 	uint8_t	rsvd1;
 	uint16_t mss;
-	uint32_t rsvd2;
+	uint32_t flow_table_metadata;
 	uint16_t inline_hdr_sz;
 	uint8_t inline_hdr[2];
 } __rte_aligned(MLX5_WQE_DWORD_SIZE);
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 558e6b6..d596e4e 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -523,6 +523,7 @@ 
 		uint8_t tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG);
 		uint32_t swp_offsets = 0;
 		uint8_t swp_types = 0;
+		uint32_t metadata = 0;
 		uint16_t tso_segsz = 0;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		uint32_t total_length = 0;
@@ -566,6 +567,9 @@ 
 		cs_flags = txq_ol_cksum_to_cs(buf);
 		txq_mbuf_to_swp(txq, buf, (uint8_t *)&swp_offsets, &swp_types);
 		raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
+		/* Copy metadata from mbuf if valid */
+		if (buf->ol_flags & PKT_TX_METADATA)
+			metadata = buf->hash.fdir.hi;
 		/* Replace the Ethernet type by the VLAN if necessary. */
 		if (buf->ol_flags & PKT_TX_VLAN_PKT) {
 			uint32_t vlan = rte_cpu_to_be_32(0x81000000 |
@@ -781,7 +785,7 @@ 
 				swp_offsets,
 				cs_flags | (swp_types << 8) |
 				(rte_cpu_to_be_16(tso_segsz) << 16),
-				0,
+				rte_cpu_to_be_32(metadata),
 				(ehdr << 16) | rte_cpu_to_be_16(tso_header_sz),
 			};
 		} else {
@@ -795,7 +799,7 @@ 
 			wqe->eseg = (rte_v128u32_t){
 				swp_offsets,
 				cs_flags | (swp_types << 8),
-				0,
+				rte_cpu_to_be_32(metadata),
 				(ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz),
 			};
 		}
@@ -861,7 +865,7 @@ 
 	mpw->wqe->eseg.inline_hdr_sz = 0;
 	mpw->wqe->eseg.rsvd0 = 0;
 	mpw->wqe->eseg.rsvd1 = 0;
-	mpw->wqe->eseg.rsvd2 = 0;
+	mpw->wqe->eseg.flow_table_metadata = 0;
 	mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
 					     (txq->wqe_ci << 8) |
 					     MLX5_OPCODE_TSO);
@@ -971,6 +975,8 @@ 
 		if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
 		    ((mpw.len != length) ||
 		     (segs_n != 1) ||
+		     (mpw.wqe->eseg.flow_table_metadata !=
+			rte_cpu_to_be_32(buf->hash.fdir.hi)) ||
 		     (mpw.wqe->eseg.cs_flags != cs_flags)))
 			mlx5_mpw_close(txq, &mpw);
 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
@@ -984,6 +990,8 @@ 
 			max_wqe -= 2;
 			mlx5_mpw_new(txq, &mpw, length);
 			mpw.wqe->eseg.cs_flags = cs_flags;
+			mpw.wqe->eseg.flow_table_metadata =
+				rte_cpu_to_be_32(buf->hash.fdir.hi);
 		}
 		/* Multi-segment packets must be alone in their MPW. */
 		assert((segs_n == 1) || (mpw.pkts_n == 0));
@@ -1082,7 +1090,7 @@ 
 	mpw->wqe->eseg.cs_flags = 0;
 	mpw->wqe->eseg.rsvd0 = 0;
 	mpw->wqe->eseg.rsvd1 = 0;
-	mpw->wqe->eseg.rsvd2 = 0;
+	mpw->wqe->eseg.flow_table_metadata = 0;
 	inl = (struct mlx5_wqe_inl_small *)
 		(((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
 	mpw->data.raw = (uint8_t *)&inl->raw;
@@ -1199,12 +1207,16 @@ 
 		if (mpw.state == MLX5_MPW_STATE_OPENED) {
 			if ((mpw.len != length) ||
 			    (segs_n != 1) ||
+			    (mpw.wqe->eseg.flow_table_metadata !=
+				rte_cpu_to_be_32(buf->hash.fdir.hi)) ||
 			    (mpw.wqe->eseg.cs_flags != cs_flags))
 				mlx5_mpw_close(txq, &mpw);
 		} else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
 			if ((mpw.len != length) ||
 			    (segs_n != 1) ||
 			    (length > inline_room) ||
+			    (mpw.wqe->eseg.flow_table_metadata !=
+				rte_cpu_to_be_32(buf->hash.fdir.hi)) ||
 			    (mpw.wqe->eseg.cs_flags != cs_flags)) {
 				mlx5_mpw_inline_close(txq, &mpw);
 				inline_room =
@@ -1224,12 +1236,21 @@ 
 				max_wqe -= 2;
 				mlx5_mpw_new(txq, &mpw, length);
 				mpw.wqe->eseg.cs_flags = cs_flags;
+				/* Copy metadata from mbuf if valid */
+				if (buf->ol_flags & PKT_TX_METADATA)
+					mpw.wqe->eseg.flow_table_metadata =
+					rte_cpu_to_be_32(buf->hash.fdir.hi);
+
 			} else {
 				if (unlikely(max_wqe < wqe_inl_n))
 					break;
 				max_wqe -= wqe_inl_n;
 				mlx5_mpw_inline_new(txq, &mpw, length);
 				mpw.wqe->eseg.cs_flags = cs_flags;
+				/* Copy metadata from mbuf if valid */
+				if (buf->ol_flags & PKT_TX_METADATA)
+					mpw.wqe->eseg.flow_table_metadata =
+					rte_cpu_to_be_32(buf->hash.fdir.hi);
 			}
 		}
 		/* Multi-segment packets must be alone in their MPW. */
@@ -1482,6 +1503,8 @@ 
 			    (length <= txq->inline_max_packet_sz &&
 			     inl_pad + sizeof(inl_hdr) + length >
 			     mpw_room) ||
+			     (mpw.wqe->eseg.flow_table_metadata !=
+				rte_cpu_to_be_32(buf->hash.fdir.hi)) ||
 			    (mpw.wqe->eseg.cs_flags != cs_flags))
 				max_wqe -= mlx5_empw_close(txq, &mpw);
 		}
@@ -1505,6 +1528,10 @@ 
 				    sizeof(inl_hdr) + length <= mpw_room &&
 				    !txq->mpw_hdr_dseg;
 			mpw.wqe->eseg.cs_flags = cs_flags;
+			/* Copy metadata from mbuf if valid */
+			if (buf->ol_flags & PKT_TX_METADATA)
+				mpw.wqe->eseg.flow_table_metadata =
+					rte_cpu_to_be_32(buf->hash.fdir.hi);
 		} else {
 			/* Evaluate whether the next packet can be inlined.
 			 * Inlininig is possible when:
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.c b/drivers/net/mlx5/mlx5_rxtx_vec.c
index 0a4aed8..132943a 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.c
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.c
@@ -41,6 +41,8 @@ 
 
 /**
  * Count the number of packets having same ol_flags and calculate cs_flags.
+ * If PKT_TX_METADATA is set in ol_flags, packets must have same metadata
+ * as well.
  *
  * @param pkts
  *   Pointer to array of packets.
@@ -48,25 +50,38 @@ 
  *   Number of packets.
  * @param cs_flags
  *   Pointer of flags to be returned.
+ * @param txq_offloads
+ *   Offloads enabled on Tx queue
  *
  * @return
- *   Number of packets having same ol_flags.
+ *   Number of packets having same ol_flags and metadata, if relevant.
  */
 static inline unsigned int
-txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t *cs_flags)
+txq_calc_offload(struct rte_mbuf **pkts, uint16_t pkts_n, uint8_t *cs_flags,
+		const uint64_t txq_offloads)
 {
 	unsigned int pos;
 	const uint64_t ol_mask =
 		PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM |
 		PKT_TX_UDP_CKSUM | PKT_TX_TUNNEL_GRE |
-		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM;
+		PKT_TX_TUNNEL_VXLAN | PKT_TX_OUTER_IP_CKSUM | PKT_TX_METADATA;
 
 	if (!pkts_n)
 		return 0;
 	/* Count the number of packets having same ol_flags. */
-	for (pos = 1; pos < pkts_n; ++pos)
-		if ((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask)
+	for (pos = 1; pos < pkts_n; ++pos) {
+		if ((txq_offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP) &&
+			((pkts[pos]->ol_flags ^ pkts[0]->ol_flags) & ol_mask))
 			break;
+		/* If the metadata ol_flag is set,
+		 *  metadata must be same in all packets.
+		 */
+		if ((txq_offloads & DEV_TX_OFFLOAD_MATCH_METADATA) &&
+			(pkts[pos]->ol_flags & PKT_TX_METADATA) &&
+			pkts[0]->hash.fdir.hi != pkts[pos]->hash.fdir.hi)
+			break;
+	}
+
 	*cs_flags = txq_ol_cksum_to_cs(pkts[0]);
 	return pos;
 }
@@ -137,8 +152,10 @@ 
 		n = RTE_MIN((uint16_t)(pkts_n - nb_tx), MLX5_VPMD_TX_MAX_BURST);
 		if (txq->offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
 			n = txq_count_contig_single_seg(&pkts[nb_tx], n);
-		if (txq->offloads & MLX5_VEC_TX_CKSUM_OFFLOAD_CAP)
-			n = txq_calc_offload(&pkts[nb_tx], n, &cs_flags);
+		if (txq->offloads & (MLX5_VEC_TX_CKSUM_OFFLOAD_CAP |
+				DEV_TX_OFFLOAD_MATCH_METADATA))
+			n = txq_calc_offload(&pkts[nb_tx], n,
+					&cs_flags, txq->offloads);
 		ret = txq_burst_v(txq, &pkts[nb_tx], n, cs_flags);
 		nb_tx += ret;
 		if (!ret)
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h
index fb884f9..fda7004 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.h
@@ -22,6 +22,7 @@ 
 /* HW offload capabilities of vectorized Tx. */
 #define MLX5_VEC_TX_OFFLOAD_CAP \
 	(MLX5_VEC_TX_CKSUM_OFFLOAD_CAP | \
+	 DEV_TX_OFFLOAD_MATCH_METADATA | \
 	 DEV_TX_OFFLOAD_MULTI_SEGS)
 
 /*
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
index b37b738..20c9427 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_neon.h
@@ -237,6 +237,7 @@ 
 	uint8x16_t *t_wqe;
 	uint8_t *dseg;
 	uint8x16_t ctrl;
+	uint32_t md; /* metadata */
 
 	/* Make sure all packets can fit into a single WQE. */
 	assert(elts_n > pkts_n);
@@ -293,10 +294,11 @@ 
 	ctrl = vqtbl1q_u8(ctrl, ctrl_shuf_m);
 	vst1q_u8((void *)t_wqe, ctrl);
 	/* Fill ESEG in the header. */
+	md = pkts[0]->hash.fdir.hi;
 	vst1q_u8((void *)(t_wqe + 1),
 		 ((uint8x16_t) { 0, 0, 0, 0,
 				 cs_flags, 0, 0, 0,
-				 0, 0, 0, 0,
+				 md >> 24, md >> 16, md >> 8, md,
 				 0, 0, 0, 0 }));
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	txq->stats.opackets += pkts_n;
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index 54b3783..7c8535c 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -236,6 +236,7 @@ 
 			      0,  1,  2,  3  /* bswap32 */);
 	__m128i *t_wqe, *dseg;
 	__m128i ctrl;
+	uint32_t md; /* metadata */
 
 	/* Make sure all packets can fit into a single WQE. */
 	assert(elts_n > pkts_n);
@@ -292,9 +293,10 @@ 
 	ctrl = _mm_shuffle_epi8(ctrl, shuf_mask_ctrl);
 	_mm_store_si128(t_wqe, ctrl);
 	/* Fill ESEG in the header. */
+	md = pkts[0]->hash.fdir.hi;
 	_mm_store_si128(t_wqe + 1,
 			_mm_set_epi8(0, 0, 0, 0,
-				     0, 0, 0, 0,
+				     md, md >> 8, md >> 16, md >> 24,
 				     0, 0, 0, cs_flags,
 				     0, 0, 0, 0));
 #ifdef MLX5_PMD_SOFT_COUNTERS
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index f9bc473..7263fb1 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -128,6 +128,12 @@ 
 			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
 				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
 	}
+
+#ifdef HAVE_IBV_FLOW_DV_SUPPORT
+	if (config->dv_flow_en)
+		offloads |= DEV_TX_OFFLOAD_MATCH_METADATA;
+#endif
+
 	return offloads;
 }