net/mlx5: support Flow Tag and Packet Header miniCQEs
diff mbox series

Message ID 20201028023719.14651-1-akozyrev@nvidia.com
State Superseded, archived
Delegated to: Raslan Darawsheh
Headers show
Series
  • net/mlx5: support Flow Tag and Packet Header miniCQEs
Related show

Checks

Context Check Description
ci/iol-testing fail Testing issues
ci/Intel-compilation success Compilation OK
ci/checkpatch warning coding style issues

Commit Message

Alexander Kozyrev Oct. 28, 2020, 2:37 a.m. UTC
CQE compression allows us to save the PCI bandwidth and improve
the performance by compressing several CQEs together into a miniCQE.
But the miniCQE size is only 8 bytes and this limits the ability
to successfully keep the compression session in case of various
traffic patterns.

The current miniCQE format only keeps the compression session alive
in case of uniform traffic with the Hash RSS as the only difference.
There are requests to keep the compression session in case of tagged
traffic by RTE Flow Mark Id and mixed UDP/TCP and IPv4/IPv6 traffic.
Add 2 new miniCQE formats in order to achieve the best performance
for these traffic patterns: Flow Tag and Packet Header miniCQEs.

The existing rxq_cqe_comp_en devarg is modified to specify the
desired miniCQE format. Specifying 2 selects Flow Tag format
for better compression rate in case of RTE Flow Mark traffic.
Specifying 3 selects Checksum format (existing format for MPRQ).
Specifying 4 selects L3/L4 Header format for better compression
rate in case of mixed TCP/UDP and IPv4/IPv6 traffic.

Signed-off-by: Alexander Kozyrev <akozyrev@nvidia.com>
---
 doc/guides/nics/mlx5.rst               |   8 +
 doc/guides/rel_notes/release_20_11.rst |   2 +
 drivers/common/mlx5/mlx5_devx_cmds.c   |   7 +-
 drivers/common/mlx5/mlx5_devx_cmds.h   |   1 +
 drivers/common/mlx5/mlx5_prm.h         |  27 +++-
 drivers/net/mlx5/mlx5.c                |   7 +
 drivers/net/mlx5/mlx5.h                |   1 +
 drivers/net/mlx5/mlx5_devx.c           |  42 +++--
 drivers/net/mlx5/mlx5_rxtx.c           | 134 ++++++++++------
 drivers/net/mlx5/mlx5_rxtx.h           |   2 +
 drivers/net/mlx5/mlx5_rxtx_vec_sse.h   | 202 +++++++++++++++++--------
 11 files changed, 303 insertions(+), 130 deletions(-)

Patch
diff mbox series

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index e5e55fc409..72b026a5aa 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -431,6 +431,14 @@  Driver options
 
   A nonzero value enables the compression of CQE on RX side. This feature
   allows to save PCI bandwidth and improve performance. Enabled by default.
+  Different compression formats are supported in order to achieve the best
+  performance for different traffic patterns. Hash RSS format is the default.
+
+  Specifying 2 as a ``rxq_cqe_comp_en`` value selects Flow Tag format for
+  better compression rate in case of RTE Flow Mark traffic.
+  Specifying 3 as a ``rxq_cqe_comp_en`` value selects Checksum format.
+  Specifying 4 as a ``rxq_cqe_comp_en`` value selects L3/L4 Header format for
+  better compression rate in case of mixed TCP/UDP and IPv4/IPv6 traffic.
 
   Supported on:
 
diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst
index e2847712e8..9bd0f96c12 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -358,6 +358,8 @@  New Features
   * Added support for QinQ packets matching.
   * Added support for the new vlan fields ``has_vlan`` in the eth item and
     ``has_more_vlan`` in the vlan item.
+  * Added vectorized Multi-Packet Rx Queue burst.
+  * Added support for 2 new miniCQE formats: Flow Tag and L3/L4 header.
 
 * **Updated vhost sample application.**
 
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.c b/drivers/common/mlx5/mlx5_devx_cmds.c
index 8aee12d527..586bdda6aa 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.c
+++ b/drivers/common/mlx5/mlx5_devx_cmds.c
@@ -1564,8 +1564,11 @@  mlx5_devx_cmd_create_cq(void *ctx, struct mlx5_devx_cq_attr *attr)
 		 MLX5_ADAPTER_PAGE_SHIFT);
 	MLX5_SET(cqc, cqctx, c_eqn, attr->eqn);
 	MLX5_SET(cqc, cqctx, uar_page, attr->uar_page_id);
-	MLX5_SET(cqc, cqctx, cqe_comp_en, attr->cqe_comp_en);
-	MLX5_SET(cqc, cqctx, mini_cqe_res_format, attr->mini_cqe_res_format);
+	MLX5_SET(cqc, cqctx, cqe_comp_en, !!attr->cqe_comp_en);
+	MLX5_SET(cqc, cqctx, mini_cqe_res_format,
+		 attr->mini_cqe_res_format);
+	MLX5_SET(cqc, cqctx, mini_cqe_res_format_ext,
+		 attr->mini_cqe_res_format_ext);
 	MLX5_SET(cqc, cqctx, cqe_sz, attr->cqe_size);
 	if (attr->q_umem_valid) {
 		MLX5_SET(create_cq_in, in, cq_umem_valid, attr->q_umem_valid);
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.h b/drivers/common/mlx5/mlx5_devx_cmds.h
index abbea67784..ab33ce3046 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.h
+++ b/drivers/common/mlx5/mlx5_devx_cmds.h
@@ -255,6 +255,7 @@  struct mlx5_devx_cq_attr {
 	uint32_t overrun_ignore:1;
 	uint32_t cqe_comp_en:1;
 	uint32_t mini_cqe_res_format:2;
+	uint32_t mini_cqe_res_format_ext:2;
 	uint32_t cqe_size:3;
 	uint32_t log_cq_size:5;
 	uint32_t log_page_size:5;
diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index d342263c85..b893d8a348 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -239,6 +239,9 @@ 
 /* Default mark mask for metadata legacy mode. */
 #define MLX5_FLOW_MARK_MASK 0xffffff
 
+/* Byte length mask when mark is enabled in the miniCQE. */
+#define MLX5_LEN_WITH_MARK_MASK 0xffffff00
+
 /* Maximum number of DS in WQE. Limited by 6-bit field. */
 #define MLX5_DSEG_MAX 63
 
@@ -2152,11 +2155,14 @@  struct mlx5_ifc_cqc_bits {
 	u8 cqe_comp_en[0x1];
 	u8 mini_cqe_res_format[0x2];
 	u8 st[0x4];
-	u8 reserved_at_18[0x8];
+	u8 reserved_at_18[0x1];
+	u8 cqe_comp_layout[0x7];
 	u8 dbr_umem_id[0x20];
 	u8 reserved_at_40[0x14];
 	u8 page_offset[0x6];
-	u8 reserved_at_5a[0x6];
+	u8 reserved_at_5a[0x2];
+	u8 mini_cqe_res_format_ext[0x2];
+	u8 cq_timestamp_format[0x2];
 	u8 reserved_at_60[0x3];
 	u8 log_cq_size[0x5];
 	u8 uar_page[0x18];
@@ -2913,7 +2919,14 @@  struct mlx5_mini_cqe8 {
 	union {
 		uint32_t rx_hash_result;
 		struct {
-			uint16_t checksum;
+			union {
+				uint16_t checksum;
+				uint16_t flow_tag_high;
+				union {
+					uint8_t reserved;
+					uint8_t hdr_type;
+				};
+			};
 			uint16_t stride_idx;
 		};
 		struct {
@@ -2922,15 +2935,19 @@  struct mlx5_mini_cqe8 {
 			uint8_t  reserved;
 		} s_wqe_info;
 	};
-	uint32_t byte_cnt;
+	union {
+		uint32_t byte_cnt_flow;
+		uint32_t byte_cnt;
+	};
 };
 
 /* Mini CQE responder format. */
 enum {
 	MLX5_CQE_RESP_FORMAT_HASH = 0x0,
 	MLX5_CQE_RESP_FORMAT_CSUM = 0x1,
-	MLX5_CQE_RESP_FORMAT_CSUM_FLOW_TAG = 0x2,
+	MLX5_CQE_RESP_FORMAT_FTAG_STRIDX = 0x2,
 	MLX5_CQE_RESP_FORMAT_CSUM_STRIDX = 0x3,
+	MLX5_CQE_RESP_FORMAT_L34H_STRIDX = 0x4,
 };
 
 /* srTCM PRM flow meter parameters. */
diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 27c9c2abb6..9fd8f0ebbf 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1545,7 +1545,14 @@  mlx5_args_check(const char *key, const char *val, void *opaque)
 	}
 	mod = tmp >= 0 ? tmp : -tmp;
 	if (strcmp(MLX5_RXQ_CQE_COMP_EN, key) == 0) {
+		if (tmp > MLX5_CQE_RESP_FORMAT_L34H_STRIDX) {
+			DRV_LOG(ERR, "invalid CQE compression "
+				     "format parameter");
+			rte_errno = EINVAL;
+			return -rte_errno;
+		}
 		config->cqe_comp = !!tmp;
+		config->cqe_comp_fmt = tmp;
 	} else if (strcmp(MLX5_RXQ_CQE_PAD_EN, key) == 0) {
 		config->cqe_pad = !!tmp;
 	} else if (strcmp(MLX5_RXQ_PKT_PAD_EN, key) == 0) {
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 8de5842cc7..941a049179 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -192,6 +192,7 @@  struct mlx5_dev_config {
 	/* Whether tunnel stateless offloads are supported. */
 	unsigned int mpls_en:1; /* MPLS over GRE/UDP is enabled. */
 	unsigned int cqe_comp:1; /* CQE compression is enabled. */
+	unsigned int cqe_comp_fmt:3; /* CQE compression format. */
 	unsigned int cqe_pad:1; /* CQE padding is enabled. */
 	unsigned int tso:1; /* Whether TSO is supported. */
 	unsigned int rx_vec_en:1; /* Rx vector is enabled. */
diff --git a/drivers/net/mlx5/mlx5_devx.c b/drivers/net/mlx5/mlx5_devx.c
index 5fce4cd555..1b179abe95 100644
--- a/drivers/net/mlx5/mlx5_devx.c
+++ b/drivers/net/mlx5/mlx5_devx.c
@@ -437,17 +437,37 @@  mlx5_rxq_create_devx_cq_resources(struct rte_eth_dev *dev, uint16_t idx)
 	if (priv->config.cqe_comp && !rxq_data->hw_timestamp &&
 	    !rxq_data->lro) {
 		cq_attr.cqe_comp_en = 1u;
-		/*
-		 * Select CSUM miniCQE format only for non-vectorized MPRQ
-		 * Rx burst, use HASH miniCQE format for everything else.
-		 */
-		if (mlx5_rxq_check_vec_support(rxq_data) < 0 &&
-			mlx5_rxq_mprq_enabled(rxq_data))
-			cq_attr.mini_cqe_res_format =
-				MLX5_CQE_RESP_FORMAT_CSUM_STRIDX;
-		else
-			cq_attr.mini_cqe_res_format =
-				MLX5_CQE_RESP_FORMAT_HASH;
+		rxq_data->mcqe_format = priv->config.cqe_comp_fmt;
+		rxq_data->byte_mask = UINT32_MAX;
+		switch (priv->config.cqe_comp_fmt) {
+		case MLX5_CQE_RESP_FORMAT_HASH:
+		case MLX5_CQE_RESP_FORMAT_CSUM:
+			/*
+			 * Select CSUM miniCQE format only for non-vectorized
+			 * MPRQ Rx burst, use HASH miniCQE format for others.
+			 */
+			if (mlx5_rxq_check_vec_support(rxq_data) < 0 &&
+			    mlx5_rxq_mprq_enabled(rxq_data))
+				cq_attr.mini_cqe_res_format =
+					MLX5_CQE_RESP_FORMAT_CSUM_STRIDX;
+			else
+				cq_attr.mini_cqe_res_format =
+					MLX5_CQE_RESP_FORMAT_HASH;
+			rxq_data->mcqe_format = cq_attr.mini_cqe_res_format;
+			break;
+		case MLX5_CQE_RESP_FORMAT_FTAG_STRIDX:
+			rxq_data->byte_mask = MLX5_LEN_WITH_MARK_MASK;
+		case MLX5_CQE_RESP_FORMAT_CSUM_STRIDX:
+			cq_attr.mini_cqe_res_format = priv->config.cqe_comp_fmt;
+			break;
+		case MLX5_CQE_RESP_FORMAT_L34H_STRIDX:
+			cq_attr.mini_cqe_res_format = 0;
+			cq_attr.mini_cqe_res_format_ext = 1;
+			break;
+		}
+		DRV_LOG(DEBUG,
+			"Port %u Rx CQE compression is enabled, format %d.",
+			dev->data->port_id, priv->config.cqe_comp_fmt);
 		/*
 		 * For vectorized Rx, it must not be doubled in order to
 		 * make cq_ci and rq_ci aligned.
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 2ffacf8882..1ecae79372 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -80,7 +80,8 @@  static uint16_t mlx5_tx_burst_##func(void *txq, \
 #define MLX5_TXOFF_INFO(func, olx) {mlx5_tx_burst_##func, olx},
 
 static __rte_always_inline uint32_t
-rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
+rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
+				   volatile struct mlx5_mini_cqe8 *mcqe);
 
 static __rte_always_inline int
 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
@@ -91,7 +92,8 @@  rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe);
 
 static __rte_always_inline void
 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
-	       volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res);
+	       volatile struct mlx5_cqe *cqe,
+	       volatile struct mlx5_mini_cqe8 *mcqe);
 
 static int
 mlx5_queue_state_modify(struct rte_eth_dev *dev,
@@ -100,12 +102,13 @@  mlx5_queue_state_modify(struct rte_eth_dev *dev,
 static inline void
 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
 			volatile struct mlx5_cqe *__rte_restrict cqe,
-			uint32_t phcsum);
+			uint32_t phcsum, uint8_t l4_type);
 
 static inline void
 mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
 		    volatile struct mlx5_cqe *__rte_restrict cqe,
-		    uint32_t len);
+		    volatile struct mlx5_mini_cqe8 *mcqe,
+		    struct mlx5_rxq_data *rxq, uint32_t len);
 
 uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
 	[0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
@@ -813,12 +816,19 @@  mlx5_tx_error_cqe_handle(struct mlx5_txq_data *__rte_restrict txq,
  *   Packet type for struct rte_mbuf.
  */
 static inline uint32_t
-rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
+rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
+				   volatile struct mlx5_mini_cqe8 *mcqe)
 {
 	uint8_t idx;
-	uint8_t pinfo = cqe->pkt_info;
-	uint16_t ptype = cqe->hdr_type_etc;
+	uint8_t ptype;
+	uint8_t pinfo = (cqe->pkt_info & 0x3) << 6;
 
+	/* Get the L3/L4 header from the mini-CQE in case of L3/L4 format. */
+	if (unlikely(mcqe == NULL ||
+		rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX))
+		ptype = (cqe->hdr_type_etc & 0xfc00) >> 10;
+	else
+		ptype = mcqe->hdr_type >> 2;
 	/*
 	 * The index to the array should have:
 	 * bit[1:0] = l3_hdr_type
@@ -827,7 +837,7 @@  rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
 	 * bit[6] = tunneled
 	 * bit[7] = outer_l3_type
 	 */
-	idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
+	idx = pinfo | ptype;
 	return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6));
 }
 
@@ -1131,8 +1141,8 @@  mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
 				(volatile struct mlx5_mini_cqe8 (*)[8])
 				(uintptr_t)(&(*rxq->cqes)[zip->ca &
 							  cqe_cnt].pkt_info);
-
-			len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
+			len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt &
+					       rxq->byte_mask);
 			*mcqe = &(*mc)[zip->ai & 7];
 			if ((++zip->ai & 7) == 0) {
 				/* Invalidate consumed CQEs */
@@ -1210,7 +1220,8 @@  mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
 				--rxq->cq_ci;
 				zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
 				/* Get packet size to return. */
-				len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
+				len = rte_be_to_cpu_32((*mc)[0].byte_cnt &
+						       rxq->byte_mask);
 				*mcqe = &(*mc)[0];
 				zip->ai = 1;
 				/* Prefetch all to be invalidated */
@@ -1274,20 +1285,35 @@  rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe)
  */
 static inline void
 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
-	       volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res)
+	       volatile struct mlx5_cqe *cqe,
+	       volatile struct mlx5_mini_cqe8 *mcqe)
 {
-	/* Update packet information. */
-	pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe);
+	/* If compressed, take hash result from mini-CQE. */
+	uint32_t rss_hash_res = 0;
+	uint32_t mark = 0;
+
+	/* Update packet information. */
+	pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe, mcqe);
+	if (mcqe == NULL || rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_HASH)
+		rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
+	else
+		rss_hash_res = rte_be_to_cpu_32(mcqe->rx_hash_result);
+
 	if (rss_hash_res && rxq->rss_hash) {
 		pkt->hash.rss = rss_hash_res;
 		pkt->ol_flags |= PKT_RX_RSS_HASH;
 	}
-	if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
+	/* If compressed, take flow tag from mini-CQE. */
+	if (mcqe == NULL ||
+	    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_FTAG_STRIDX)
+		mark = cqe->sop_drop_qpn;
+	else
+		mark = ((mcqe->byte_cnt_flow & 0xff) << 8) |
+			(mcqe->flow_tag_high << 16);
+	if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(mark)) {
 		pkt->ol_flags |= PKT_RX_FDIR;
-		if (cqe->sop_drop_qpn !=
+		if (mark !=
 		    rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) {
-			uint32_t mark = cqe->sop_drop_qpn;
-
 			pkt->ol_flags |= PKT_RX_FDIR_ID;
 			pkt->hash.fdir.hi = mlx5_flow_mark_get(mark);
 		}
@@ -1299,10 +1325,20 @@  rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
 	}
 	if (rxq->csum)
 		pkt->ol_flags |= rxq_cq_to_ol_flags(cqe);
-	if (rxq->vlan_strip &&
-	    (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) {
-		pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
-		pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info);
+	if (rxq->vlan_strip) {
+		bool vlan_strip;
+
+		if (mcqe == NULL ||
+		    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
+			vlan_strip = cqe->hdr_type_etc &
+				     rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED);
+		else
+			vlan_strip = mcqe->hdr_type &
+				     rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED);
+		if (vlan_strip) {
+			pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
+			pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info);
+		}
 	}
 	if (rxq->hw_timestamp) {
 		uint64_t ts = rte_be_to_cpu_64(cqe->timestamp);
@@ -1348,7 +1384,6 @@  mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			&((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx];
 		struct rte_mbuf *rep = (*rxq->elts)[idx];
 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
-		uint32_t rss_hash_res;
 
 		if (pkt)
 			NEXT(seg) = rep;
@@ -1387,18 +1422,14 @@  mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			pkt = seg;
 			MLX5_ASSERT(len >= (rxq->crc_present << 2));
 			pkt->ol_flags &= EXT_ATTACHED_MBUF;
-			/* If compressed, take hash result from mini-CQE. */
-			rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ?
-							cqe->rx_hash_res :
-							mcqe->rx_hash_result);
-			rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
+			rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
 			if (rxq->crc_present)
 				len -= RTE_ETHER_CRC_LEN;
 			PKT_LEN(pkt) = len;
 			if (cqe->lro_num_seg > 1) {
 				mlx5_lro_update_hdr
 					(rte_pktmbuf_mtod(pkt, uint8_t *), cqe,
-					 len);
+					 mcqe, rxq, len);
 				pkt->ol_flags |= PKT_RX_LRO;
 				pkt->tso_segsz = len / cqe->lro_num_seg;
 			}
@@ -1468,10 +1499,8 @@  mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 static inline void
 mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
 			volatile struct mlx5_cqe *__rte_restrict cqe,
-			uint32_t phcsum)
+			uint32_t phcsum, uint8_t l4_type)
 {
-	uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
-			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
 	/*
 	 * The HW calculates only the TCP payload checksum, need to complete
 	 * the TCP header checksum and the L3 pseudo-header checksum.
@@ -1510,7 +1539,8 @@  mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *__rte_restrict tcp,
 static inline void
 mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
 		    volatile struct mlx5_cqe *__rte_restrict cqe,
-		    uint32_t len)
+		    volatile struct mlx5_mini_cqe8 *mcqe,
+		    struct mlx5_rxq_data *rxq, uint32_t len)
 {
 	union {
 		struct rte_ether_hdr *eth;
@@ -1524,6 +1554,7 @@  mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
 	};
 	uint16_t proto = h.eth->ether_type;
 	uint32_t phcsum;
+	uint8_t l4_type;
 
 	h.eth++;
 	while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) ||
@@ -1545,7 +1576,14 @@  mlx5_lro_update_hdr(uint8_t *__rte_restrict padd,
 		phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0);
 		h.ipv6++;
 	}
-	mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum);
+	if (mcqe == NULL ||
+	    rxq->mcqe_format != MLX5_CQE_RESP_FORMAT_L34H_STRIDX)
+		l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
+			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
+	else
+		l4_type = (rte_be_to_cpu_16(mcqe->hdr_type) &
+			   MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
+	mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum, l4_type);
 }
 
 void
@@ -1586,6 +1624,7 @@  mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
 	struct mlx5_rxq_data *rxq = dpdk_rxq;
 	const uint32_t strd_n = 1 << rxq->strd_num_n;
+	const uint32_t strd_sz = 1 << rxq->strd_sz_n;
 	const uint32_t cq_mask = (1 << rxq->cqe_n) - 1;
 	const uint32_t wq_mask = (1 << rxq->elts_n) - 1;
 	volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
@@ -1602,7 +1641,6 @@  mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		uint16_t strd_idx;
 		uint32_t byte_cnt;
 		volatile struct mlx5_mini_cqe8 *mcqe = NULL;
-		uint32_t rss_hash_res = 0;
 		enum mlx5_rqx_code rxq_code;
 
 		if (consumed_strd == strd_n) {
@@ -1618,19 +1656,23 @@  mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		if (!ret)
 			break;
 		byte_cnt = ret;
-		strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
-			   MLX5_MPRQ_STRIDE_NUM_SHIFT;
+		len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
+		MLX5_ASSERT((int)len >= (rxq->crc_present << 2));
+		if (rxq->crc_present)
+			len -= RTE_ETHER_CRC_LEN;
+		if (mcqe &&
+		    rxq->mcqe_format == MLX5_CQE_RESP_FORMAT_FTAG_STRIDX)
+			strd_cnt = (len / strd_sz) + !!(len % strd_sz);
+		else
+			strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
+				   MLX5_MPRQ_STRIDE_NUM_SHIFT;
 		MLX5_ASSERT(strd_cnt);
 		consumed_strd += strd_cnt;
 		if (byte_cnt & MLX5_MPRQ_FILLER_MASK)
 			continue;
-		if (mcqe == NULL) {
-			rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
-			strd_idx = rte_be_to_cpu_16(cqe->wqe_counter);
-		} else {
-			/* mini-CQE for MPRQ doesn't have hash result. */
-			strd_idx = rte_be_to_cpu_16(mcqe->stride_idx);
-		}
+		strd_idx = rte_be_to_cpu_16(mcqe == NULL ?
+					cqe->wqe_counter :
+					mcqe->stride_idx);
 		MLX5_ASSERT(strd_idx < strd_n);
 		MLX5_ASSERT(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) &
 			    wq_mask));
@@ -1656,10 +1698,10 @@  mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 				break;
 			}
 		}
-		rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
+		rxq_cq_to_mbuf(rxq, pkt, cqe, mcqe);
 		if (cqe->lro_num_seg > 1) {
 			mlx5_lro_update_hdr(rte_pktmbuf_mtod(pkt, uint8_t *),
-					    cqe, len);
+					    cqe, mcqe, rxq, len);
 			pkt->ol_flags |= PKT_RX_LRO;
 			pkt->tso_segsz = len / cqe->lro_num_seg;
 		}
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 1b5fba4ac7..b3038c4991 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -127,6 +127,8 @@  struct mlx5_rxq_data {
 	unsigned int strd_scatter_en:1; /* Scattered packets from a stride. */
 	unsigned int lro:1; /* Enable LRO. */
 	unsigned int dynf_meta:1; /* Dynamic metadata is configured. */
+	unsigned int mcqe_format:3; /* CQE compression format. */
+	uint32_t byte_mask;
 	volatile uint32_t *rq_db;
 	volatile uint32_t *cq_db;
 	uint16_t port_id;
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
index 59662fa12d..7bae27e5ef 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.h
@@ -78,33 +78,47 @@  rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 	unsigned int pos;
 	unsigned int i;
 	unsigned int inv = 0;
+	const int32_t head =
+		(rxq->mcqe_format == MLX5_CQE_RESP_FORMAT_L34H_STRIDX) ? 0 : -1;
+	const int32_t ftag =
+		(rxq->mcqe_format == MLX5_CQE_RESP_FORMAT_FTAG_STRIDX) ? 0 : -1;
+	const int32_t hash =
+		(rxq->mcqe_format == MLX5_CQE_RESP_FORMAT_HASH) ? 0 : -1;
 	/* Mask to shuffle from extracted mini CQE to mbuf. */
 	const __m128i shuf_mask1 =
-		_mm_set_epi8(0,  1,  2,  3, /* rss, bswap32 */
-			    -1, -1,         /* skip vlan_tci */
-			     6,  7,         /* data_len, bswap16 */
-			    -1, -1,  6,  7, /* pkt_len, bswap16 */
-			    -1, -1, -1, -1  /* skip packet_type */);
+		_mm_set_epi8(-1, 1 | ftag, 0 | ftag, 4 | ftag, /* fdir.hi */
+			      0 | hash, 1 | hash, 2 | hash, 3 | hash, /* rss */
+			     -1, -1,	/* skip vlan_tci */
+			      6,  7,	/* data_len, bswap16 */
+			     -1, -1,	/* zero out 2nd half of pkt_len */
+			      6,  7	/* pkt_len, bswap16 */);
 	const __m128i shuf_mask2 =
-		_mm_set_epi8(8,  9, 10, 11, /* rss, bswap32 */
-			    -1, -1,         /* skip vlan_tci */
-			    14, 15,         /* data_len, bswap16 */
-			    -1, -1, 14, 15, /* pkt_len, bswap16 */
-			    -1, -1, -1, -1  /* skip packet_type */);
+		_mm_set_epi8(-1, 9 | ftag, 8 | ftag, 12 | ftag, /* fdir.hi */
+			      8 | hash, 9 | hash, 10 | hash, 11 | hash,/* rss */
+			     -1, -1,	/* skip vlan_tci */
+			     14, 15,	/* data_len, bswap16 */
+			     -1, -1,	/* zero out 2nd half of pkt_len */
+			     14, 15	/* pkt_len, bswap16 */);
 	/* Restore the compressed count. Must be 16 bits. */
 	const uint16_t mcqe_n = t_pkt->data_len +
 				(rxq->crc_present * RTE_ETHER_CRC_LEN);
 	const __m128i rearm =
 		_mm_loadu_si128((__m128i *)&t_pkt->rearm_data);
-	const __m128i rxdf =
-		_mm_loadu_si128((__m128i *)&t_pkt->rx_descriptor_fields1);
+	const __m128i rearm_flags =
+		_mm_set1_epi32((uint32_t)t_pkt->ol_flags);
 	const __m128i crc_adj =
-		_mm_set_epi16(0, 0, 0,
+		_mm_set_epi16(0, 0, 0, 0, 0,
 			      rxq->crc_present * RTE_ETHER_CRC_LEN,
 			      0,
-			      rxq->crc_present * RTE_ETHER_CRC_LEN,
-			      0, 0);
-	const uint32_t flow_tag = t_pkt->hash.fdir.hi;
+			      rxq->crc_present * RTE_ETHER_CRC_LEN);
+	const __m128i flow_mark_adj = _mm_set_epi32(rxq->mark * (-1), 0, 0, 0);
+	const __m128i ol_flags_mask = _mm_set1_epi32(PKT_RX_RSS_HASH * !hash |
+			PKT_RX_VLAN * !head | PKT_RX_VLAN_STRIPPED * !head |
+			PKT_RX_FDIR * !ftag | PKT_RX_FDIR_ID * !ftag);
+	__m128i ol_flags =
+		_mm_set1_epi32(rxq->rss_hash * PKT_RX_RSS_HASH * !hash);
+	__m128i rearm0, rearm1, rearm2, rearm3;
+
 #ifdef MLX5_PMD_SOFT_COUNTERS
 	const __m128i zero = _mm_setzero_si128();
 	const __m128i ones = _mm_cmpeq_epi32(zero, zero);
@@ -118,14 +132,16 @@  rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 #endif
 	/*
 	 * A. load mCQEs into a 128bit register.
-	 * B. store rearm data to mbuf.
-	 * C. combine data from mCQEs with rx_descriptor_fields1.
-	 * D. store rx_descriptor_fields1.
-	 * E. store flow tag (rte_flow mark).
+	 * B. combine data from mCQEs with rx_descriptor_fields1.
+	 * C. store rx_descriptor_fields1.
+	 * D. update and store packet type.
+	 * E. update ol_flags according to miniCQEs content.
+	 * F. store dynamic metadata.
+	 * G. store rearm data to mbuf.
 	 */
 	for (pos = 0; pos < mcqe_n; ) {
 		__m128i mcqe1, mcqe2;
-		__m128i rxdf1, rxdf2;
+		__m128i rxdf1, rxdf2, rxdf3, rxdf4;
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		__m128i byte_cnt, invalid_mask;
 #endif
@@ -136,59 +152,107 @@  rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 		/* A.1 load mCQEs into a 128bit register. */
 		mcqe1 = _mm_loadu_si128((__m128i *)&mcq[pos % 8]);
 		mcqe2 = _mm_loadu_si128((__m128i *)&mcq[pos % 8 + 2]);
-		/* B.1 store rearm data to mbuf. */
-		_mm_storeu_si128((__m128i *)&elts[pos]->rearm_data, rearm);
-		_mm_storeu_si128((__m128i *)&elts[pos + 1]->rearm_data, rearm);
-		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
+		/* B.1 combine data from mCQEs with rx_descriptor_fields1. */
 		rxdf1 = _mm_shuffle_epi8(mcqe1, shuf_mask1);
 		rxdf2 = _mm_shuffle_epi8(mcqe1, shuf_mask2);
 		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
 		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
-		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
-		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
-		/* D.1 store rx_descriptor_fields1. */
+		rxdf1 = _mm_add_epi32(rxdf1, flow_mark_adj);
+		rxdf2 = _mm_add_epi32(rxdf2, flow_mark_adj);
+		/* C.1 store rx_descriptor_fields1. */
 		_mm_storeu_si128((__m128i *)
-				  &elts[pos]->rx_descriptor_fields1,
+				  &elts[pos]->pkt_len,
 				 rxdf1);
 		_mm_storeu_si128((__m128i *)
-				  &elts[pos + 1]->rx_descriptor_fields1,
+				  &elts[pos + 1]->pkt_len,
 				 rxdf2);
-		/* B.1 store rearm data to mbuf. */
-		_mm_storeu_si128((__m128i *)&elts[pos + 2]->rearm_data, rearm);
-		_mm_storeu_si128((__m128i *)&elts[pos + 3]->rearm_data, rearm);
-		/* C.1 combine data from mCQEs with rx_descriptor_fields1. */
-		rxdf1 = _mm_shuffle_epi8(mcqe2, shuf_mask1);
-		rxdf2 = _mm_shuffle_epi8(mcqe2, shuf_mask2);
-		rxdf1 = _mm_sub_epi16(rxdf1, crc_adj);
-		rxdf2 = _mm_sub_epi16(rxdf2, crc_adj);
-		rxdf1 = _mm_blend_epi16(rxdf1, rxdf, 0x23);
-		rxdf2 = _mm_blend_epi16(rxdf2, rxdf, 0x23);
-		/* D.1 store rx_descriptor_fields1. */
+		/* B.1 combine data from mCQEs with rx_descriptor_fields1. */
+		rxdf3 = _mm_shuffle_epi8(mcqe2, shuf_mask1);
+		rxdf4 = _mm_shuffle_epi8(mcqe2, shuf_mask2);
+		rxdf3 = _mm_sub_epi16(rxdf3, crc_adj);
+		rxdf4 = _mm_sub_epi16(rxdf4, crc_adj);
+		rxdf3 = _mm_add_epi32(rxdf3, flow_mark_adj);
+		rxdf4 = _mm_add_epi32(rxdf4, flow_mark_adj);
+		/* C.1 store rx_descriptor_fields1. */
 		_mm_storeu_si128((__m128i *)
-				  &elts[pos + 2]->rx_descriptor_fields1,
-				 rxdf1);
+				  &elts[pos + 2]->pkt_len,
+				 rxdf3);
 		_mm_storeu_si128((__m128i *)
-				  &elts[pos + 3]->rx_descriptor_fields1,
-				 rxdf2);
+				  &elts[pos + 3]->pkt_len,
+				 rxdf4);
 #ifdef MLX5_PMD_SOFT_COUNTERS
 		invalid_mask = _mm_set_epi64x(0,
 					      (mcqe_n - pos) *
 					      sizeof(uint16_t) * 8);
 		invalid_mask = _mm_sll_epi64(ones, invalid_mask);
-		mcqe1 = _mm_srli_si128(mcqe1, 4);
-		byte_cnt = _mm_blend_epi16(mcqe1, mcqe2, 0xcc);
+		byte_cnt = _mm_blend_epi16(_mm_srli_si128(mcqe1, 4),
+					   mcqe2, 0xcc);
 		byte_cnt = _mm_shuffle_epi8(byte_cnt, len_shuf_mask);
 		byte_cnt = _mm_andnot_si128(invalid_mask, byte_cnt);
 		byte_cnt = _mm_hadd_epi16(byte_cnt, zero);
 		rcvd_byte += _mm_cvtsi128_si64(_mm_hadd_epi16(byte_cnt, zero));
 #endif
-		if (rxq->mark) {
-			/* E.1 store flow tag (rte_flow mark). */
-			elts[pos]->hash.fdir.hi = flow_tag;
-			elts[pos + 1]->hash.fdir.hi = flow_tag;
-			elts[pos + 2]->hash.fdir.hi = flow_tag;
-			elts[pos + 3]->hash.fdir.hi = flow_tag;
+		/* D.1 update and store packet type. */
+		if (head == -1) {
+			const uint32_t packet_type = t_pkt->packet_type;
+
+			elts[pos]->packet_type = packet_type;
+			elts[pos + 1]->packet_type = packet_type;
+			elts[pos + 2]->packet_type = packet_type;
+			elts[pos + 3]->packet_type = packet_type;
+		} else {
+			const uint8_t pkt_info = (cq->pkt_info & 0x3) << 6;
+			const uint8_t pt_idx0 = pkt_info |
+						_mm_extract_epi8(mcqe1, 0) >> 2;
+			const uint8_t pt_idx1 = pkt_info |
+						_mm_extract_epi8(mcqe1, 8) >> 2;
+			const uint8_t pt_idx2 = pkt_info |
+						_mm_extract_epi8(mcqe2, 0) >> 2;
+			const uint8_t pt_idx3 = pkt_info |
+						_mm_extract_epi8(mcqe2, 8) >> 2;
+			const __m128i vlan_mask =
+				_mm_set_epi32(_mm_extract_epi8(mcqe1, 0) & 0x1,
+					      _mm_extract_epi8(mcqe1, 8) & 0x1,
+					      _mm_extract_epi8(mcqe2, 0) & 0x1,
+					      _mm_extract_epi8(mcqe2, 8) & 0x1);
+
+			elts[pos]->packet_type = mlx5_ptype_table[pt_idx0] |
+				!!(pt_idx0 & (1 << 6)) * rxq->tunnel;
+			elts[pos + 1]->packet_type = mlx5_ptype_table[pt_idx1] |
+				!!(pt_idx1 & (1 << 6)) * rxq->tunnel;
+			elts[pos + 2]->packet_type = mlx5_ptype_table[pt_idx2] |
+				!!(pt_idx2 & (1 << 6)) * rxq->tunnel;
+			elts[pos + 3]->packet_type = mlx5_ptype_table[pt_idx3] |
+				!!(pt_idx3 & (1 << 6)) * rxq->tunnel;
+			ol_flags = _mm_or_si128(ol_flags, vlan_mask);
 		}
+		/* E.1 update ol_flags according to miniCQEs content. */
+		if (rxq->mark && ftag == 0) {
+			/* Extract flow_tag field. */
+			const __m128i ftag0 = _mm_unpackhi_epi32(rxdf1, rxdf2);
+			const __m128i ftag1 = _mm_unpackhi_epi32(rxdf3, rxdf4);
+			const __m128i ftag = _mm_unpackhi_epi64(ftag0, ftag1);
+			const __m128i ft_mask = _mm_set1_epi32(0xffffff00);
+			const __m128i fdir_flags = _mm_set1_epi32(PKT_RX_FDIR);
+			__m128i fdir_id_flags = _mm_set1_epi32(PKT_RX_FDIR_ID);
+			__m128i flow_tag, invalid_mask;
+
+			flow_tag = _mm_and_si128(ftag, ft_mask);
+			/* Check if flow tag is non-zero - set PKT_RX_FDIR. */
+			invalid_mask = _mm_cmpeq_epi32(flow_tag, zero);
+			ol_flags = _mm_or_si128(ol_flags,
+						_mm_andnot_si128(invalid_mask,
+								fdir_flags));
+			/* Mask out invalid entries. */
+			fdir_id_flags = _mm_andnot_si128(invalid_mask,
+							 fdir_id_flags);
+			/* Check if flow tag MLX5_FLOW_MARK_DEFAULT. */
+			ol_flags = _mm_or_si128(ol_flags,
+				_mm_andnot_si128(_mm_cmpeq_epi32(flow_tag,
+								 ft_mask),
+				fdir_id_flags));
+		}
+		/* F. store dynamic metadata. */
 		if (rxq->dynf_meta) {
 			int32_t offs = rxq->flow_meta_offset;
 			const uint32_t meta =
@@ -208,6 +272,21 @@  rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq,
 							uint32_t *) = meta;
 			}
 		}
+		/* Merge rearm and ol_flags. */
+		ol_flags = _mm_or_si128(ol_flags,
+				_mm_andnot_si128(ol_flags_mask, rearm_flags));
+		rearm0 = _mm_blend_epi16(rearm,
+					 _mm_slli_si128(ol_flags, 8), 0x30);
+		rearm1 = _mm_blend_epi16(rearm,
+					 _mm_slli_si128(ol_flags, 4), 0x30);
+		rearm2 = _mm_blend_epi16(rearm, ol_flags, 0x30);
+		rearm3 = _mm_blend_epi16(rearm,
+					 _mm_srli_si128(ol_flags, 4), 0x30);
+		/* G.1 store rearm data to mbuf. */
+		_mm_storeu_si128((__m128i *)&elts[pos]->rearm_data, rearm0);
+		_mm_storeu_si128((__m128i *)&elts[pos + 1]->rearm_data, rearm1);
+		_mm_storeu_si128((__m128i *)&elts[pos + 2]->rearm_data, rearm2);
+		_mm_storeu_si128((__m128i *)&elts[pos + 3]->rearm_data, rearm3);
 		pos += MLX5_VPMD_DESCS_PER_LOOP;
 		/* Move to next CQE and invalidate consumed CQEs. */
 		if (!(pos & 0x7) && pos < mcqe_n) {
@@ -251,12 +330,9 @@  rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
 					  rxq->hw_timestamp * PKT_RX_TIMESTAMP);
 	__m128i cv_flags;
 	const __m128i zero = _mm_setzero_si128();
-	const __m128i ptype_mask =
-		_mm_set_epi32(0xfd06, 0xfd06, 0xfd06, 0xfd06);
-	const __m128i ptype_ol_mask =
-		_mm_set_epi32(0x106, 0x106, 0x106, 0x106);
-	const __m128i pinfo_mask =
-		_mm_set_epi32(0x3, 0x3, 0x3, 0x3);
+	const __m128i ptype_mask = _mm_set1_epi32(0xfd06);
+	const __m128i ptype_ol_mask = _mm_set1_epi32(0x106);
+	const __m128i pinfo_mask = _mm_set1_epi32(0x3);
 	const __m128i cv_flag_sel =
 		_mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
 			     (uint8_t)((PKT_RX_IP_CKSUM_GOOD |
@@ -268,13 +344,7 @@  rxq_cq_to_ptype_oflags_v(struct mlx5_rxq_data *rxq, __m128i cqes[4],
 			     (uint8_t)(PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED),
 			     0);
 	const __m128i cv_mask =
-		_mm_set_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
-			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
-			      PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
+		_mm_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD |
 			      PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED);
 	const __m128i mbuf_init =
 		_mm_load_si128((__m128i *)&rxq->mbuf_initializer);