[v5,3/8] net/ice: support vector SSE in RX

Message ID 1553223516-118453-4-git-send-email-wenzhuo.lu@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Qi Zhang
Series Support vector instructions on ICE

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation success Compilation OK

Commit Message

Wenzhuo Lu March 22, 2019, 2:58 a.m. UTC
  Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 doc/guides/nics/features/ice_vec.ini  |  33 +++
 drivers/net/ice/Makefile              |   3 +
 drivers/net/ice/ice_ethdev.c          |   2 -
 drivers/net/ice/ice_ethdev.h          |   2 +
 drivers/net/ice/ice_rxtx.c            |  27 +-
 drivers/net/ice/ice_rxtx.h            |  21 +-
 drivers/net/ice/ice_rxtx_vec_common.h | 155 +++++++++++
 drivers/net/ice/ice_rxtx_vec_sse.c    | 496 ++++++++++++++++++++++++++++++++++
 drivers/net/ice/meson.build           |   4 +
 9 files changed, 737 insertions(+), 6 deletions(-)
 create mode 100644 doc/guides/nics/features/ice_vec.ini
 create mode 100644 drivers/net/ice/ice_rxtx_vec_common.h
 create mode 100644 drivers/net/ice/ice_rxtx_vec_sse.c
  

Comments

Maxime Coquelin March 22, 2019, 9:42 a.m. UTC | #1
On 3/22/19 3:58 AM, Wenzhuo Lu wrote:
> Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
> ---
>   doc/guides/nics/features/ice_vec.ini  |  33 +++
>   drivers/net/ice/Makefile              |   3 +
>   drivers/net/ice/ice_ethdev.c          |   2 -
>   drivers/net/ice/ice_ethdev.h          |   2 +
>   drivers/net/ice/ice_rxtx.c            |  27 +-
>   drivers/net/ice/ice_rxtx.h            |  21 +-
>   drivers/net/ice/ice_rxtx_vec_common.h | 155 +++++++++++
>   drivers/net/ice/ice_rxtx_vec_sse.c    | 496 ++++++++++++++++++++++++++++++++++
>   drivers/net/ice/meson.build           |   4 +
>   9 files changed, 737 insertions(+), 6 deletions(-)
>   create mode 100644 doc/guides/nics/features/ice_vec.ini
>   create mode 100644 drivers/net/ice/ice_rxtx_vec_common.h
>   create mode 100644 drivers/net/ice/ice_rxtx_vec_sse.c
> 
> diff --git a/doc/guides/nics/features/ice_vec.ini b/doc/guides/nics/features/ice_vec.ini
> new file mode 100644
> index 0000000..1a19788
> --- /dev/null
> +++ b/doc/guides/nics/features/ice_vec.ini
> @@ -0,0 +1,33 @@
> +;
> +; Supported features of the 'ice_vec' network poll mode driver.
> +;
> +; Refer to default.ini for the full list of available PMD features.
> +;
> +[Features]
> +Speed capabilities   = Y
> +Link status          = Y
> +Link status event    = Y
> +Rx interrupt         = Y
> +Queue start/stop     = Y
> +MTU update           = Y
> +Jumbo frame          = Y
> +Scattered Rx         = Y
> +Promiscuous mode     = Y
> +Allmulticast mode    = Y
> +Unicast MAC filter   = Y
> +Multicast MAC filter = Y
> +RSS hash             = Y
> +RSS key update       = Y
> +RSS reta update      = Y
> +VLAN filter          = Y
> +Packet type parsing  = Y
> +Rx descriptor status = Y
> +Basic stats          = Y
> +Extended stats       = Y
> +FW version           = Y
> +Module EEPROM dump   = Y
> +BSD nic_uio          = Y
> +Linux UIO            = Y
> +Linux VFIO           = Y
> +x86-32               = Y
> +x86-64               = Y
> diff --git a/drivers/net/ice/Makefile b/drivers/net/ice/Makefile
> index 61846ca..92594bb 100644
> --- a/drivers/net/ice/Makefile
> +++ b/drivers/net/ice/Makefile
> @@ -54,5 +54,8 @@ SRCS-$(CONFIG_RTE_LIBRTE_ICE_PMD) += ice_flow.c
>   
>   SRCS-$(CONFIG_RTE_LIBRTE_ICE_PMD) += ice_ethdev.c
>   SRCS-$(CONFIG_RTE_LIBRTE_ICE_PMD) += ice_rxtx.c
> +ifeq ($(CONFIG_RTE_ARCH_X86), y)
> +SRCS-$(CONFIG_RTE_LIBRTE_ICE_PMD) += ice_rxtx_vec_sse.c
> +endif
>   
>   include $(RTE_SDK)/mk/rte.lib.mk
> diff --git a/drivers/net/ice/ice_ethdev.c b/drivers/net/ice/ice_ethdev.c
> index b804be1..8e7c7db 100644
> --- a/drivers/net/ice/ice_ethdev.c
> +++ b/drivers/net/ice/ice_ethdev.c
> @@ -2,8 +2,6 @@
>    * Copyright(c) 2018 Intel Corporation
>    */
>   
> -#include <rte_ethdev_pci.h>
> -
>   #include "base/ice_sched.h"
>   #include "ice_ethdev.h"
>   #include "ice_rxtx.h"
> diff --git a/drivers/net/ice/ice_ethdev.h b/drivers/net/ice/ice_ethdev.h
> index 3cefa5b..151a09e 100644
> --- a/drivers/net/ice/ice_ethdev.h
> +++ b/drivers/net/ice/ice_ethdev.h
> @@ -7,6 +7,8 @@
>   
>   #include <rte_kvargs.h>
>   
> +#include <rte_ethdev_pci.h>
> +
>   #include "base/ice_common.h"
>   #include "base/ice_adminq_cmd.h"
>   
> diff --git a/drivers/net/ice/ice_rxtx.c b/drivers/net/ice/ice_rxtx.c
> index d540ed1..ebb1cab 100644
> --- a/drivers/net/ice/ice_rxtx.c
> +++ b/drivers/net/ice/ice_rxtx.c
> @@ -7,8 +7,6 @@
>   
>   #include "ice_rxtx.h"
>   
> -#define ICE_TD_CMD ICE_TX_DESC_CMD_EOP
> -
>   #define ICE_TX_CKSUM_OFFLOAD_MASK (		 \
>   		PKT_TX_IP_CKSUM |		 \
>   		PKT_TX_L4_MASK |		 \
> @@ -319,6 +317,9 @@
>   	rxq->nb_rx_hold = 0;
>   	rxq->pkt_first_seg = NULL;
>   	rxq->pkt_last_seg = NULL;
> +
> +	rxq->rxrearm_start = 0;
> +	rxq->rxrearm_nb = 0;
>   }
>   
>   int
> @@ -1490,6 +1491,12 @@
>   #endif
>   	    dev->rx_pkt_burst == ice_recv_scattered_pkts)
>   		return ptypes;
> +
> +#ifdef RTE_ARCH_X86
> +	if (dev->rx_pkt_burst == ice_recv_pkts_vec)
> +		return ptypes;
> +#endif
> +
>   	return NULL;
>   }
>   
> @@ -2225,6 +2232,22 @@ void __attribute__((cold))
>   	PMD_INIT_FUNC_TRACE();
>   	struct ice_adapter *ad =
>   		ICE_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
> +#ifdef RTE_ARCH_X86
> +	struct ice_rx_queue *rxq;
> +	int i;
> +
> +	if (!ice_rx_vec_dev_check(dev)) {
> +		for (i = 0; i < dev->data->nb_rx_queues; i++) {
> +			rxq = dev->data->rx_queues[i];
> +			(void)ice_rxq_vec_setup(rxq);
> +		}
> +		PMD_DRV_LOG(DEBUG, "Using Vector Rx (port %d).",
> +			    dev->data->port_id);
> +		dev->rx_pkt_burst = ice_recv_pkts_vec;
> +
> +		return;
> +	}
> +#endif
>   
>   	if (dev->data->scattered_rx) {
>   		/* Set the non-LRO scattered function */
> diff --git a/drivers/net/ice/ice_rxtx.h b/drivers/net/ice/ice_rxtx.h
> index 78b4928..656ca0d 100644
> --- a/drivers/net/ice/ice_rxtx.h
> +++ b/drivers/net/ice/ice_rxtx.h
> @@ -27,6 +27,15 @@
>   
>   #define ICE_SUPPORT_CHAIN_NUM 5
>   
> +#define ICE_TD_CMD                      ICE_TX_DESC_CMD_EOP
> +
> +#define ICE_VPMD_RX_BURST           32
> +#define ICE_VPMD_TX_BURST           32
> +#define ICE_RXQ_REARM_THRESH        32
> +#define ICE_MAX_RX_BURST            ICE_RXQ_REARM_THRESH
> +#define ICE_TX_MAX_FREE_BUF_SZ      64
> +#define ICE_DESCS_PER_LOOP          4
> +
>   typedef void (*ice_rx_release_mbufs_t)(struct ice_rx_queue *rxq);
>   typedef void (*ice_tx_release_mbufs_t)(struct ice_tx_queue *txq);
>   
> @@ -45,13 +54,16 @@ struct ice_rx_queue {
>   	uint16_t nb_rx_hold; /* number of held free RX desc */
>   	struct rte_mbuf *pkt_first_seg; /**< first segment of current packet */
>   	struct rte_mbuf *pkt_last_seg; /**< last segment of current packet */
> -#ifdef RTE_LIBRTE_ICE_RX_ALLOW_BULK_ALLOC
>   	uint16_t rx_nb_avail; /**< number of staged packets ready */
>   	uint16_t rx_next_avail; /**< index of next staged packets */
>   	uint16_t rx_free_trigger; /**< triggers rx buffer allocation */
>   	struct rte_mbuf fake_mbuf; /**< dummy mbuf */
>   	struct rte_mbuf *rx_stage[ICE_RX_MAX_BURST * 2];
> -#endif
> +
> +	uint16_t rxrearm_nb;	/**< number of remaining to be re-armed */
> +	uint16_t rxrearm_start;	/**< the idx we start the re-arming from */
> +	uint64_t mbuf_initializer; /**< value to init mbufs */
> +
>   	uint8_t port_id; /* device port ID */
>   	uint8_t crc_len; /* 0 if CRC stripped, 4 otherwise */
>   	uint16_t queue_id; /* RX queue index */
> @@ -156,4 +168,9 @@ void ice_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
>   int ice_tx_descriptor_status(void *tx_queue, uint16_t offset);
>   void ice_set_default_ptype_table(struct rte_eth_dev *dev);
>   const uint32_t *ice_dev_supported_ptypes_get(struct rte_eth_dev *dev);
> +
> +int ice_rx_vec_dev_check(struct rte_eth_dev *dev);
> +int ice_rxq_vec_setup(struct ice_rx_queue *rxq);
> +uint16_t ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> +			   uint16_t nb_pkts);
>   #endif /* _ICE_RXTX_H_ */
> diff --git a/drivers/net/ice/ice_rxtx_vec_common.h b/drivers/net/ice/ice_rxtx_vec_common.h
> new file mode 100644
> index 0000000..cfef91b
> --- /dev/null
> +++ b/drivers/net/ice/ice_rxtx_vec_common.h
> @@ -0,0 +1,155 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2019 Intel Corporation
> + */
> +
> +#ifndef _ICE_RXTX_VEC_COMMON_H_
> +#define _ICE_RXTX_VEC_COMMON_H_
> +
> +#include "ice_rxtx.h"
> +
> +static inline uint16_t
> +reassemble_packets(struct ice_rx_queue *rxq, struct rte_mbuf **rx_bufs,
As this is in the header file, I think it could be better to prefix it
with 'ice_'. Or maybe with 'ice_rx_' as it seems to be rx-only.
> +		   uint16_t nb_bufs, uint8_t *split_flags)
> +{
> +	struct rte_mbuf *pkts[ICE_VPMD_RX_BURST] = {0}; /*finished pkts*/
> +	struct rte_mbuf *start = rxq->pkt_first_seg;
> +	struct rte_mbuf *end =  rxq->pkt_last_seg;
> +	unsigned int pkt_idx, buf_idx;
> +
> +	for (buf_idx = 0, pkt_idx = 0; buf_idx < nb_bufs; buf_idx++) {
> +		if (end) {
> +			/* processing a split packet */
> +			end->next = rx_bufs[buf_idx];
> +			rx_bufs[buf_idx]->data_len += rxq->crc_len;
> +
> +			start->nb_segs++;
> +			start->pkt_len += rx_bufs[buf_idx]->data_len;
> +			end = end->next;
> +
> +			if (!split_flags[buf_idx]) {
> +				/* it's the last packet of the set */
> +				start->hash = end->hash;
> +				start->ol_flags = end->ol_flags;
> +				/* we need to strip crc for the whole packet */
> +				start->pkt_len -= rxq->crc_len;
> +				if (end->data_len > rxq->crc_len) {
> +					end->data_len -= rxq->crc_len;
> +				} else {
> +					/* free up last mbuf */
> +					struct rte_mbuf *secondlast = start;
> +
> +					start->nb_segs--;
> +					while (secondlast->next != end)
> +						secondlast = secondlast->next;
> +					secondlast->data_len -= (rxq->crc_len -
> +							end->data_len);
> +					secondlast->next = NULL;
> +					rte_pktmbuf_free_seg(end);
> +				}
> +				pkts[pkt_idx++] = start;
> +				start = NULL;
> +				end = NULL;
> +			}
> +		} else {
> +			/* not processing a split packet */
> +			if (!split_flags[buf_idx]) {
> +				/* not a split packet, save and skip */
> +				pkts[pkt_idx++] = rx_bufs[buf_idx];
> +				continue;
> +			}
> +			start = rx_bufs[buf_idx];
> +			end = start;
> +			rx_bufs[buf_idx]->data_len += rxq->crc_len;
> +			rx_bufs[buf_idx]->pkt_len += rxq->crc_len;
> +		}
> +	}
> +
> +	/* save the partial packet for next time */
> +	rxq->pkt_first_seg = start;
> +	rxq->pkt_last_seg = end;
> +	rte_memcpy(rx_bufs, pkts, pkt_idx * (sizeof(*pkts)));
> +	return pkt_idx;
> +}
> +
> +static inline void
> +_ice_rx_queue_release_mbufs_vec(struct ice_rx_queue *rxq)
> +{
> +	const unsigned int mask = rxq->nb_rx_desc - 1;
> +	unsigned int i;
> +
> +	if (!rxq->sw_ring || rxq->rxrearm_nb >= rxq->nb_rx_desc)
> +		return;

Maybe not a big deal, but I understand that !rxq->sw_ring is not the
common case, more an error. If so, the if condition could be split in
two, with the first one tagged as unlikely.

Looking at the Tx patch, you should also ensure that rxq != NULL and
print a debug/error message, to be consistent.

> +
> +	/* free all mbufs that are valid in the ring */
> +	if (rxq->rxrearm_nb == 0) {
> +		for (i = 0; i < rxq->nb_rx_desc; i++) {
> +			if (rxq->sw_ring[i].mbuf)
> +				rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
> +		}
> +	} else {
> +		for (i = rxq->rx_tail;
> +		     i != rxq->rxrearm_start;
> +		     i = (i + 1) & mask) {
> +			if (rxq->sw_ring[i].mbuf)
> +				rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
> +		}
> +	}
> +
> +	rxq->rxrearm_nb = rxq->nb_rx_desc;
> +
> +	/* set all entries to NULL */
> +	memset(rxq->sw_ring, 0, sizeof(rxq->sw_ring[0]) * rxq->nb_rx_desc);
> +}

...

> diff --git a/drivers/net/ice/ice_rxtx_vec_sse.c b/drivers/net/ice/ice_rxtx_vec_sse.c
> new file mode 100644
> index 0000000..f6fe9ef
> --- /dev/null
> +++ b/drivers/net/ice/ice_rxtx_vec_sse.c

...

> +
> +/**
> + * Notice:
> + * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> + * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
> + *   numbers of DD bits
> + */
> +uint16_t
> +ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> +		  uint16_t nb_pkts)
> +{
> +	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);

Same as below comment.

> +}
> +
> +static void __attribute__((cold))
> +ice_rx_queue_release_mbufs_vec(struct ice_rx_queue *rxq)
> +{
> +	_ice_rx_queue_release_mbufs_vec(rxq);

What is the point of having _ice_rx_queue_release_mbufs_vec as it is
only called once here?

> +}
> +
> +int __attribute__((cold))
> +ice_rxq_vec_setup(struct ice_rx_queue *rxq)
> +{
> +	if (!rxq)
> +		return -1;
> +
> +	rxq->rx_rel_mbufs = ice_rx_queue_release_mbufs_vec;
> +	return ice_rxq_vec_setup_default(rxq);
> +}
> +
> +int __attribute__((cold))
> +ice_rx_vec_dev_check(struct rte_eth_dev *dev)
> +{
> +	return ice_rx_vec_dev_check_default(dev);
> +}
> diff --git a/drivers/net/ice/meson.build b/drivers/net/ice/meson.build
> index 857dc0e..469264d 100644
> --- a/drivers/net/ice/meson.build
> +++ b/drivers/net/ice/meson.build
> @@ -11,3 +11,7 @@ sources = files(
>   
>   deps += ['hash']
>   includes += include_directories('base')
> +
> +if arch_subdir == 'x86'
> +	sources += files('ice_rxtx_vec_sse.c')
> +endif
>
  
Wenzhuo Lu March 25, 2019, 1:56 a.m. UTC | #2
Hi Maxime,

> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Friday, March 22, 2019 5:43 PM
> To: Lu, Wenzhuo <wenzhuo.lu@intel.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v5 3/8] net/ice: support vector SSE in RX
> 
> > +
> > +static inline uint16_t
> > +reassemble_packets(struct ice_rx_queue *rxq, struct rte_mbuf
> > +**rx_bufs,
> As this is in the header file, I think it could be better to prefix it with 'ice_'. Or
> maybe with 'ice_rx_' as it seems to be rx-only.
Thanks for the comment. I'll add the prefix.
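
For example, the helper's prototype would then read something like this
(a sketch only; the final name is still open):

static inline uint16_t
ice_rx_reassemble_packets(struct ice_rx_queue *rxq, struct rte_mbuf **rx_bufs,
			  uint16_t nb_bufs, uint8_t *split_flags);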

> > +static inline void
> > +_ice_rx_queue_release_mbufs_vec(struct ice_rx_queue *rxq) {
> > +	const unsigned int mask = rxq->nb_rx_desc - 1;
> > +	unsigned int i;
> > +
> > +	if (!rxq->sw_ring || rxq->rxrearm_nb >= rxq->nb_rx_desc)
> > +		return;
> 
> Maybe not a big deal, but I understand that !rxq->sw_ring is not the
> common case, more an error. If so, the if condition could be split in
> two, with the first one tagged as unlikely.
> 
> Looking at the Tx patch, you should also ensure that rxq != NULL and
> print a debug/error message, to be consistent.
Thanks for the suggestion. I'll change it.
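
For reference, the split could look like this (a sketch of the suggested
change only; the debug message text is illustrative, not from the patch):

	/* error case first, hinted as unlikely */
	if (unlikely(!rxq || !rxq->sw_ring)) {
		PMD_DRV_LOG(DEBUG, "Pointer to rxq or sw_ring is NULL");
		return;
	}

	/* common case: nothing armed, nothing to free */
	if (rxq->rxrearm_nb >= rxq->nb_rx_desc)
		return;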

> > +/**
> > + * Notice:
> > + * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
> > + * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
> > + *   numbers of DD bits
> > + */
> > +uint16_t
> > +ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> > +		  uint16_t nb_pkts)
> > +{
> > +	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
> 
> Same as below comment.
_recv_raw_pkts_vec is used by both the normal Rx and the scattered Rx paths. It will be called again later, in patch 4. That is why we made it an inline function.
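
For context, a sketch of how the scattered path can reuse the same inline
function (modeled on the i40e vector PMD; the wrapper name and the exact
reassembly details are assumptions until patch 4 lands):

static uint16_t
ice_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
			     uint16_t nb_pkts)
{
	struct ice_rx_queue *rxq = rx_queue;
	uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
	const uint64_t *split_fl64 = (const uint64_t *)split_flags;
	uint16_t nb_bufs;
	unsigned int i = 0;

	/* same inline fast path, but collect the per-packet EOP bits */
	nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts, split_flags);

	/* happy path: no packet was split */
	if (!rxq->pkt_first_seg &&
	    !split_fl64[0] && !split_fl64[1] &&
	    !split_fl64[2] && !split_fl64[3])
		return nb_bufs;

	/* otherwise stitch the split segments back together */
	if (!rxq->pkt_first_seg) {
		/* find the first split flag, and only reassemble from there */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
	}
	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
				      &split_flags[i]);
}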

> 
> > +}
> > +
> > +static void __attribute__((cold))
> > +ice_rx_queue_release_mbufs_vec(struct ice_rx_queue *rxq) {
> > +	_ice_rx_queue_release_mbufs_vec(rxq);
> 
> What is the point of having _ice_rx_queue_release_mbufs_vec as it is only
> called once here?
In our experience, it can be reused when vector support is implemented for other platforms. That is why we put it in the common header, ice_rxtx_vec_common.h.
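
For instance, a future AVX2 path could wrap the very same helper (the
function name below is hypothetical; a sketch of the intended reuse, not
code from this series):

static void __attribute__((cold))
ice_rx_queue_release_mbufs_vec_avx2(struct ice_rx_queue *rxq)
{
	/* the shared release logic lives in ice_rxtx_vec_common.h */
	_ice_rx_queue_release_mbufs_vec(rxq);
}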
  

Patch

diff --git a/doc/guides/nics/features/ice_vec.ini b/doc/guides/nics/features/ice_vec.ini
new file mode 100644
index 0000000..1a19788
--- /dev/null
+++ b/doc/guides/nics/features/ice_vec.ini
@@ -0,0 +1,33 @@ 
+;
+; Supported features of the 'ice_vec' network poll mode driver.
+;
+; Refer to default.ini for the full list of available PMD features.
+;
+[Features]
+Speed capabilities   = Y
+Link status          = Y
+Link status event    = Y
+Rx interrupt         = Y
+Queue start/stop     = Y
+MTU update           = Y
+Jumbo frame          = Y
+Scattered Rx         = Y
+Promiscuous mode     = Y
+Allmulticast mode    = Y
+Unicast MAC filter   = Y
+Multicast MAC filter = Y
+RSS hash             = Y
+RSS key update       = Y
+RSS reta update      = Y
+VLAN filter          = Y
+Packet type parsing  = Y
+Rx descriptor status = Y
+Basic stats          = Y
+Extended stats       = Y
+FW version           = Y
+Module EEPROM dump   = Y
+BSD nic_uio          = Y
+Linux UIO            = Y
+Linux VFIO           = Y
+x86-32               = Y
+x86-64               = Y
diff --git a/drivers/net/ice/Makefile b/drivers/net/ice/Makefile
index 61846ca..92594bb 100644
--- a/drivers/net/ice/Makefile
+++ b/drivers/net/ice/Makefile
@@ -54,5 +54,8 @@  SRCS-$(CONFIG_RTE_LIBRTE_ICE_PMD) += ice_flow.c
 
 SRCS-$(CONFIG_RTE_LIBRTE_ICE_PMD) += ice_ethdev.c
 SRCS-$(CONFIG_RTE_LIBRTE_ICE_PMD) += ice_rxtx.c
+ifeq ($(CONFIG_RTE_ARCH_X86), y)
+SRCS-$(CONFIG_RTE_LIBRTE_ICE_PMD) += ice_rxtx_vec_sse.c
+endif
 
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/drivers/net/ice/ice_ethdev.c b/drivers/net/ice/ice_ethdev.c
index b804be1..8e7c7db 100644
--- a/drivers/net/ice/ice_ethdev.c
+++ b/drivers/net/ice/ice_ethdev.c
@@ -2,8 +2,6 @@ 
  * Copyright(c) 2018 Intel Corporation
  */
 
-#include <rte_ethdev_pci.h>
-
 #include "base/ice_sched.h"
 #include "ice_ethdev.h"
 #include "ice_rxtx.h"
diff --git a/drivers/net/ice/ice_ethdev.h b/drivers/net/ice/ice_ethdev.h
index 3cefa5b..151a09e 100644
--- a/drivers/net/ice/ice_ethdev.h
+++ b/drivers/net/ice/ice_ethdev.h
@@ -7,6 +7,8 @@ 
 
 #include <rte_kvargs.h>
 
+#include <rte_ethdev_pci.h>
+
 #include "base/ice_common.h"
 #include "base/ice_adminq_cmd.h"
 
diff --git a/drivers/net/ice/ice_rxtx.c b/drivers/net/ice/ice_rxtx.c
index d540ed1..ebb1cab 100644
--- a/drivers/net/ice/ice_rxtx.c
+++ b/drivers/net/ice/ice_rxtx.c
@@ -7,8 +7,6 @@ 
 
 #include "ice_rxtx.h"
 
-#define ICE_TD_CMD ICE_TX_DESC_CMD_EOP
-
 #define ICE_TX_CKSUM_OFFLOAD_MASK (		 \
 		PKT_TX_IP_CKSUM |		 \
 		PKT_TX_L4_MASK |		 \
@@ -319,6 +317,9 @@ 
 	rxq->nb_rx_hold = 0;
 	rxq->pkt_first_seg = NULL;
 	rxq->pkt_last_seg = NULL;
+
+	rxq->rxrearm_start = 0;
+	rxq->rxrearm_nb = 0;
 }
 
 int
@@ -1490,6 +1491,12 @@ 
 #endif
 	    dev->rx_pkt_burst == ice_recv_scattered_pkts)
 		return ptypes;
+
+#ifdef RTE_ARCH_X86
+	if (dev->rx_pkt_burst == ice_recv_pkts_vec)
+		return ptypes;
+#endif
+
 	return NULL;
 }
 
@@ -2225,6 +2232,22 @@  void __attribute__((cold))
 	PMD_INIT_FUNC_TRACE();
 	struct ice_adapter *ad =
 		ICE_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
+#ifdef RTE_ARCH_X86
+	struct ice_rx_queue *rxq;
+	int i;
+
+	if (!ice_rx_vec_dev_check(dev)) {
+		for (i = 0; i < dev->data->nb_rx_queues; i++) {
+			rxq = dev->data->rx_queues[i];
+			(void)ice_rxq_vec_setup(rxq);
+		}
+		PMD_DRV_LOG(DEBUG, "Using Vector Rx (port %d).",
+			    dev->data->port_id);
+		dev->rx_pkt_burst = ice_recv_pkts_vec;
+
+		return;
+	}
+#endif
 
 	if (dev->data->scattered_rx) {
 		/* Set the non-LRO scattered function */
diff --git a/drivers/net/ice/ice_rxtx.h b/drivers/net/ice/ice_rxtx.h
index 78b4928..656ca0d 100644
--- a/drivers/net/ice/ice_rxtx.h
+++ b/drivers/net/ice/ice_rxtx.h
@@ -27,6 +27,15 @@ 
 
 #define ICE_SUPPORT_CHAIN_NUM 5
 
+#define ICE_TD_CMD                      ICE_TX_DESC_CMD_EOP
+
+#define ICE_VPMD_RX_BURST           32
+#define ICE_VPMD_TX_BURST           32
+#define ICE_RXQ_REARM_THRESH        32
+#define ICE_MAX_RX_BURST            ICE_RXQ_REARM_THRESH
+#define ICE_TX_MAX_FREE_BUF_SZ      64
+#define ICE_DESCS_PER_LOOP          4
+
 typedef void (*ice_rx_release_mbufs_t)(struct ice_rx_queue *rxq);
 typedef void (*ice_tx_release_mbufs_t)(struct ice_tx_queue *txq);
 
@@ -45,13 +54,16 @@  struct ice_rx_queue {
 	uint16_t nb_rx_hold; /* number of held free RX desc */
 	struct rte_mbuf *pkt_first_seg; /**< first segment of current packet */
 	struct rte_mbuf *pkt_last_seg; /**< last segment of current packet */
-#ifdef RTE_LIBRTE_ICE_RX_ALLOW_BULK_ALLOC
 	uint16_t rx_nb_avail; /**< number of staged packets ready */
 	uint16_t rx_next_avail; /**< index of next staged packets */
 	uint16_t rx_free_trigger; /**< triggers rx buffer allocation */
 	struct rte_mbuf fake_mbuf; /**< dummy mbuf */
 	struct rte_mbuf *rx_stage[ICE_RX_MAX_BURST * 2];
-#endif
+
+	uint16_t rxrearm_nb;	/**< number of remaining to be re-armed */
+	uint16_t rxrearm_start;	/**< the idx we start the re-arming from */
+	uint64_t mbuf_initializer; /**< value to init mbufs */
+
 	uint8_t port_id; /* device port ID */
 	uint8_t crc_len; /* 0 if CRC stripped, 4 otherwise */
 	uint16_t queue_id; /* RX queue index */
@@ -156,4 +168,9 @@  void ice_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
 int ice_tx_descriptor_status(void *tx_queue, uint16_t offset);
 void ice_set_default_ptype_table(struct rte_eth_dev *dev);
 const uint32_t *ice_dev_supported_ptypes_get(struct rte_eth_dev *dev);
+
+int ice_rx_vec_dev_check(struct rte_eth_dev *dev);
+int ice_rxq_vec_setup(struct ice_rx_queue *rxq);
+uint16_t ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+			   uint16_t nb_pkts);
 #endif /* _ICE_RXTX_H_ */
diff --git a/drivers/net/ice/ice_rxtx_vec_common.h b/drivers/net/ice/ice_rxtx_vec_common.h
new file mode 100644
index 0000000..cfef91b
--- /dev/null
+++ b/drivers/net/ice/ice_rxtx_vec_common.h
@@ -0,0 +1,155 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Intel Corporation
+ */
+
+#ifndef _ICE_RXTX_VEC_COMMON_H_
+#define _ICE_RXTX_VEC_COMMON_H_
+
+#include "ice_rxtx.h"
+
+static inline uint16_t
+reassemble_packets(struct ice_rx_queue *rxq, struct rte_mbuf **rx_bufs,
+		   uint16_t nb_bufs, uint8_t *split_flags)
+{
+	struct rte_mbuf *pkts[ICE_VPMD_RX_BURST] = {0}; /*finished pkts*/
+	struct rte_mbuf *start = rxq->pkt_first_seg;
+	struct rte_mbuf *end =  rxq->pkt_last_seg;
+	unsigned int pkt_idx, buf_idx;
+
+	for (buf_idx = 0, pkt_idx = 0; buf_idx < nb_bufs; buf_idx++) {
+		if (end) {
+			/* processing a split packet */
+			end->next = rx_bufs[buf_idx];
+			rx_bufs[buf_idx]->data_len += rxq->crc_len;
+
+			start->nb_segs++;
+			start->pkt_len += rx_bufs[buf_idx]->data_len;
+			end = end->next;
+
+			if (!split_flags[buf_idx]) {
+				/* it's the last packet of the set */
+				start->hash = end->hash;
+				start->ol_flags = end->ol_flags;
+				/* we need to strip crc for the whole packet */
+				start->pkt_len -= rxq->crc_len;
+				if (end->data_len > rxq->crc_len) {
+					end->data_len -= rxq->crc_len;
+				} else {
+					/* free up last mbuf */
+					struct rte_mbuf *secondlast = start;
+
+					start->nb_segs--;
+					while (secondlast->next != end)
+						secondlast = secondlast->next;
+					secondlast->data_len -= (rxq->crc_len -
+							end->data_len);
+					secondlast->next = NULL;
+					rte_pktmbuf_free_seg(end);
+				}
+				pkts[pkt_idx++] = start;
+				start = NULL;
+				end = NULL;
+			}
+		} else {
+			/* not processing a split packet */
+			if (!split_flags[buf_idx]) {
+				/* not a split packet, save and skip */
+				pkts[pkt_idx++] = rx_bufs[buf_idx];
+				continue;
+			}
+			start = rx_bufs[buf_idx];
+			end = start;
+			rx_bufs[buf_idx]->data_len += rxq->crc_len;
+			rx_bufs[buf_idx]->pkt_len += rxq->crc_len;
+		}
+	}
+
+	/* save the partial packet for next time */
+	rxq->pkt_first_seg = start;
+	rxq->pkt_last_seg = end;
+	rte_memcpy(rx_bufs, pkts, pkt_idx * (sizeof(*pkts)));
+	return pkt_idx;
+}
+
+static inline void
+_ice_rx_queue_release_mbufs_vec(struct ice_rx_queue *rxq)
+{
+	const unsigned int mask = rxq->nb_rx_desc - 1;
+	unsigned int i;
+
+	if (!rxq->sw_ring || rxq->rxrearm_nb >= rxq->nb_rx_desc)
+		return;
+
+	/* free all mbufs that are valid in the ring */
+	if (rxq->rxrearm_nb == 0) {
+		for (i = 0; i < rxq->nb_rx_desc; i++) {
+			if (rxq->sw_ring[i].mbuf)
+				rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+		}
+	} else {
+		for (i = rxq->rx_tail;
+		     i != rxq->rxrearm_start;
+		     i = (i + 1) & mask) {
+			if (rxq->sw_ring[i].mbuf)
+				rte_pktmbuf_free_seg(rxq->sw_ring[i].mbuf);
+		}
+	}
+
+	rxq->rxrearm_nb = rxq->nb_rx_desc;
+
+	/* set all entries to NULL */
+	memset(rxq->sw_ring, 0, sizeof(rxq->sw_ring[0]) * rxq->nb_rx_desc);
+}
+
+static inline int
+ice_rxq_vec_setup_default(struct ice_rx_queue *rxq)
+{
+	uintptr_t p;
+	struct rte_mbuf mb_def = { .buf_addr = 0 }; /* zeroed mbuf */
+
+	mb_def.nb_segs = 1;
+	mb_def.data_off = RTE_PKTMBUF_HEADROOM;
+	mb_def.port = rxq->port_id;
+	rte_mbuf_refcnt_set(&mb_def, 1);
+
+	/* prevent compiler reordering: rearm_data covers previous fields */
+	rte_compiler_barrier();
+	p = (uintptr_t)&mb_def.rearm_data;
+	rxq->mbuf_initializer = *(uint64_t *)p;
+	return 0;
+}
+
+static inline int
+ice_rx_vec_queue_default(struct ice_rx_queue *rxq)
+{
+	if (!rxq)
+		return -1;
+
+	if (!rte_is_power_of_2(rxq->nb_rx_desc))
+		return -1;
+
+	if (rxq->rx_free_thresh < ICE_VPMD_RX_BURST)
+		return -1;
+
+	if (rxq->nb_rx_desc % rxq->rx_free_thresh)
+		return -1;
+
+	return 0;
+}
+
+static inline int
+ice_rx_vec_dev_check_default(struct rte_eth_dev *dev)
+{
+	int i;
+	struct ice_rx_queue *rxq;
+
+	for (i = 0; i < dev->data->nb_rx_queues; i++) {
+		rxq = dev->data->rx_queues[i];
+		if (ice_rx_vec_queue_default(rxq))
+			return -1;
+	}
+
+	return 0;
+}
+
+#endif
diff --git a/drivers/net/ice/ice_rxtx_vec_sse.c b/drivers/net/ice/ice_rxtx_vec_sse.c
new file mode 100644
index 0000000..f6fe9ef
--- /dev/null
+++ b/drivers/net/ice/ice_rxtx_vec_sse.c
@@ -0,0 +1,496 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Intel Corporation
+ */
+
+#include "ice_rxtx_vec_common.h"
+
+#include <tmmintrin.h>
+
+#ifndef __INTEL_COMPILER
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#endif
+
+static inline void
+ice_rxq_rearm(struct ice_rx_queue *rxq)
+{
+	int i;
+	uint16_t rx_id;
+	volatile union ice_rx_desc *rxdp;
+	struct ice_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
+	struct rte_mbuf *mb0, *mb1;
+	__m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
+					  RTE_PKTMBUF_HEADROOM);
+	__m128i dma_addr0, dma_addr1;
+
+	rxdp = rxq->rx_ring + rxq->rxrearm_start;
+
+	/* Pull 'n' more MBUFs into the software ring */
+	if (rte_mempool_get_bulk(rxq->mp,
+				 (void *)rxep,
+				 ICE_RXQ_REARM_THRESH) < 0) {
+		if (rxq->rxrearm_nb + ICE_RXQ_REARM_THRESH >=
+		    rxq->nb_rx_desc) {
+			dma_addr0 = _mm_setzero_si128();
+			for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
+				rxep[i].mbuf = &rxq->fake_mbuf;
+				_mm_store_si128((__m128i *)&rxdp[i].read,
+						dma_addr0);
+			}
+		}
+		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+			ICE_RXQ_REARM_THRESH;
+		return;
+	}
+
+	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
+	for (i = 0; i < ICE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
+		__m128i vaddr0, vaddr1;
+
+		mb0 = rxep[0].mbuf;
+		mb1 = rxep[1].mbuf;
+
+		/* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+				 offsetof(struct rte_mbuf, buf_addr) + 8);
+		vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+		vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+		/* convert pa to dma_addr hdr/data */
+		dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+		dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
+
+		/* add headroom to pa values */
+		dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+		dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
+
+		/* flush desc with pa dma_addr */
+		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+		_mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
+	}
+
+	rxq->rxrearm_start += ICE_RXQ_REARM_THRESH;
+	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
+		rxq->rxrearm_start = 0;
+
+	rxq->rxrearm_nb -= ICE_RXQ_REARM_THRESH;
+
+	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
+			   (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
+
+	/* Update the tail pointer on the NIC */
+	ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
+}
+
+static inline void
+desc_to_olflags_v(struct ice_rx_queue *rxq, __m128i descs[4],
+		  struct rte_mbuf **rx_pkts)
+{
+	const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer);
+	__m128i rearm0, rearm1, rearm2, rearm3;
+
+	__m128i vlan0, vlan1, rss, l3_l4e;
+
+	/* mask everything except RSS, flow director and VLAN flags
+	 * bit2 is for VLAN tag, bit11 for flow director indication
+	 * bit13:12 for RSS indication.
+	 */
+	const __m128i rss_vlan_msk = _mm_set_epi32(0x1c03804, 0x1c03804,
+						   0x1c03804, 0x1c03804);
+
+	const __m128i cksum_mask = _mm_set_epi32(PKT_RX_IP_CKSUM_GOOD |
+						 PKT_RX_IP_CKSUM_BAD |
+						 PKT_RX_L4_CKSUM_GOOD |
+						 PKT_RX_L4_CKSUM_BAD |
+						 PKT_RX_EIP_CKSUM_BAD,
+						 PKT_RX_IP_CKSUM_GOOD |
+						 PKT_RX_IP_CKSUM_BAD |
+						 PKT_RX_L4_CKSUM_GOOD |
+						 PKT_RX_L4_CKSUM_BAD |
+						 PKT_RX_EIP_CKSUM_BAD,
+						 PKT_RX_IP_CKSUM_GOOD |
+						 PKT_RX_IP_CKSUM_BAD |
+						 PKT_RX_L4_CKSUM_GOOD |
+						 PKT_RX_L4_CKSUM_BAD |
+						 PKT_RX_EIP_CKSUM_BAD,
+						 PKT_RX_IP_CKSUM_GOOD |
+						 PKT_RX_IP_CKSUM_BAD |
+						 PKT_RX_L4_CKSUM_GOOD |
+						 PKT_RX_L4_CKSUM_BAD |
+						 PKT_RX_EIP_CKSUM_BAD);
+
+	/* map rss and vlan type to rss hash and vlan flag */
+	const __m128i vlan_flags = _mm_set_epi8(0, 0, 0, 0,
+			0, 0, 0, 0,
+			0, 0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
+			0, 0, 0, 0);
+
+	const __m128i rss_flags = _mm_set_epi8(0, 0, 0, 0,
+			0, 0, 0, 0,
+			PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH, 0, 0,
+			0, 0, PKT_RX_FDIR, 0);
+
+	const __m128i l3_l4e_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+			/* shift right 1 bit to make sure it does not exceed 255 */
+			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
+			 PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
+			 PKT_RX_L4_CKSUM_BAD) >> 1,
+			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
+			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
+			PKT_RX_IP_CKSUM_BAD >> 1,
+			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);
+
+	vlan0 = _mm_unpackhi_epi32(descs[0], descs[1]);
+	vlan1 = _mm_unpackhi_epi32(descs[2], descs[3]);
+	vlan0 = _mm_unpacklo_epi64(vlan0, vlan1);
+
+	vlan1 = _mm_and_si128(vlan0, rss_vlan_msk);
+	vlan0 = _mm_shuffle_epi8(vlan_flags, vlan1);
+
+	rss = _mm_srli_epi32(vlan1, 11);
+	rss = _mm_shuffle_epi8(rss_flags, rss);
+
+	l3_l4e = _mm_srli_epi32(vlan1, 22);
+	l3_l4e = _mm_shuffle_epi8(l3_l4e_flags, l3_l4e);
+	/* then we shift left 1 bit */
+	l3_l4e = _mm_slli_epi32(l3_l4e, 1);
+	/* we need to mask out the redundant bits */
+	l3_l4e = _mm_and_si128(l3_l4e, cksum_mask);
+
+	vlan0 = _mm_or_si128(vlan0, rss);
+	vlan0 = _mm_or_si128(vlan0, l3_l4e);
+
+	/**
+	 * At this point, we have the 4 sets of flags in the low 16-bits
+	 * of each 32-bit value in vlan0.
+	 * We want to extract these, and merge them with the mbuf init data
+	 * so we can do a single 16-byte write to the mbuf to set the flags
+	 * and all the other initialization fields. Extracting the
+	 * appropriate flags means that we have to do a shift and blend for
+	 * each mbuf before we do the write.
+	 */
+	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 8), 0x10);
+	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 4), 0x10);
+	rearm2 = _mm_blend_epi16(mbuf_init, vlan0, 0x10);
+	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(vlan0, 4), 0x10);
+
+	/* write the rearm data and the olflags in one write */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
+			 offsetof(struct rte_mbuf, rearm_data) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
+			 RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
+	_mm_store_si128((__m128i *)&rx_pkts[0]->rearm_data, rearm0);
+	_mm_store_si128((__m128i *)&rx_pkts[1]->rearm_data, rearm1);
+	_mm_store_si128((__m128i *)&rx_pkts[2]->rearm_data, rearm2);
+	_mm_store_si128((__m128i *)&rx_pkts[3]->rearm_data, rearm3);
+}
+
+#define PKTLEN_SHIFT     10
+
+static inline void
+desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
+		uint32_t *ptype_tbl)
+{
+	__m128i ptype0 = _mm_unpackhi_epi64(descs[0], descs[1]);
+	__m128i ptype1 = _mm_unpackhi_epi64(descs[2], descs[3]);
+
+	ptype0 = _mm_srli_epi64(ptype0, 30);
+	ptype1 = _mm_srli_epi64(ptype1, 30);
+
+	rx_pkts[0]->packet_type = ptype_tbl[_mm_extract_epi8(ptype0, 0)];
+	rx_pkts[1]->packet_type = ptype_tbl[_mm_extract_epi8(ptype0, 8)];
+	rx_pkts[2]->packet_type = ptype_tbl[_mm_extract_epi8(ptype1, 0)];
+	rx_pkts[3]->packet_type = ptype_tbl[_mm_extract_epi8(ptype1, 8)];
+}
+
+/**
+ * Notice:
+ * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
+ * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
+ *   numbers of DD bits
+ */
+static inline uint16_t
+_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
+		   uint16_t nb_pkts, uint8_t *split_packet)
+{
+	volatile union ice_rx_desc *rxdp;
+	struct ice_rx_entry *sw_ring;
+	uint16_t nb_pkts_recd;
+	int pos;
+	uint64_t var;
+	__m128i shuf_msk;
+	uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
+
+	__m128i crc_adjust = _mm_set_epi16
+				(0, 0, 0,    /* ignore non-length fields */
+				 -rxq->crc_len, /* sub crc on data_len */
+				 0,          /* ignore high-16bits of pkt_len */
+				 -rxq->crc_len, /* sub crc on pkt_len */
+				 0, 0            /* ignore pkt_type field */
+				);
+	/**
+	 * compile-time check the above crc_adjust layout is correct.
+	 * NOTE: the first field (lowest address) is given last in set_epi16
+	 * call above.
+	 */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+	__m128i dd_check, eop_check;
+
+	/* nb_pkts shall be less than or equal to ICE_MAX_RX_BURST */
+	nb_pkts = RTE_MIN(nb_pkts, ICE_MAX_RX_BURST);
+
+	/* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP);
+
+	/* Just the act of getting into the function from the application is
+	 * going to cost about 7 cycles
+	 */
+	rxdp = rxq->rx_ring + rxq->rx_tail;
+
+	rte_prefetch0(rxdp);
+
+	/* See if we need to rearm the RX queue - gives the prefetch a bit
+	 * of time to act
+	 */
+	if (rxq->rxrearm_nb > ICE_RXQ_REARM_THRESH)
+		ice_rxq_rearm(rxq);
+
+	/* Before we start moving massive data around, check to see if
+	 * there is actually a packet available
+	 */
+	if (!(rxdp->wb.qword1.status_error_len &
+	      rte_cpu_to_le_32(1 << ICE_RX_DESC_STATUS_DD_S)))
+		return 0;
+
+	/* 4 packets DD mask */
+	dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);
+
+	/* 4 packets EOP mask */
+	eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);
+
+	/* mask to shuffle from desc. to mbuf */
+	shuf_msk = _mm_set_epi8
+			(7, 6, 5, 4,  /* octet 4~7, 32bits rss */
+			 3, 2,        /* octet 2~3, low 16 bits vlan_macip */
+			 15, 14,      /* octet 15~14, 16 bits data_len */
+			 0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
+			 15, 14,      /* octet 15~14, low 16 bits pkt_len */
+			 0xFF, 0xFF,  /* pkt_type set as unknown */
+			 0xFF, 0xFF   /* pkt_type set as unknown */
+			);
+	/**
+	 * Compile-time verify the shuffle mask
+	 * NOTE: some field positions already verified above, but duplicated
+	 * here for completeness in case of future modifications.
+	 */
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
+	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
+			 offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
+
+	/* Cache is empty -> need to scan the buffer rings, but first move
+	 * the next 'n' mbufs into the cache
+	 */
+	sw_ring = &rxq->sw_ring[rxq->rx_tail];
+
+	/* A. load 4 packets in one loop
+	 * [A*. mask out 4 unused dirty fields in desc]
+	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
+	 * C. calc the number of DD bits among the 4 packets
+	 * [C*. extract the end-of-packet bit, if requested]
+	 * D. fill info. from desc to mbuf
+	 */
+
+	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
+	     pos += ICE_DESCS_PER_LOOP,
+	     rxdp += ICE_DESCS_PER_LOOP) {
+		__m128i descs[ICE_DESCS_PER_LOOP];
+		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
+		__m128i zero, staterr, sterr_tmp1, sterr_tmp2;
+		/* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
+		__m128i mbp1;
+#if defined(RTE_ARCH_X86_64)
+		__m128i mbp2;
+#endif
+
+		/* B.1 load 2 (64 bit) or 4 (32 bit) mbuf points */
+		mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);
+		/* Read desc statuses backwards to avoid race condition */
+		/* A.1 load 4 pkts desc */
+		descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));
+		rte_compiler_barrier();
+
+		/* B.2 copy 2 64 bit or 4 32 bit mbuf point into rx_pkts */
+		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);
+
+#if defined(RTE_ARCH_X86_64)
+		/* B.1 load 2 64 bit mbuf points */
+		mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos + 2]);
+#endif
+
+		descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
+		rte_compiler_barrier();
+		/* B.1 load 2 mbuf point */
+		descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
+		rte_compiler_barrier();
+		descs[0] = _mm_loadu_si128((__m128i *)(rxdp));
+
+#if defined(RTE_ARCH_X86_64)
+		/* B.2 copy 2 mbuf point into rx_pkts  */
+		_mm_storeu_si128((__m128i *)&rx_pkts[pos + 2], mbp2);
+#endif
+
+		if (split_packet) {
+			rte_mbuf_prefetch_part2(rx_pkts[pos]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
+			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
+		}
+
+		/* avoid compiler reorder optimization */
+		rte_compiler_barrier();
+
+		/* pkt 3,4 shift the pktlen field to be 16-bit aligned*/
+		const __m128i len3 = _mm_slli_epi32(descs[3], PKTLEN_SHIFT);
+		const __m128i len2 = _mm_slli_epi32(descs[2], PKTLEN_SHIFT);
+
+		/* merge the now-aligned packet length fields back in */
+		descs[3] = _mm_blend_epi16(descs[3], len3, 0x80);
+		descs[2] = _mm_blend_epi16(descs[2], len2, 0x80);
+
+		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
+		pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
+		pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);
+
+		/* C.1 4=>2 filter staterr info only */
+		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
+		/* C.1 4=>2 filter staterr info only */
+		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);
+
+		desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);
+
+		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
+		pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
+		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);
+
+		/* pkt 1,2 shift the pktlen field to be 16-bit aligned*/
+		const __m128i len1 = _mm_slli_epi32(descs[1], PKTLEN_SHIFT);
+		const __m128i len0 = _mm_slli_epi32(descs[0], PKTLEN_SHIFT);
+
+		/* merge the now-aligned packet length fields back in */
+		descs[1] = _mm_blend_epi16(descs[1], len1, 0x80);
+		descs[0] = _mm_blend_epi16(descs[0], len0, 0x80);
+
+		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
+		pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
+		pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);
+
+		/* C.2 get 4 pkts staterr value  */
+		zero = _mm_xor_si128(dd_check, dd_check);
+		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);
+
+		/* D.3 copy final 3,4 data to rx_pkts */
+		_mm_storeu_si128
+			((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
+			 pkt_mb4);
+		_mm_storeu_si128
+			((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
+			 pkt_mb3);
+
+		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
+		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
+		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);
+
+		/* C* extract and record EOP bit */
+		if (split_packet) {
+			__m128i eop_shuf_mask = _mm_set_epi8(0xFF, 0xFF,
+							     0xFF, 0xFF,
+							     0xFF, 0xFF,
+							     0xFF, 0xFF,
+							     0xFF, 0xFF,
+							     0xFF, 0xFF,
+							     0x04, 0x0C,
+							     0x00, 0x08);
+
+			/* and with mask to extract bits, flipping 1-0 */
+			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
+			/* the staterr values are not in order, as the count
+			 * of dd bits doesn't care. However, for end of
+			 * packet tracking, we do care, so shuffle. This also
+			 * compresses the 32-bit values to 8-bit
+			 */
+			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
+			/* store the resulting 32-bit value */
+			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
+			split_packet += ICE_DESCS_PER_LOOP;
+		}
+
+		/* C.3 calc available number of desc */
+		staterr = _mm_and_si128(staterr, dd_check);
+		staterr = _mm_packs_epi32(staterr, zero);
+
+		/* D.3 copy final 1,2 data to rx_pkts */
+		_mm_storeu_si128
+			((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
+			 pkt_mb2);
+		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
+				 pkt_mb1);
+		desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
+		/* C.4 calc available number of desc */
+		var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
+		nb_pkts_recd += var;
+		if (likely(var != ICE_DESCS_PER_LOOP))
+			break;
+	}
+
+	/* Update our internal tail pointer */
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
+	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
+	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);
+
+	return nb_pkts_recd;
+}
+
+/**
+ * Notice:
+ * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
+ * - nb_pkts > ICE_VPMD_RX_BURST, only scan ICE_VPMD_RX_BURST
+ *   numbers of DD bits
+ */
+uint16_t
+ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
+		  uint16_t nb_pkts)
+{
+	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
+}
+
+static void __attribute__((cold))
+ice_rx_queue_release_mbufs_vec(struct ice_rx_queue *rxq)
+{
+	_ice_rx_queue_release_mbufs_vec(rxq);
+}
+
+int __attribute__((cold))
+ice_rxq_vec_setup(struct ice_rx_queue *rxq)
+{
+	if (!rxq)
+		return -1;
+
+	rxq->rx_rel_mbufs = ice_rx_queue_release_mbufs_vec;
+	return ice_rxq_vec_setup_default(rxq);
+}
+
+int __attribute__((cold))
+ice_rx_vec_dev_check(struct rte_eth_dev *dev)
+{
+	return ice_rx_vec_dev_check_default(dev);
+}
diff --git a/drivers/net/ice/meson.build b/drivers/net/ice/meson.build
index 857dc0e..469264d 100644
--- a/drivers/net/ice/meson.build
+++ b/drivers/net/ice/meson.build
@@ -11,3 +11,7 @@  sources = files(
 
 deps += ['hash']
 includes += include_directories('base')
+
+if arch_subdir == 'x86'
+	sources += files('ice_rxtx_vec_sse.c')
+endif