[v2,4/5] vhost: add packed ring vectorized dequeue

Message ID 20200921064837.15957-5-yong.liu@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Maxime Coquelin
Series: vhost add vectorized data path

Checks

Context        Check    Description
ci/checkpatch  success  coding style OK

Commit Message

Marvin Liu Sept. 21, 2020, 6:48 a.m. UTC
  Optimize the vhost packed ring dequeue path with SIMD instructions. Status
check and write-back of four descriptors are handled in batches with AVX512
instructions. Address translation operations are also accelerated by
AVX512 instructions.

If the platform or compiler does not support vectorization, the code falls
back to the default path.

Signed-off-by: Marvin Liu <yong.liu@intel.com>
  

Comments

Maxime Coquelin Oct. 6, 2020, 2:59 p.m. UTC | #1
On 9/21/20 8:48 AM, Marvin Liu wrote:
> Optimize vhost packed ring dequeue path with SIMD instructions. Four
> descriptors status check and writeback are batched handled with AVX512
> instructions. Address translation operations are also accelerated by
> AVX512 instructions.
> 
> If platform or compiler not support vectorization, will fallback to
> default path.
> 
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
> 
> diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build
> index cc9aa65c67..c1481802d7 100644
> --- a/lib/librte_vhost/meson.build
> +++ b/lib/librte_vhost/meson.build
> @@ -8,6 +8,22 @@ endif
>  if has_libnuma == 1
>  	dpdk_conf.set10('RTE_LIBRTE_VHOST_NUMA', true)
>  endif
> +
> +if arch_subdir == 'x86'
> +        if not machine_args.contains('-mno-avx512f')
> +                if cc.has_argument('-mavx512f') and cc.has_argument('-mavx512vl') and cc.has_argument('-mavx512bw')
> +                        cflags += ['-DCC_AVX512_SUPPORT']
> +                        vhost_avx512_lib = static_library('vhost_avx512_lib',
> +                                              'vhost_vec_avx.c',
> +                                              dependencies: [static_rte_eal, static_rte_mempool,
> +                                                  static_rte_mbuf, static_rte_ethdev, static_rte_net],
> +                                              include_directories: includes,
> +                                              c_args: [cflags, '-mavx512f', '-mavx512bw', '-mavx512vl'])
> +                        objs += vhost_avx512_lib.extract_objects('vhost_vec_avx.c')
> +                endif
> +        endif
> +endif

Not a Meson expert, but I wonder how I can disable CC_AVX512_SUPPORT.
I checked the DPDK doc, but could not find how to pass -mno-avx512f to
machine_args.

> +
>  if (toolchain == 'gcc' and cc.version().version_compare('>=8.3.0'))
>  	cflags += '-DVHOST_GCC_UNROLL_PRAGMA'
>  elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0'))
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 4a81f18f01..fc7daf2145 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -1124,4 +1124,12 @@ virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
>  	return NULL;
>  }
>  
> +int
> +vhost_reserve_avail_batch_packed_avx(struct virtio_net *dev,
> +				 struct vhost_virtqueue *vq,
> +				 struct rte_mempool *mbuf_pool,
> +				 struct rte_mbuf **pkts,
> +				 uint16_t avail_idx,
> +				 uintptr_t *desc_addrs,
> +				 uint16_t *ids);
>  #endif /* _VHOST_NET_CDEV_H_ */
> diff --git a/lib/librte_vhost/vhost_vec_avx.c b/lib/librte_vhost/vhost_vec_avx.c
> new file mode 100644
> index 0000000000..dc5322d002
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_vec_avx.c

For consistency it should be prefixed with virtio_net, not vhost.

> @@ -0,0 +1,181 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2016 Intel Corporation
> + */
> +#include <stdint.h>
> +
> +#include "vhost.h"
> +
> +#define BYTE_SIZE 8
> +/* reference count offset in mbuf rearm data */
> +#define REFCNT_BITS_OFFSET ((offsetof(struct rte_mbuf, refcnt) - \
> +	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
> +/* segment number offset in mbuf rearm data */
> +#define SEG_NUM_BITS_OFFSET ((offsetof(struct rte_mbuf, nb_segs) - \
> +	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
> +
> +/* default rearm data */
> +#define DEFAULT_REARM_DATA (1ULL << SEG_NUM_BITS_OFFSET | \
> +	1ULL << REFCNT_BITS_OFFSET)
> +
> +#define DESC_FLAGS_SHORT_OFFSET (offsetof(struct vring_packed_desc, flags) / \
> +	sizeof(uint16_t))
> +
> +#define DESC_FLAGS_SHORT_SIZE (sizeof(struct vring_packed_desc) / \
> +	sizeof(uint16_t))
> +#define BATCH_FLAGS_MASK (1 << DESC_FLAGS_SHORT_OFFSET | \
> +	1 << (DESC_FLAGS_SHORT_OFFSET + DESC_FLAGS_SHORT_SIZE) | \
> +	1 << (DESC_FLAGS_SHORT_OFFSET + DESC_FLAGS_SHORT_SIZE * 2)  | \
> +	1 << (DESC_FLAGS_SHORT_OFFSET + DESC_FLAGS_SHORT_SIZE * 3))
> +
> +#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
> +	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
> +
> +#define PACKED_FLAGS_MASK ((0ULL | VRING_DESC_F_AVAIL | VRING_DESC_F_USED) \
> +	<< FLAGS_BITS_OFFSET)
> +#define PACKED_AVAIL_FLAG ((0ULL | VRING_DESC_F_AVAIL) << FLAGS_BITS_OFFSET)
> +#define PACKED_AVAIL_FLAG_WRAP ((0ULL | VRING_DESC_F_USED) << \
> +	FLAGS_BITS_OFFSET)
> +
> +#define DESC_FLAGS_POS 0xaa
> +#define MBUF_LENS_POS 0x6666
> +
> +int
> +vhost_reserve_avail_batch_packed_avx(struct virtio_net *dev,
> +				 struct vhost_virtqueue *vq,
> +				 struct rte_mempool *mbuf_pool,
> +				 struct rte_mbuf **pkts,
> +				 uint16_t avail_idx,
> +				 uintptr_t *desc_addrs,
> +				 uint16_t *ids)
> +{
> +	struct vring_packed_desc *descs = vq->desc_packed;
> +	uint32_t descs_status;
> +	void *desc_addr;
> +	uint16_t i;
> +	uint8_t cmp_low, cmp_high, cmp_result;
> +	uint64_t lens[PACKED_BATCH_SIZE];
> +	struct virtio_net_hdr *hdr;
> +
> +	if (unlikely(avail_idx & PACKED_BATCH_MASK))
> +		return -1;
> +
> +	/* load 4 descs */
> +	desc_addr = &vq->desc_packed[avail_idx];
> +	__m512i desc_vec = _mm512_loadu_si512(desc_addr);

Unlike the split ring, the packed ring specification does not mandate the
ring size to be a power of two. So checking that avail_idx is aligned on
64 bytes is not enough, given a descriptor is 16 bytes.

You also need to check against the ring size to prevent out-of-bounds
accesses.

I see the non-vectorized batch processing you introduced in v19.11 also
makes that wrong assumption. Please fix it.
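
For illustration, the guard could combine the existing alignment check with
a ring-size check along these lines (only a sketch; vq->size is assumed to
hold the ring size):

	if (unlikely(avail_idx & PACKED_BATCH_MASK))
		return -1;
	/* the packed ring size need not be a power of two, so the
	 * batch could otherwise run past the end of the descriptor array
	 */
	if (unlikely(avail_idx + PACKED_BATCH_SIZE > vq->size))
		return -1;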

Also, I wonder whether it is assumed that &vq->desc_packed[avail_idx]
is aligned on a cache line. Meaning, do the intrinsics below have such a
requirement?

> +	/* burst check four status */
> +	__m512i avail_flag_vec;
> +	if (vq->avail_wrap_counter)
> +#if defined(RTE_ARCH_I686)
> +		avail_flag_vec = _mm512_set4_epi64(PACKED_AVAIL_FLAG, 0x0,
> +					PACKED_FLAGS_MASK, 0x0);
> +#else
> +		avail_flag_vec = _mm512_maskz_set1_epi64(DESC_FLAGS_POS,
> +					PACKED_AVAIL_FLAG);
> +
> +#endif
> +	else
> +#if defined(RTE_ARCH_I686)
> +		avail_flag_vec = _mm512_set4_epi64(PACKED_AVAIL_FLAG_WRAP,
> +					0x0, PACKED_AVAIL_FLAG_WRAP, 0x0);
> +#else
> +		avail_flag_vec = _mm512_maskz_set1_epi64(DESC_FLAGS_POS,
> +					PACKED_AVAIL_FLAG_WRAP);
> +#endif
> +
> +	descs_status = _mm512_cmp_epu16_mask(desc_vec, avail_flag_vec,
> +		_MM_CMPINT_NE);
> +	if (descs_status & BATCH_FLAGS_MASK)
> +		return -1;
> +
> +	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) {
> +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +			uint64_t size = (uint64_t)descs[avail_idx + i].len;
> +			desc_addrs[i] = __vhost_iova_to_vva(dev, vq,
> +				descs[avail_idx + i].addr, &size,
> +				VHOST_ACCESS_RO);
> +
> +			if (!desc_addrs[i])
> +				goto free_buf;
> +			lens[i] = descs[avail_idx + i].len;
> +			rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
> +
> +			pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool,
> +					lens[i]);
> +			if (!pkts[i])
> +				goto free_buf;
> +		}
> +	} else {
> +		/* check buffer fit into one region & translate address */
> +		__m512i regions_low_addrs =
> +			_mm512_loadu_si512((void *)&dev->regions_low_addrs);
> +		__m512i regions_high_addrs =
> +			_mm512_loadu_si512((void *)&dev->regions_high_addrs);
> +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +			uint64_t addr_low = descs[avail_idx + i].addr;
> +			uint64_t addr_high = addr_low +
> +						descs[avail_idx + i].len;
> +			__m512i low_addr_vec = _mm512_set1_epi64(addr_low);
> +			__m512i high_addr_vec = _mm512_set1_epi64(addr_high);
> +
> +			cmp_low = _mm512_cmp_epi64_mask(low_addr_vec,
> +					regions_low_addrs, _MM_CMPINT_NLT);
> +			cmp_high = _mm512_cmp_epi64_mask(high_addr_vec,
> +					regions_high_addrs, _MM_CMPINT_LT);
> +			cmp_result = cmp_low & cmp_high;
> +			int index = __builtin_ctz(cmp_result);
> +			if (unlikely((uint32_t)index >= dev->mem->nregions))
> +				goto free_buf;
> +
> +			desc_addrs[i] = addr_low +
> +				dev->mem->regions[index].host_user_addr -
> +				dev->mem->regions[index].guest_phys_addr;
> +			lens[i] = descs[avail_idx + i].len;
> +			rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
> +
> +			pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool,
> +					lens[i]);
> +			if (!pkts[i])
> +				goto free_buf;
> +		}
> +	}
> +
> +	if (virtio_net_with_host_offload(dev)) {
> +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +			hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
> +			vhost_dequeue_offload(hdr, pkts[i]);
> +		}
> +	}
> +
> +	if (unlikely(virtio_net_is_inorder(dev))) {
> +		ids[PACKED_BATCH_SIZE - 1] =
> +			descs[avail_idx + PACKED_BATCH_SIZE - 1].id;

Isn't in-order a likely case? Maybe just remove the unlikely.

> +	} else {
> +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
> +			ids[i] = descs[avail_idx + i].id;
> +	}
> +
> +	uint64_t addrs[PACKED_BATCH_SIZE << 1];
> +	/* store mbuf data_len, pkt_len */
> +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +		addrs[i << 1] = (uint64_t)pkts[i]->rx_descriptor_fields1;
> +		addrs[(i << 1) + 1] = (uint64_t)pkts[i]->rx_descriptor_fields1
> +					+ sizeof(uint64_t);
> +	}
> +
> +	/* save pkt_len and data_len into mbufs */
> +	__m512i value_vec = _mm512_maskz_shuffle_epi32(MBUF_LENS_POS, desc_vec,
> +					0xAA);
> +	__m512i offsets_vec = _mm512_maskz_set1_epi32(MBUF_LENS_POS,
> +					(uint32_t)-12);
> +	value_vec = _mm512_add_epi32(value_vec, offsets_vec);
> +	__m512i vindex = _mm512_loadu_si512((void *)addrs);
> +	_mm512_i64scatter_epi64(0, vindex, value_vec, 1);
> +
> +	return 0;
> +free_buf:
> +	for (i = 0; i < PACKED_BATCH_SIZE; i++)
> +		rte_pktmbuf_free(pkts[i]);
> +
> +	return -1;
> +}
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 6107662685..e4d2e2e7d6 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -2249,6 +2249,28 @@ vhost_reserve_avail_batch_packed(struct virtio_net *dev,
>  	return -1;
>  }
>  
> +static __rte_always_inline int
> +vhost_handle_avail_batch_packed(struct virtio_net *dev,
> +				 struct vhost_virtqueue *vq,
> +				 struct rte_mempool *mbuf_pool,
> +				 struct rte_mbuf **pkts,
> +				 uint16_t avail_idx,
> +				 uintptr_t *desc_addrs,
> +				 uint16_t *ids)
> +{
> +	if (unlikely(dev->vectorized))
> +#ifdef CC_AVX512_SUPPORT
> +		return vhost_reserve_avail_batch_packed_avx(dev, vq, mbuf_pool,
> +				pkts, avail_idx, desc_addrs, ids);
> +#else
> +		return vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool,
> +				pkts, avail_idx, desc_addrs, ids);
> +
> +#endif
> +	return vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
> +			avail_idx, desc_addrs, ids);
> +}


It should be as below to not have any performance impact when
CC_AVX512_SUPPORT is not set:

#ifdef CC_AVX512_SUPPORT
	if (unlikely(dev->vectorized))
		return vhost_reserve_avail_batch_packed_avx(dev, vq, mbuf_pool,
			pkts, avail_idx, desc_addrs, ids);
#else
	return vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
		avail_idx, desc_addrs, ids);
#endif
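
For completeness, one full reading of that suggestion is sketched below; the
fall-through to the scalar batch path when CC_AVX512_SUPPORT is defined but
dev->vectorized is not set is an assumption, not spelled out above:

static __rte_always_inline int
vhost_handle_avail_batch_packed(struct virtio_net *dev,
				 struct vhost_virtqueue *vq,
				 struct rte_mempool *mbuf_pool,
				 struct rte_mbuf **pkts,
				 uint16_t avail_idx,
				 uintptr_t *desc_addrs,
				 uint16_t *ids)
{
#ifdef CC_AVX512_SUPPORT
	/* AVX512 build: only take the vectorized path when requested */
	if (unlikely(dev->vectorized))
		return vhost_reserve_avail_batch_packed_avx(dev, vq, mbuf_pool,
				pkts, avail_idx, desc_addrs, ids);
#endif
	/* scalar batch path, also used when AVX512 is compiled out */
	return vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
			avail_idx, desc_addrs, ids);
}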
> +
>  static __rte_always_inline int
>  virtio_dev_tx_batch_packed(struct virtio_net *dev,
>  			   struct vhost_virtqueue *vq,
> @@ -2261,8 +2283,9 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev,
>  	uint16_t ids[PACKED_BATCH_SIZE];
>  	uint16_t i;
>  
> -	if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
> -					     avail_idx, desc_addrs, ids))
> +
> +	if (vhost_handle_avail_batch_packed(dev, vq, mbuf_pool, pkts,
> +		avail_idx, desc_addrs, ids))
>  		return -1;
>  
>  	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
>
  
Maxime Coquelin Oct. 6, 2020, 3:18 p.m. UTC | #2
On 9/21/20 8:48 AM, Marvin Liu wrote:
> Optimize vhost packed ring dequeue path with SIMD instructions. Four
> descriptors status check and writeback are batched handled with AVX512
> instructions. Address translation operations are also accelerated by
> AVX512 instructions.
> 
> If platform or compiler not support vectorization, will fallback to
> default path.
> 
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
> 
> diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build
> index cc9aa65c67..c1481802d7 100644
> --- a/lib/librte_vhost/meson.build
> +++ b/lib/librte_vhost/meson.build
> @@ -8,6 +8,22 @@ endif
>  if has_libnuma == 1
>  	dpdk_conf.set10('RTE_LIBRTE_VHOST_NUMA', true)
>  endif
> +
> +if arch_subdir == 'x86'
> +        if not machine_args.contains('-mno-avx512f')
> +                if cc.has_argument('-mavx512f') and cc.has_argument('-mavx512vl') and cc.has_argument('-mavx512bw')
> +                        cflags += ['-DCC_AVX512_SUPPORT']
> +                        vhost_avx512_lib = static_library('vhost_avx512_lib',
> +                                              'vhost_vec_avx.c',
> +                                              dependencies: [static_rte_eal, static_rte_mempool,
> +                                                  static_rte_mbuf, static_rte_ethdev, static_rte_net],
> +                                              include_directories: includes,
> +                                              c_args: [cflags, '-mavx512f', '-mavx512bw', '-mavx512vl'])
> +                        objs += vhost_avx512_lib.extract_objects('vhost_vec_avx.c')
> +                endif
> +        endif
> +endif
> +
>  if (toolchain == 'gcc' and cc.version().version_compare('>=8.3.0'))
>  	cflags += '-DVHOST_GCC_UNROLL_PRAGMA'
>  elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0'))
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 4a81f18f01..fc7daf2145 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -1124,4 +1124,12 @@ virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
>  	return NULL;
>  }
>  
> +int
> +vhost_reserve_avail_batch_packed_avx(struct virtio_net *dev,
> +				 struct vhost_virtqueue *vq,
> +				 struct rte_mempool *mbuf_pool,
> +				 struct rte_mbuf **pkts,
> +				 uint16_t avail_idx,
> +				 uintptr_t *desc_addrs,
> +				 uint16_t *ids);
>  #endif /* _VHOST_NET_CDEV_H_ */
> diff --git a/lib/librte_vhost/vhost_vec_avx.c b/lib/librte_vhost/vhost_vec_avx.c
> new file mode 100644
> index 0000000000..dc5322d002
> --- /dev/null
> +++ b/lib/librte_vhost/vhost_vec_avx.c
> @@ -0,0 +1,181 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2016 Intel Corporation
> + */
> +#include <stdint.h>
> +
> +#include "vhost.h"
> +
> +#define BYTE_SIZE 8
> +/* reference count offset in mbuf rearm data */
> +#define REFCNT_BITS_OFFSET ((offsetof(struct rte_mbuf, refcnt) - \
> +	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
> +/* segment number offset in mbuf rearm data */
> +#define SEG_NUM_BITS_OFFSET ((offsetof(struct rte_mbuf, nb_segs) - \
> +	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
> +
> +/* default rearm data */
> +#define DEFAULT_REARM_DATA (1ULL << SEG_NUM_BITS_OFFSET | \
> +	1ULL << REFCNT_BITS_OFFSET)
> +
> +#define DESC_FLAGS_SHORT_OFFSET (offsetof(struct vring_packed_desc, flags) / \
> +	sizeof(uint16_t))
> +
> +#define DESC_FLAGS_SHORT_SIZE (sizeof(struct vring_packed_desc) / \
> +	sizeof(uint16_t))
> +#define BATCH_FLAGS_MASK (1 << DESC_FLAGS_SHORT_OFFSET | \
> +	1 << (DESC_FLAGS_SHORT_OFFSET + DESC_FLAGS_SHORT_SIZE) | \
> +	1 << (DESC_FLAGS_SHORT_OFFSET + DESC_FLAGS_SHORT_SIZE * 2)  | \
> +	1 << (DESC_FLAGS_SHORT_OFFSET + DESC_FLAGS_SHORT_SIZE * 3))
> +
> +#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
> +	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
> +
> +#define PACKED_FLAGS_MASK ((0ULL | VRING_DESC_F_AVAIL | VRING_DESC_F_USED) \
> +	<< FLAGS_BITS_OFFSET)
> +#define PACKED_AVAIL_FLAG ((0ULL | VRING_DESC_F_AVAIL) << FLAGS_BITS_OFFSET)
> +#define PACKED_AVAIL_FLAG_WRAP ((0ULL | VRING_DESC_F_USED) << \
> +	FLAGS_BITS_OFFSET)
> +
> +#define DESC_FLAGS_POS 0xaa
> +#define MBUF_LENS_POS 0x6666
> +
> +int
> +vhost_reserve_avail_batch_packed_avx(struct virtio_net *dev,
> +				 struct vhost_virtqueue *vq,
> +				 struct rte_mempool *mbuf_pool,
> +				 struct rte_mbuf **pkts,
> +				 uint16_t avail_idx,
> +				 uintptr_t *desc_addrs,
> +				 uint16_t *ids)
> +{
> +	struct vring_packed_desc *descs = vq->desc_packed;
> +	uint32_t descs_status;
> +	void *desc_addr;
> +	uint16_t i;
> +	uint8_t cmp_low, cmp_high, cmp_result;
> +	uint64_t lens[PACKED_BATCH_SIZE];
> +	struct virtio_net_hdr *hdr;
> +
> +	if (unlikely(avail_idx & PACKED_BATCH_MASK))
> +		return -1;
> +
> +	/* load 4 descs */
> +	desc_addr = &vq->desc_packed[avail_idx];
> +	__m512i desc_vec = _mm512_loadu_si512(desc_addr);
> +
> +	/* burst check four status */
> +	__m512i avail_flag_vec;
> +	if (vq->avail_wrap_counter)
> +#if defined(RTE_ARCH_I686)
> +		avail_flag_vec = _mm512_set4_epi64(PACKED_AVAIL_FLAG, 0x0,
> +					PACKED_FLAGS_MASK, 0x0);
> +#else
> +		avail_flag_vec = _mm512_maskz_set1_epi64(DESC_FLAGS_POS,
> +					PACKED_AVAIL_FLAG);
> +
> +#endif
> +	else
> +#if defined(RTE_ARCH_I686)
> +		avail_flag_vec = _mm512_set4_epi64(PACKED_AVAIL_FLAG_WRAP,
> +					0x0, PACKED_AVAIL_FLAG_WRAP, 0x0);
> +#else
> +		avail_flag_vec = _mm512_maskz_set1_epi64(DESC_FLAGS_POS,
> +					PACKED_AVAIL_FLAG_WRAP);
> +#endif
> +
> +	descs_status = _mm512_cmp_epu16_mask(desc_vec, avail_flag_vec,
> +		_MM_CMPINT_NE);
> +	if (descs_status & BATCH_FLAGS_MASK)
> +		return -1;
> +


Also, please try to factorize code to avoid duplication between Tx and
Rx paths for desc address translation:
> +	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) {
> +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +			uint64_t size = (uint64_t)descs[avail_idx + i].len;
> +			desc_addrs[i] = __vhost_iova_to_vva(dev, vq,
> +				descs[avail_idx + i].addr, &size,
> +				VHOST_ACCESS_RO);
> +
> +			if (!desc_addrs[i])
> +				goto free_buf;
> +			lens[i] = descs[avail_idx + i].len;
> +			rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
> +
> +			pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool,
> +					lens[i]);
> +			if (!pkts[i])
> +				goto free_buf;
> +		}
> +	} else {
> +		/* check buffer fit into one region & translate address */
> +		__m512i regions_low_addrs =
> +			_mm512_loadu_si512((void *)&dev->regions_low_addrs);
> +		__m512i regions_high_addrs =
> +			_mm512_loadu_si512((void *)&dev->regions_high_addrs);
> +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +			uint64_t addr_low = descs[avail_idx + i].addr;
> +			uint64_t addr_high = addr_low +
> +						descs[avail_idx + i].len;
> +			__m512i low_addr_vec = _mm512_set1_epi64(addr_low);
> +			__m512i high_addr_vec = _mm512_set1_epi64(addr_high);
> +
> +			cmp_low = _mm512_cmp_epi64_mask(low_addr_vec,
> +					regions_low_addrs, _MM_CMPINT_NLT);
> +			cmp_high = _mm512_cmp_epi64_mask(high_addr_vec,
> +					regions_high_addrs, _MM_CMPINT_LT);
> +			cmp_result = cmp_low & cmp_high;
> +			int index = __builtin_ctz(cmp_result);
> +			if (unlikely((uint32_t)index >= dev->mem->nregions))
> +				goto free_buf;
> +
> +			desc_addrs[i] = addr_low +
> +				dev->mem->regions[index].host_user_addr -
> +				dev->mem->regions[index].guest_phys_addr;
> +			lens[i] = descs[avail_idx + i].len;
> +			rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
> +
> +			pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool,
> +					lens[i]);
> +			if (!pkts[i])
> +				goto free_buf;
> +		}
> +	}
> +
> +	if (virtio_net_with_host_offload(dev)) {
> +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +			hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
> +			vhost_dequeue_offload(hdr, pkts[i]);
> +		}
> +	}
> +
> +	if (unlikely(virtio_net_is_inorder(dev))) {
> +		ids[PACKED_BATCH_SIZE - 1] =
> +			descs[avail_idx + PACKED_BATCH_SIZE - 1].id;
> +	} else {
> +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
> +			ids[i] = descs[avail_idx + i].id;
> +	}
> +
> +	uint64_t addrs[PACKED_BATCH_SIZE << 1];
> +	/* store mbuf data_len, pkt_len */
> +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +		addrs[i << 1] = (uint64_t)pkts[i]->rx_descriptor_fields1;
> +		addrs[(i << 1) + 1] = (uint64_t)pkts[i]->rx_descriptor_fields1
> +					+ sizeof(uint64_t);
> +	}
> +
> +	/* save pkt_len and data_len into mbufs */
> +	__m512i value_vec = _mm512_maskz_shuffle_epi32(MBUF_LENS_POS, desc_vec,
> +					0xAA);
> +	__m512i offsets_vec = _mm512_maskz_set1_epi32(MBUF_LENS_POS,
> +					(uint32_t)-12);
> +	value_vec = _mm512_add_epi32(value_vec, offsets_vec);
> +	__m512i vindex = _mm512_loadu_si512((void *)addrs);
> +	_mm512_i64scatter_epi64(0, vindex, value_vec, 1);
> +
> +	return 0;
> +free_buf:
> +	for (i = 0; i < PACKED_BATCH_SIZE; i++)
> +		rte_pktmbuf_free(pkts[i]);
> +
> +	return -1;
> +}
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 6107662685..e4d2e2e7d6 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -2249,6 +2249,28 @@ vhost_reserve_avail_batch_packed(struct virtio_net *dev,
>  	return -1;
>  }
>  
> +static __rte_always_inline int
> +vhost_handle_avail_batch_packed(struct virtio_net *dev,
> +				 struct vhost_virtqueue *vq,
> +				 struct rte_mempool *mbuf_pool,
> +				 struct rte_mbuf **pkts,
> +				 uint16_t avail_idx,
> +				 uintptr_t *desc_addrs,
> +				 uint16_t *ids)
> +{
> +	if (unlikely(dev->vectorized))
> +#ifdef CC_AVX512_SUPPORT
> +		return vhost_reserve_avail_batch_packed_avx(dev, vq, mbuf_pool,
> +				pkts, avail_idx, desc_addrs, ids);
> +#else
> +		return vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool,
> +				pkts, avail_idx, desc_addrs, ids);
> +
> +#endif
> +	return vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
> +			avail_idx, desc_addrs, ids);
> +}
> +
>  static __rte_always_inline int
>  virtio_dev_tx_batch_packed(struct virtio_net *dev,
>  			   struct vhost_virtqueue *vq,
> @@ -2261,8 +2283,9 @@ virtio_dev_tx_batch_packed(struct virtio_net *dev,
>  	uint16_t ids[PACKED_BATCH_SIZE];
>  	uint16_t i;
>  
> -	if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
> -					     avail_idx, desc_addrs, ids))
> +
> +	if (vhost_handle_avail_batch_packed(dev, vq, mbuf_pool, pkts,
> +		avail_idx, desc_addrs, ids))
>  		return -1;
>  
>  	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
>
  
Marvin Liu Oct. 8, 2020, 7:05 a.m. UTC | #3
> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Tuesday, October 6, 2020 10:59 PM
> To: Liu, Yong <yong.liu@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>;
> Wang, Zhihong <zhihong.wang@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [PATCH v2 4/5] vhost: add packed ring vectorized dequeue
> 
> 
> 
> On 9/21/20 8:48 AM, Marvin Liu wrote:
> > Optimize vhost packed ring dequeue path with SIMD instructions. Four
> > descriptors status check and writeback are batched handled with AVX512
> > instructions. Address translation operations are also accelerated by
> > AVX512 instructions.
> >
> > If platform or compiler not support vectorization, will fallback to
> > default path.
> >
> > Signed-off-by: Marvin Liu <yong.liu@intel.com>
> >
> > diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build
> > index cc9aa65c67..c1481802d7 100644
> > --- a/lib/librte_vhost/meson.build
> > +++ b/lib/librte_vhost/meson.build
> > @@ -8,6 +8,22 @@ endif
> >  if has_libnuma == 1
> >  	dpdk_conf.set10('RTE_LIBRTE_VHOST_NUMA', true)
> >  endif
> > +
> > +if arch_subdir == 'x86'
> > +        if not machine_args.contains('-mno-avx512f')
> > +                if cc.has_argument('-mavx512f') and cc.has_argument('-
> mavx512vl') and cc.has_argument('-mavx512bw')
> > +                        cflags += ['-DCC_AVX512_SUPPORT']
> > +                        vhost_avx512_lib = static_library('vhost_avx512_lib',
> > +                                              'vhost_vec_avx.c',
> > +                                              dependencies: [static_rte_eal,
> static_rte_mempool,
> > +                                                  static_rte_mbuf, static_rte_ethdev,
> static_rte_net],
> > +                                              include_directories: includes,
> > +                                              c_args: [cflags, '-mavx512f', '-mavx512bw', '-
> mavx512vl'])
> > +                        objs += vhost_avx512_lib.extract_objects('vhost_vec_avx.c')
> > +                endif
> > +        endif
> > +endif
> 
> Not a Meson expert, but wonder how I can disable CC_AVX512_SUPPORT.
> I checked the DPDK doc, but I could not find how to pass -mno-avx512f to
> the machine_args.

Hi Maxime,
By now the -mno-avx512f flag is set only if the binutils check script found issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90028.
So the AVX512 code will be built in whenever the compiler supports it. An alternative would be to introduce a new option in the meson build, as sketched below.
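
Purely to illustrate that alternative (the option name is hypothetical,
nothing in this series defines it):

# meson_options.txt (hypothetical)
option('vhost_avx512', type: 'boolean', value: true,
	description: 'Build the AVX512 vhost data path')

# lib/librte_vhost/meson.build, gating the existing block
if arch_subdir == 'x86' and get_option('vhost_avx512')
	# the existing -mavx512f/-mavx512vl/-mavx512bw checks and the
	# static_library() call stay here unchanged
endif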

Thanks,
Marvin

> 
> > +
> >  if (toolchain == 'gcc' and cc.version().version_compare('>=8.3.0'))
> >  	cflags += '-DVHOST_GCC_UNROLL_PRAGMA'
> >  elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0'))
> > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> > index 4a81f18f01..fc7daf2145 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -1124,4 +1124,12 @@ virtio_dev_pktmbuf_alloc(struct virtio_net
> *dev, struct rte_mempool *mp,
> >  	return NULL;
> >  }
> >
> > +int
> > +vhost_reserve_avail_batch_packed_avx(struct virtio_net *dev,
> > +				 struct vhost_virtqueue *vq,
> > +				 struct rte_mempool *mbuf_pool,
> > +				 struct rte_mbuf **pkts,
> > +				 uint16_t avail_idx,
> > +				 uintptr_t *desc_addrs,
> > +				 uint16_t *ids);
> >  #endif /* _VHOST_NET_CDEV_H_ */
> > diff --git a/lib/librte_vhost/vhost_vec_avx.c
> b/lib/librte_vhost/vhost_vec_avx.c
> > new file mode 100644
> > index 0000000000..dc5322d002
> > --- /dev/null
> > +++ b/lib/librte_vhost/vhost_vec_avx.c
> 
> For consistency it should be prefixed with virtio_net, not vhost.
> 
> > @@ -0,0 +1,181 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2010-2016 Intel Corporation
> > + */
> > +#include <stdint.h>
> > +
> > +#include "vhost.h"
> > +
> > +#define BYTE_SIZE 8
> > +/* reference count offset in mbuf rearm data */
> > +#define REFCNT_BITS_OFFSET ((offsetof(struct rte_mbuf, refcnt) - \
> > +	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
> > +/* segment number offset in mbuf rearm data */
> > +#define SEG_NUM_BITS_OFFSET ((offsetof(struct rte_mbuf, nb_segs) - \
> > +	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
> > +
> > +/* default rearm data */
> > +#define DEFAULT_REARM_DATA (1ULL << SEG_NUM_BITS_OFFSET | \
> > +	1ULL << REFCNT_BITS_OFFSET)
> > +
> > +#define DESC_FLAGS_SHORT_OFFSET (offsetof(struct vring_packed_desc,
> flags) / \
> > +	sizeof(uint16_t))
> > +
> > +#define DESC_FLAGS_SHORT_SIZE (sizeof(struct vring_packed_desc) / \
> > +	sizeof(uint16_t))
> > +#define BATCH_FLAGS_MASK (1 << DESC_FLAGS_SHORT_OFFSET | \
> > +	1 << (DESC_FLAGS_SHORT_OFFSET + DESC_FLAGS_SHORT_SIZE) | \
> > +	1 << (DESC_FLAGS_SHORT_OFFSET + DESC_FLAGS_SHORT_SIZE * 2)  |
> \
> > +	1 << (DESC_FLAGS_SHORT_OFFSET + DESC_FLAGS_SHORT_SIZE * 3))
> > +
> > +#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
> > +	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
> > +
> > +#define PACKED_FLAGS_MASK ((0ULL | VRING_DESC_F_AVAIL |
> VRING_DESC_F_USED) \
> > +	<< FLAGS_BITS_OFFSET)
> > +#define PACKED_AVAIL_FLAG ((0ULL | VRING_DESC_F_AVAIL) <<
> FLAGS_BITS_OFFSET)
> > +#define PACKED_AVAIL_FLAG_WRAP ((0ULL | VRING_DESC_F_USED) << \
> > +	FLAGS_BITS_OFFSET)
> > +
> > +#define DESC_FLAGS_POS 0xaa
> > +#define MBUF_LENS_POS 0x6666
> > +
> > +int
> > +vhost_reserve_avail_batch_packed_avx(struct virtio_net *dev,
> > +				 struct vhost_virtqueue *vq,
> > +				 struct rte_mempool *mbuf_pool,
> > +				 struct rte_mbuf **pkts,
> > +				 uint16_t avail_idx,
> > +				 uintptr_t *desc_addrs,
> > +				 uint16_t *ids)
> > +{
> > +	struct vring_packed_desc *descs = vq->desc_packed;
> > +	uint32_t descs_status;
> > +	void *desc_addr;
> > +	uint16_t i;
> > +	uint8_t cmp_low, cmp_high, cmp_result;
> > +	uint64_t lens[PACKED_BATCH_SIZE];
> > +	struct virtio_net_hdr *hdr;
> > +
> > +	if (unlikely(avail_idx & PACKED_BATCH_MASK))
> > +		return -1;
> > +
> > +	/* load 4 descs */
> > +	desc_addr = &vq->desc_packed[avail_idx];
> > +	__m512i desc_vec = _mm512_loadu_si512(desc_addr);
> 
> Unlike split ring, packed ring specification does not mandate the ring
> size to be a power of two. So checking  avail_idx is aligned on 64 bytes
> is not enough given a descriptor is 16 bytes.
> 
> You need to also check against ring size to prevent out of bounds
> accesses.
> 
> I see the non vectorized batch processing you introduced in v19.11 also
> do that wrong assumption. Please fix it.
> 
> Also, I wonder whether it is assumed that &vq->desc_packed[avail_idx];
> is aligned on a cache-line. Meaning, does below intrinsics have such a
> requirement?
> 

Got it, the packed ring size may be an arbitrary number. In v19.11 the batch handling function already checks that the available index is not oversized.
I forgot that check in the vectorized path, will fix it in the next release.

In the vectorized path, the load function _mm512_loadu_si512 does not need cache-aligned memory, so no special alignment requirement is needed.

> > +	/* burst check four status */
> > +	__m512i avail_flag_vec;
> > +	if (vq->avail_wrap_counter)
> > +#if defined(RTE_ARCH_I686)
> > +		avail_flag_vec = _mm512_set4_epi64(PACKED_AVAIL_FLAG,
> 0x0,
> > +					PACKED_FLAGS_MASK, 0x0);
> > +#else
> > +		avail_flag_vec =
> _mm512_maskz_set1_epi64(DESC_FLAGS_POS,
> > +					PACKED_AVAIL_FLAG);
> > +
> > +#endif
> > +	else
> > +#if defined(RTE_ARCH_I686)
> > +		avail_flag_vec =
> _mm512_set4_epi64(PACKED_AVAIL_FLAG_WRAP,
> > +					0x0, PACKED_AVAIL_FLAG_WRAP,
> 0x0);
> > +#else
> > +		avail_flag_vec =
> _mm512_maskz_set1_epi64(DESC_FLAGS_POS,
> > +					PACKED_AVAIL_FLAG_WRAP);
> > +#endif
> > +
> > +	descs_status = _mm512_cmp_epu16_mask(desc_vec, avail_flag_vec,
> > +		_MM_CMPINT_NE);
> > +	if (descs_status & BATCH_FLAGS_MASK)
> > +		return -1;
> > +
> > +	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) {
> > +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> > +			uint64_t size = (uint64_t)descs[avail_idx + i].len;
> > +			desc_addrs[i] = __vhost_iova_to_vva(dev, vq,
> > +				descs[avail_idx + i].addr, &size,
> > +				VHOST_ACCESS_RO);
> > +
> > +			if (!desc_addrs[i])
> > +				goto free_buf;
> > +			lens[i] = descs[avail_idx + i].len;
> > +			rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
> > +
> > +			pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool,
> > +					lens[i]);
> > +			if (!pkts[i])
> > +				goto free_buf;
> > +		}
> > +	} else {
> > +		/* check buffer fit into one region & translate address */
> > +		__m512i regions_low_addrs =
> > +			_mm512_loadu_si512((void *)&dev-
> >regions_low_addrs);
> > +		__m512i regions_high_addrs =
> > +			_mm512_loadu_si512((void *)&dev-
> >regions_high_addrs);
> > +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> > +			uint64_t addr_low = descs[avail_idx + i].addr;
> > +			uint64_t addr_high = addr_low +
> > +						descs[avail_idx + i].len;
> > +			__m512i low_addr_vec =
> _mm512_set1_epi64(addr_low);
> > +			__m512i high_addr_vec =
> _mm512_set1_epi64(addr_high);
> > +
> > +			cmp_low =
> _mm512_cmp_epi64_mask(low_addr_vec,
> > +					regions_low_addrs,
> _MM_CMPINT_NLT);
> > +			cmp_high =
> _mm512_cmp_epi64_mask(high_addr_vec,
> > +					regions_high_addrs,
> _MM_CMPINT_LT);
> > +			cmp_result = cmp_low & cmp_high;
> > +			int index = __builtin_ctz(cmp_result);
> > +			if (unlikely((uint32_t)index >= dev->mem->nregions))
> > +				goto free_buf;
> > +
> > +			desc_addrs[i] = addr_low +
> > +				dev->mem->regions[index].host_user_addr -
> > +				dev->mem->regions[index].guest_phys_addr;
> > +			lens[i] = descs[avail_idx + i].len;
> > +			rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
> > +
> > +			pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool,
> > +					lens[i]);
> > +			if (!pkts[i])
> > +				goto free_buf;
> > +		}
> > +	}
> > +
> > +	if (virtio_net_with_host_offload(dev)) {
> > +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> > +			hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
> > +			vhost_dequeue_offload(hdr, pkts[i]);
> > +		}
> > +	}
> > +
> > +	if (unlikely(virtio_net_is_inorder(dev))) {
> > +		ids[PACKED_BATCH_SIZE - 1] =
> > +			descs[avail_idx + PACKED_BATCH_SIZE - 1].id;
> 
> Isn't in-order a likely case? Maybe just remove the unlikely.
> 
The in-order option depends on feature negotiation, will remove the unlikely.

> > +	} else {
> > +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
> > +			ids[i] = descs[avail_idx + i].id;
> > +	}
> > +
> > +	uint64_t addrs[PACKED_BATCH_SIZE << 1];
> > +	/* store mbuf data_len, pkt_len */
> > +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> > +		addrs[i << 1] = (uint64_t)pkts[i]->rx_descriptor_fields1;
> > +		addrs[(i << 1) + 1] = (uint64_t)pkts[i]->rx_descriptor_fields1
> > +					+ sizeof(uint64_t);
> > +	}
> > +
> > +	/* save pkt_len and data_len into mbufs */
> > +	__m512i value_vec =
> _mm512_maskz_shuffle_epi32(MBUF_LENS_POS, desc_vec,
> > +					0xAA);
> > +	__m512i offsets_vec = _mm512_maskz_set1_epi32(MBUF_LENS_POS,
> > +					(uint32_t)-12);
> > +	value_vec = _mm512_add_epi32(value_vec, offsets_vec);
> > +	__m512i vindex = _mm512_loadu_si512((void *)addrs);
> > +	_mm512_i64scatter_epi64(0, vindex, value_vec, 1);
> > +
> > +	return 0;
> > +free_buf:
> > +	for (i = 0; i < PACKED_BATCH_SIZE; i++)
> > +		rte_pktmbuf_free(pkts[i]);
> > +
> > +	return -1;
> > +}
> > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> > index 6107662685..e4d2e2e7d6 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -2249,6 +2249,28 @@ vhost_reserve_avail_batch_packed(struct
> virtio_net *dev,
> >  	return -1;
> >  }
> >
> > +static __rte_always_inline int
> > +vhost_handle_avail_batch_packed(struct virtio_net *dev,
> > +				 struct vhost_virtqueue *vq,
> > +				 struct rte_mempool *mbuf_pool,
> > +				 struct rte_mbuf **pkts,
> > +				 uint16_t avail_idx,
> > +				 uintptr_t *desc_addrs,
> > +				 uint16_t *ids)
> > +{
> > +	if (unlikely(dev->vectorized))
> > +#ifdef CC_AVX512_SUPPORT
> > +		return vhost_reserve_avail_batch_packed_avx(dev, vq,
> mbuf_pool,
> > +				pkts, avail_idx, desc_addrs, ids);
> > +#else
> > +		return vhost_reserve_avail_batch_packed(dev, vq,
> mbuf_pool,
> > +				pkts, avail_idx, desc_addrs, ids);
> > +
> > +#endif
> > +	return vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
> > +			avail_idx, desc_addrs, ids);
> > +}
> 
> 
> It should be as below to not have any performance impact when
> CC_AVX512_SUPPORT is not set:
> 
> #ifdef CC_AVX512_SUPPORT
> 	if (unlikely(dev->vectorized))
> 		return vhost_reserve_avail_batch_packed_avx(dev, vq,
> mbuf_pool,
> 			pkts, avail_idx, desc_addrs, ids);
> #else
> 	return vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
> 		avail_idx, desc_addrs, ids);
> #endif

Got it, will change in the next release.

> > +
> >  static __rte_always_inline int
> >  virtio_dev_tx_batch_packed(struct virtio_net *dev,
> >  			   struct vhost_virtqueue *vq,
> > @@ -2261,8 +2283,9 @@ virtio_dev_tx_batch_packed(struct virtio_net
> *dev,
> >  	uint16_t ids[PACKED_BATCH_SIZE];
> >  	uint16_t i;
> >
> > -	if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
> > -					     avail_idx, desc_addrs, ids))
> > +
> > +	if (vhost_handle_avail_batch_packed(dev, vq, mbuf_pool, pkts,
> > +		avail_idx, desc_addrs, ids))
> >  		return -1;
> >
> >  	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
> >
  
Marvin Liu Oct. 9, 2020, 7:59 a.m. UTC | #4
> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Tuesday, October 6, 2020 11:19 PM
> To: Liu, Yong <yong.liu@intel.com>; Xia, Chenbo <chenbo.xia@intel.com>;
> Wang, Zhihong <zhihong.wang@intel.com>
> Cc: dev@dpdk.org
> Subject: Re: [PATCH v2 4/5] vhost: add packed ring vectorized dequeue
> 
> 
> 
> On 9/21/20 8:48 AM, Marvin Liu wrote:
> > Optimize vhost packed ring dequeue path with SIMD instructions. Four
> > descriptors status check and writeback are batched handled with AVX512
> > instructions. Address translation operations are also accelerated by
> > AVX512 instructions.
> >
> > If platform or compiler not support vectorization, will fallback to
> > default path.
> >
> > Signed-off-by: Marvin Liu <yong.liu@intel.com>
> >
> > diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build
> > index cc9aa65c67..c1481802d7 100644
> > --- a/lib/librte_vhost/meson.build
> > +++ b/lib/librte_vhost/meson.build
> > @@ -8,6 +8,22 @@ endif
> >  if has_libnuma == 1
> >  	dpdk_conf.set10('RTE_LIBRTE_VHOST_NUMA', true)
> >  endif
> > +
> > +if arch_subdir == 'x86'
> > +        if not machine_args.contains('-mno-avx512f')
> > +                if cc.has_argument('-mavx512f') and cc.has_argument('-
> mavx512vl') and cc.has_argument('-mavx512bw')
> > +                        cflags += ['-DCC_AVX512_SUPPORT']
> > +                        vhost_avx512_lib = static_library('vhost_avx512_lib',
> > +                                              'vhost_vec_avx.c',
> > +                                              dependencies: [static_rte_eal,
> static_rte_mempool,
> > +                                                  static_rte_mbuf, static_rte_ethdev,
> static_rte_net],
> > +                                              include_directories: includes,
> > +                                              c_args: [cflags, '-mavx512f', '-mavx512bw', '-
> mavx512vl'])
> > +                        objs += vhost_avx512_lib.extract_objects('vhost_vec_avx.c')
> > +                endif
> > +        endif
> > +endif
> > +
> >  if (toolchain == 'gcc' and cc.version().version_compare('>=8.3.0'))
> >  	cflags += '-DVHOST_GCC_UNROLL_PRAGMA'
> >  elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0'))
> > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> > index 4a81f18f01..fc7daf2145 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -1124,4 +1124,12 @@ virtio_dev_pktmbuf_alloc(struct virtio_net
> *dev, struct rte_mempool *mp,
> >  	return NULL;
> >  }
> >
> > +int
> > +vhost_reserve_avail_batch_packed_avx(struct virtio_net *dev,
> > +				 struct vhost_virtqueue *vq,
> > +				 struct rte_mempool *mbuf_pool,
> > +				 struct rte_mbuf **pkts,
> > +				 uint16_t avail_idx,
> > +				 uintptr_t *desc_addrs,
> > +				 uint16_t *ids);
> >  #endif /* _VHOST_NET_CDEV_H_ */
> > diff --git a/lib/librte_vhost/vhost_vec_avx.c
> b/lib/librte_vhost/vhost_vec_avx.c
> > new file mode 100644
> > index 0000000000..dc5322d002
> > --- /dev/null
> > +++ b/lib/librte_vhost/vhost_vec_avx.c
> > @@ -0,0 +1,181 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2010-2016 Intel Corporation
> > + */
> > +#include <stdint.h>
> > +
> > +#include "vhost.h"
> > +
> > +#define BYTE_SIZE 8
> > +/* reference count offset in mbuf rearm data */
> > +#define REFCNT_BITS_OFFSET ((offsetof(struct rte_mbuf, refcnt) - \
> > +	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
> > +/* segment number offset in mbuf rearm data */
> > +#define SEG_NUM_BITS_OFFSET ((offsetof(struct rte_mbuf, nb_segs) - \
> > +	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
> > +
> > +/* default rearm data */
> > +#define DEFAULT_REARM_DATA (1ULL << SEG_NUM_BITS_OFFSET | \
> > +	1ULL << REFCNT_BITS_OFFSET)
> > +
> > +#define DESC_FLAGS_SHORT_OFFSET (offsetof(struct vring_packed_desc,
> flags) / \
> > +	sizeof(uint16_t))
> > +
> > +#define DESC_FLAGS_SHORT_SIZE (sizeof(struct vring_packed_desc) / \
> > +	sizeof(uint16_t))
> > +#define BATCH_FLAGS_MASK (1 << DESC_FLAGS_SHORT_OFFSET | \
> > +	1 << (DESC_FLAGS_SHORT_OFFSET + DESC_FLAGS_SHORT_SIZE) | \
> > +	1 << (DESC_FLAGS_SHORT_OFFSET + DESC_FLAGS_SHORT_SIZE * 2)  |
> \
> > +	1 << (DESC_FLAGS_SHORT_OFFSET + DESC_FLAGS_SHORT_SIZE * 3))
> > +
> > +#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
> > +	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
> > +
> > +#define PACKED_FLAGS_MASK ((0ULL | VRING_DESC_F_AVAIL |
> VRING_DESC_F_USED) \
> > +	<< FLAGS_BITS_OFFSET)
> > +#define PACKED_AVAIL_FLAG ((0ULL | VRING_DESC_F_AVAIL) <<
> FLAGS_BITS_OFFSET)
> > +#define PACKED_AVAIL_FLAG_WRAP ((0ULL | VRING_DESC_F_USED) << \
> > +	FLAGS_BITS_OFFSET)
> > +
> > +#define DESC_FLAGS_POS 0xaa
> > +#define MBUF_LENS_POS 0x6666
> > +
> > +int
> > +vhost_reserve_avail_batch_packed_avx(struct virtio_net *dev,
> > +				 struct vhost_virtqueue *vq,
> > +				 struct rte_mempool *mbuf_pool,
> > +				 struct rte_mbuf **pkts,
> > +				 uint16_t avail_idx,
> > +				 uintptr_t *desc_addrs,
> > +				 uint16_t *ids)
> > +{
> > +	struct vring_packed_desc *descs = vq->desc_packed;
> > +	uint32_t descs_status;
> > +	void *desc_addr;
> > +	uint16_t i;
> > +	uint8_t cmp_low, cmp_high, cmp_result;
> > +	uint64_t lens[PACKED_BATCH_SIZE];
> > +	struct virtio_net_hdr *hdr;
> > +
> > +	if (unlikely(avail_idx & PACKED_BATCH_MASK))
> > +		return -1;
> > +
> > +	/* load 4 descs */
> > +	desc_addr = &vq->desc_packed[avail_idx];
> > +	__m512i desc_vec = _mm512_loadu_si512(desc_addr);
> > +
> > +	/* burst check four status */
> > +	__m512i avail_flag_vec;
> > +	if (vq->avail_wrap_counter)
> > +#if defined(RTE_ARCH_I686)
> > +		avail_flag_vec = _mm512_set4_epi64(PACKED_AVAIL_FLAG,
> 0x0,
> > +					PACKED_FLAGS_MASK, 0x0);
> > +#else
> > +		avail_flag_vec =
> _mm512_maskz_set1_epi64(DESC_FLAGS_POS,
> > +					PACKED_AVAIL_FLAG);
> > +
> > +#endif
> > +	else
> > +#if defined(RTE_ARCH_I686)
> > +		avail_flag_vec =
> _mm512_set4_epi64(PACKED_AVAIL_FLAG_WRAP,
> > +					0x0, PACKED_AVAIL_FLAG_WRAP,
> 0x0);
> > +#else
> > +		avail_flag_vec =
> _mm512_maskz_set1_epi64(DESC_FLAGS_POS,
> > +					PACKED_AVAIL_FLAG_WRAP);
> > +#endif
> > +
> > +	descs_status = _mm512_cmp_epu16_mask(desc_vec, avail_flag_vec,
> > +		_MM_CMPINT_NE);
> > +	if (descs_status & BATCH_FLAGS_MASK)
> > +		return -1;
> > +
> 
> 
> Also, please try to factorize code to avoid duplication between Tx and
> Rx paths for desc address translation:

Hi Maxime,
I have factorized the translation function in the Rx and Tx paths, but there is a slight performance drop after the change.
Since the vectorized datapath is focused on performance, I'd like to keep the current implementation.
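
For reference, the factored helper that was tried could look roughly like the
sketch below; it only illustrates the idea and is not the exact code that was
benchmarked (the perm argument is an assumption to cover both Rx and Tx):

static __rte_always_inline int
vhost_batch_translate_descs(struct virtio_net *dev, struct vhost_virtqueue *vq,
			struct vring_packed_desc *descs, uint16_t avail_idx,
			uintptr_t *desc_addrs, uint64_t *lens, uint8_t perm)
{
	uint16_t i;
	uint8_t cmp_low, cmp_high, cmp_result;

	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) {
		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
			uint64_t size = (uint64_t)descs[avail_idx + i].len;

			/* per-descriptor IOVA -> VVA translation */
			desc_addrs[i] = __vhost_iova_to_vva(dev, vq,
					descs[avail_idx + i].addr, &size, perm);
			if (!desc_addrs[i])
				return -1;
			lens[i] = descs[avail_idx + i].len;
		}
	} else {
		/* check buffers fit into one region & translate addresses */
		__m512i regions_low_addrs =
			_mm512_loadu_si512((void *)&dev->regions_low_addrs);
		__m512i regions_high_addrs =
			_mm512_loadu_si512((void *)&dev->regions_high_addrs);

		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
			uint64_t addr_low = descs[avail_idx + i].addr;
			uint64_t addr_high = addr_low +
						descs[avail_idx + i].len;
			__m512i low_addr_vec = _mm512_set1_epi64(addr_low);
			__m512i high_addr_vec = _mm512_set1_epi64(addr_high);
			int index;

			cmp_low = _mm512_cmp_epi64_mask(low_addr_vec,
					regions_low_addrs, _MM_CMPINT_NLT);
			cmp_high = _mm512_cmp_epi64_mask(high_addr_vec,
					regions_high_addrs, _MM_CMPINT_LT);
			cmp_result = cmp_low & cmp_high;
			index = __builtin_ctz(cmp_result);
			if (unlikely((uint32_t)index >= dev->mem->nregions))
				return -1;

			desc_addrs[i] = addr_low +
				dev->mem->regions[index].host_user_addr -
				dev->mem->regions[index].guest_phys_addr;
			lens[i] = descs[avail_idx + i].len;
		}
	}

	return 0;
}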

Thanks,
Marvin

> > +	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) {
> > +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> > +			uint64_t size = (uint64_t)descs[avail_idx + i].len;
> > +			desc_addrs[i] = __vhost_iova_to_vva(dev, vq,
> > +				descs[avail_idx + i].addr, &size,
> > +				VHOST_ACCESS_RO);
> > +
> > +			if (!desc_addrs[i])
> > +				goto free_buf;
> > +			lens[i] = descs[avail_idx + i].len;
> > +			rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
> > +
> > +			pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool,
> > +					lens[i]);
> > +			if (!pkts[i])
> > +				goto free_buf;
> > +		}
> > +	} else {
> > +		/* check buffer fit into one region & translate address */
> > +		__m512i regions_low_addrs =
> > +			_mm512_loadu_si512((void *)&dev-
> >regions_low_addrs);
> > +		__m512i regions_high_addrs =
> > +			_mm512_loadu_si512((void *)&dev-
> >regions_high_addrs);
> > +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> > +			uint64_t addr_low = descs[avail_idx + i].addr;
> > +			uint64_t addr_high = addr_low +
> > +						descs[avail_idx + i].len;
> > +			__m512i low_addr_vec =
> _mm512_set1_epi64(addr_low);
> > +			__m512i high_addr_vec =
> _mm512_set1_epi64(addr_high);
> > +
> > +			cmp_low =
> _mm512_cmp_epi64_mask(low_addr_vec,
> > +					regions_low_addrs,
> _MM_CMPINT_NLT);
> > +			cmp_high =
> _mm512_cmp_epi64_mask(high_addr_vec,
> > +					regions_high_addrs,
> _MM_CMPINT_LT);
> > +			cmp_result = cmp_low & cmp_high;
> > +			int index = __builtin_ctz(cmp_result);
> > +			if (unlikely((uint32_t)index >= dev->mem->nregions))
> > +				goto free_buf;
> > +
> > +			desc_addrs[i] = addr_low +
> > +				dev->mem->regions[index].host_user_addr -
> > +				dev->mem->regions[index].guest_phys_addr;
> > +			lens[i] = descs[avail_idx + i].len;
> > +			rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
> > +
> > +			pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool,
> > +					lens[i]);
> > +			if (!pkts[i])
> > +				goto free_buf;
> > +		}
> > +	}
> > +
> > +	if (virtio_net_with_host_offload(dev)) {
> > +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> > +			hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
> > +			vhost_dequeue_offload(hdr, pkts[i]);
> > +		}
> > +	}
> > +
> > +	if (unlikely(virtio_net_is_inorder(dev))) {
> > +		ids[PACKED_BATCH_SIZE - 1] =
> > +			descs[avail_idx + PACKED_BATCH_SIZE - 1].id;
> > +	} else {
> > +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
> > +			ids[i] = descs[avail_idx + i].id;
> > +	}
> > +
> > +	uint64_t addrs[PACKED_BATCH_SIZE << 1];
> > +	/* store mbuf data_len, pkt_len */
> > +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> > +		addrs[i << 1] = (uint64_t)pkts[i]->rx_descriptor_fields1;
> > +		addrs[(i << 1) + 1] = (uint64_t)pkts[i]->rx_descriptor_fields1
> > +					+ sizeof(uint64_t);
> > +	}
> > +
> > +	/* save pkt_len and data_len into mbufs */
> > +	__m512i value_vec =
> _mm512_maskz_shuffle_epi32(MBUF_LENS_POS, desc_vec,
> > +					0xAA);
> > +	__m512i offsets_vec = _mm512_maskz_set1_epi32(MBUF_LENS_POS,
> > +					(uint32_t)-12);
> > +	value_vec = _mm512_add_epi32(value_vec, offsets_vec);
> > +	__m512i vindex = _mm512_loadu_si512((void *)addrs);
> > +	_mm512_i64scatter_epi64(0, vindex, value_vec, 1);
> > +
> > +	return 0;
> > +free_buf:
> > +	for (i = 0; i < PACKED_BATCH_SIZE; i++)
> > +		rte_pktmbuf_free(pkts[i]);
> > +
> > +	return -1;
> > +}
> > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> > index 6107662685..e4d2e2e7d6 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -2249,6 +2249,28 @@ vhost_reserve_avail_batch_packed(struct
> virtio_net *dev,
> >  	return -1;
> >  }
> >
> > +static __rte_always_inline int
> > +vhost_handle_avail_batch_packed(struct virtio_net *dev,
> > +				 struct vhost_virtqueue *vq,
> > +				 struct rte_mempool *mbuf_pool,
> > +				 struct rte_mbuf **pkts,
> > +				 uint16_t avail_idx,
> > +				 uintptr_t *desc_addrs,
> > +				 uint16_t *ids)
> > +{
> > +	if (unlikely(dev->vectorized))
> > +#ifdef CC_AVX512_SUPPORT
> > +		return vhost_reserve_avail_batch_packed_avx(dev, vq,
> mbuf_pool,
> > +				pkts, avail_idx, desc_addrs, ids);
> > +#else
> > +		return vhost_reserve_avail_batch_packed(dev, vq,
> mbuf_pool,
> > +				pkts, avail_idx, desc_addrs, ids);
> > +
> > +#endif
> > +	return vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
> > +			avail_idx, desc_addrs, ids);
> > +}
> > +
> >  static __rte_always_inline int
> >  virtio_dev_tx_batch_packed(struct virtio_net *dev,
> >  			   struct vhost_virtqueue *vq,
> > @@ -2261,8 +2283,9 @@ virtio_dev_tx_batch_packed(struct virtio_net
> *dev,
> >  	uint16_t ids[PACKED_BATCH_SIZE];
> >  	uint16_t i;
> >
> > -	if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
> > -					     avail_idx, desc_addrs, ids))
> > +
> > +	if (vhost_handle_avail_batch_packed(dev, vq, mbuf_pool, pkts,
> > +		avail_idx, desc_addrs, ids))
> >  		return -1;
> >
> >  	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
> >
  

Patch

diff --git a/lib/librte_vhost/meson.build b/lib/librte_vhost/meson.build
index cc9aa65c67..c1481802d7 100644
--- a/lib/librte_vhost/meson.build
+++ b/lib/librte_vhost/meson.build
@@ -8,6 +8,22 @@  endif
 if has_libnuma == 1
 	dpdk_conf.set10('RTE_LIBRTE_VHOST_NUMA', true)
 endif
+
+if arch_subdir == 'x86'
+        if not machine_args.contains('-mno-avx512f')
+                if cc.has_argument('-mavx512f') and cc.has_argument('-mavx512vl') and cc.has_argument('-mavx512bw')
+                        cflags += ['-DCC_AVX512_SUPPORT']
+                        vhost_avx512_lib = static_library('vhost_avx512_lib',
+                                              'vhost_vec_avx.c',
+                                              dependencies: [static_rte_eal, static_rte_mempool,
+                                                  static_rte_mbuf, static_rte_ethdev, static_rte_net],
+                                              include_directories: includes,
+                                              c_args: [cflags, '-mavx512f', '-mavx512bw', '-mavx512vl'])
+                        objs += vhost_avx512_lib.extract_objects('vhost_vec_avx.c')
+                endif
+        endif
+endif
+
 if (toolchain == 'gcc' and cc.version().version_compare('>=8.3.0'))
 	cflags += '-DVHOST_GCC_UNROLL_PRAGMA'
 elif (toolchain == 'clang' and cc.version().version_compare('>=3.7.0'))
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 4a81f18f01..fc7daf2145 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -1124,4 +1124,12 @@  virtio_dev_pktmbuf_alloc(struct virtio_net *dev, struct rte_mempool *mp,
 	return NULL;
 }
 
+int
+vhost_reserve_avail_batch_packed_avx(struct virtio_net *dev,
+				 struct vhost_virtqueue *vq,
+				 struct rte_mempool *mbuf_pool,
+				 struct rte_mbuf **pkts,
+				 uint16_t avail_idx,
+				 uintptr_t *desc_addrs,
+				 uint16_t *ids);
 #endif /* _VHOST_NET_CDEV_H_ */
diff --git a/lib/librte_vhost/vhost_vec_avx.c b/lib/librte_vhost/vhost_vec_avx.c
new file mode 100644
index 0000000000..dc5322d002
--- /dev/null
+++ b/lib/librte_vhost/vhost_vec_avx.c
@@ -0,0 +1,181 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ */
+#include <stdint.h>
+
+#include "vhost.h"
+
+#define BYTE_SIZE 8
+/* reference count offset in mbuf rearm data */
+#define REFCNT_BITS_OFFSET ((offsetof(struct rte_mbuf, refcnt) - \
+	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
+/* segment number offset in mbuf rearm data */
+#define SEG_NUM_BITS_OFFSET ((offsetof(struct rte_mbuf, nb_segs) - \
+	offsetof(struct rte_mbuf, rearm_data)) * BYTE_SIZE)
+
+/* default rearm data */
+#define DEFAULT_REARM_DATA (1ULL << SEG_NUM_BITS_OFFSET | \
+	1ULL << REFCNT_BITS_OFFSET)
+
+#define DESC_FLAGS_SHORT_OFFSET (offsetof(struct vring_packed_desc, flags) / \
+	sizeof(uint16_t))
+
+#define DESC_FLAGS_SHORT_SIZE (sizeof(struct vring_packed_desc) / \
+	sizeof(uint16_t))
+#define BATCH_FLAGS_MASK (1 << DESC_FLAGS_SHORT_OFFSET | \
+	1 << (DESC_FLAGS_SHORT_OFFSET + DESC_FLAGS_SHORT_SIZE) | \
+	1 << (DESC_FLAGS_SHORT_OFFSET + DESC_FLAGS_SHORT_SIZE * 2)  | \
+	1 << (DESC_FLAGS_SHORT_OFFSET + DESC_FLAGS_SHORT_SIZE * 3))
+
+#define FLAGS_BITS_OFFSET ((offsetof(struct vring_packed_desc, flags) - \
+	offsetof(struct vring_packed_desc, len)) * BYTE_SIZE)
+
+#define PACKED_FLAGS_MASK ((0ULL | VRING_DESC_F_AVAIL | VRING_DESC_F_USED) \
+	<< FLAGS_BITS_OFFSET)
+#define PACKED_AVAIL_FLAG ((0ULL | VRING_DESC_F_AVAIL) << FLAGS_BITS_OFFSET)
+#define PACKED_AVAIL_FLAG_WRAP ((0ULL | VRING_DESC_F_USED) << \
+	FLAGS_BITS_OFFSET)
+
+#define DESC_FLAGS_POS 0xaa
+#define MBUF_LENS_POS 0x6666
+
+int
+vhost_reserve_avail_batch_packed_avx(struct virtio_net *dev,
+				 struct vhost_virtqueue *vq,
+				 struct rte_mempool *mbuf_pool,
+				 struct rte_mbuf **pkts,
+				 uint16_t avail_idx,
+				 uintptr_t *desc_addrs,
+				 uint16_t *ids)
+{
+	struct vring_packed_desc *descs = vq->desc_packed;
+	uint32_t descs_status;
+	void *desc_addr;
+	uint16_t i;
+	uint8_t cmp_low, cmp_high, cmp_result;
+	uint64_t lens[PACKED_BATCH_SIZE];
+	struct virtio_net_hdr *hdr;
+
+	if (unlikely(avail_idx & PACKED_BATCH_MASK))
+		return -1;
+
+	/* load 4 descs */
+	desc_addr = &vq->desc_packed[avail_idx];
+	__m512i desc_vec = _mm512_loadu_si512(desc_addr);
+
+	/* burst check four status */
+	__m512i avail_flag_vec;
+	if (vq->avail_wrap_counter)
+#if defined(RTE_ARCH_I686)
+		avail_flag_vec = _mm512_set4_epi64(PACKED_AVAIL_FLAG, 0x0,
+					PACKED_FLAGS_MASK, 0x0);
+#else
+		avail_flag_vec = _mm512_maskz_set1_epi64(DESC_FLAGS_POS,
+					PACKED_AVAIL_FLAG);
+
+#endif
+	else
+#if defined(RTE_ARCH_I686)
+		avail_flag_vec = _mm512_set4_epi64(PACKED_AVAIL_FLAG_WRAP,
+					0x0, PACKED_AVAIL_FLAG_WRAP, 0x0);
+#else
+		avail_flag_vec = _mm512_maskz_set1_epi64(DESC_FLAGS_POS,
+					PACKED_AVAIL_FLAG_WRAP);
+#endif
+
+	descs_status = _mm512_cmp_epu16_mask(desc_vec, avail_flag_vec,
+		_MM_CMPINT_NE);
+	if (descs_status & BATCH_FLAGS_MASK)
+		return -1;
+
+	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) {
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			uint64_t size = (uint64_t)descs[avail_idx + i].len;
+			desc_addrs[i] = __vhost_iova_to_vva(dev, vq,
+				descs[avail_idx + i].addr, &size,
+				VHOST_ACCESS_RO);
+
+			if (!desc_addrs[i])
+				goto free_buf;
+			lens[i] = descs[avail_idx + i].len;
+			rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
+
+			pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool,
+					lens[i]);
+			if (!pkts[i])
+				goto free_buf;
+		}
+	} else {
+		/* check buffer fit into one region & translate address */
+		__m512i regions_low_addrs =
+			_mm512_loadu_si512((void *)&dev->regions_low_addrs);
+		__m512i regions_high_addrs =
+			_mm512_loadu_si512((void *)&dev->regions_high_addrs);
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			uint64_t addr_low = descs[avail_idx + i].addr;
+			uint64_t addr_high = addr_low +
+						descs[avail_idx + i].len;
+			__m512i low_addr_vec = _mm512_set1_epi64(addr_low);
+			__m512i high_addr_vec = _mm512_set1_epi64(addr_high);
+
+			cmp_low = _mm512_cmp_epi64_mask(low_addr_vec,
+					regions_low_addrs, _MM_CMPINT_NLT);
+			cmp_high = _mm512_cmp_epi64_mask(high_addr_vec,
+					regions_high_addrs, _MM_CMPINT_LT);
+			cmp_result = cmp_low & cmp_high;
+			int index = __builtin_ctz(cmp_result);
+			if (unlikely((uint32_t)index >= dev->mem->nregions))
+				goto free_buf;
+
+			desc_addrs[i] = addr_low +
+				dev->mem->regions[index].host_user_addr -
+				dev->mem->regions[index].guest_phys_addr;
+			lens[i] = descs[avail_idx + i].len;
+			rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
+
+			pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool,
+					lens[i]);
+			if (!pkts[i])
+				goto free_buf;
+		}
+	}
+
+	if (virtio_net_with_host_offload(dev)) {
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
+			vhost_dequeue_offload(hdr, pkts[i]);
+		}
+	}
+
+	if (unlikely(virtio_net_is_inorder(dev))) {
+		ids[PACKED_BATCH_SIZE - 1] =
+			descs[avail_idx + PACKED_BATCH_SIZE - 1].id;
+	} else {
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+			ids[i] = descs[avail_idx + i].id;
+	}
+
+	uint64_t addrs[PACKED_BATCH_SIZE << 1];
+	/* store mbuf data_len, pkt_len */
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		addrs[i << 1] = (uint64_t)pkts[i]->rx_descriptor_fields1;
+		addrs[(i << 1) + 1] = (uint64_t)pkts[i]->rx_descriptor_fields1
+					+ sizeof(uint64_t);
+	}
+
+	/* save pkt_len and data_len into mbufs */
+	__m512i value_vec = _mm512_maskz_shuffle_epi32(MBUF_LENS_POS, desc_vec,
+					0xAA);
+	__m512i offsets_vec = _mm512_maskz_set1_epi32(MBUF_LENS_POS,
+					(uint32_t)-12);
+	value_vec = _mm512_add_epi32(value_vec, offsets_vec);
+	__m512i vindex = _mm512_loadu_si512((void *)addrs);
+	_mm512_i64scatter_epi64(0, vindex, value_vec, 1);
+
+	return 0;
+free_buf:
+	for (i = 0; i < PACKED_BATCH_SIZE; i++)
+		rte_pktmbuf_free(pkts[i]);
+
+	return -1;
+}
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 6107662685..e4d2e2e7d6 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -2249,6 +2249,28 @@  vhost_reserve_avail_batch_packed(struct virtio_net *dev,
 	return -1;
 }
 
+static __rte_always_inline int
+vhost_handle_avail_batch_packed(struct virtio_net *dev,
+				 struct vhost_virtqueue *vq,
+				 struct rte_mempool *mbuf_pool,
+				 struct rte_mbuf **pkts,
+				 uint16_t avail_idx,
+				 uintptr_t *desc_addrs,
+				 uint16_t *ids)
+{
+	if (unlikely(dev->vectorized))
+#ifdef CC_AVX512_SUPPORT
+		return vhost_reserve_avail_batch_packed_avx(dev, vq, mbuf_pool,
+				pkts, avail_idx, desc_addrs, ids);
+#else
+		return vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool,
+				pkts, avail_idx, desc_addrs, ids);
+
+#endif
+	return vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
+			avail_idx, desc_addrs, ids);
+}
+
 static __rte_always_inline int
 virtio_dev_tx_batch_packed(struct virtio_net *dev,
 			   struct vhost_virtqueue *vq,
@@ -2261,8 +2283,9 @@  virtio_dev_tx_batch_packed(struct virtio_net *dev,
 	uint16_t ids[PACKED_BATCH_SIZE];
 	uint16_t i;
 
-	if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
-					     avail_idx, desc_addrs, ids))
+
+	if (vhost_handle_avail_batch_packed(dev, vq, mbuf_pool, pkts,
+		avail_idx, desc_addrs, ids))
 		return -1;
 
 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)