[v1,1/2] vhost: introduce async data path registration API

Message ID 1591869725-13331-2-git-send-email-patrick.fu@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Maxime Coquelin
Series: introduce asynchronous data path for vhost

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-nxp-Performance success Performance Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-testing success Testing PASS
ci/Intel-compilation success Compilation OK

Commit Message

Patrick Fu June 11, 2020, 10:02 a.m. UTC
  From: Patrick <patrick.fu@intel.com>

This patch introduces registration/un-registration APIs
for the async data path, together with all required data
structures and DMA callback function prototypes.

Signed-off-by: Patrick <patrick.fu@intel.com>
---
 lib/librte_vhost/Makefile          |   3 +-
 lib/librte_vhost/rte_vhost.h       |   1 +
 lib/librte_vhost/rte_vhost_async.h | 134 +++++++++++++++++++++++++++++++++++++
 lib/librte_vhost/socket.c          |  20 ++++++
 lib/librte_vhost/vhost.c           |  74 +++++++++++++++++++-
 lib/librte_vhost/vhost.h           |  30 ++++++++-
 lib/librte_vhost/vhost_user.c      |  28 ++++++--
 7 files changed, 283 insertions(+), 7 deletions(-)
 create mode 100644 lib/librte_vhost/rte_vhost_async.h
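
As a quick orientation, here is a minimal sketch of how an application might consume this API, built only from the prototypes this patch declares in rte_vhost_async.h; the callback bodies, the my_* names, and the 256-byte threshold are illustrative assumptions, not part of the patch:

#include <stdint.h>

#include <rte_vhost_async.h>

/* Stub callbacks showing the expected signatures; a real implementation
 * would program a DMA engine instead of returning immediately. */
static int
my_transfer_data(int vid, uint16_t queue_id,
		struct dma_trans_desc *descs,
		struct dma_trans_status *opaque_data,
		uint16_t count)
{
	/* submit "count" src/dst iov_it pairs to the DMA engine here */
	(void)vid; (void)queue_id; (void)descs; (void)opaque_data;
	return count;	/* number of descriptors accepted */
}

static int
my_check_completed_copies(int vid, uint16_t queue_id,
		struct dma_trans_status *opaque_data,
		uint16_t max_packets)
{
	/* poll the DMA engine; report how many packets have finished */
	(void)vid; (void)queue_id; (void)opaque_data; (void)max_packets;
	return 0;
}

static struct rte_vhost_async_channel_ops my_ops = {
	.transfer_data = my_transfer_data,
	.check_completed_copies = my_check_completed_copies,
};

static int
attach_async_channel(int vid, uint16_t queue_id)
{
	/* b0 = in-order transfer, b16-b27 = packet length threshold */
	uint32_t features = 0x1 | (256 << 16);

	return rte_vhost_async_channel_register(vid, queue_id, features,
						&my_ops);
}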
  

Comments

Marvin Liu June 18, 2020, 5:50 a.m. UTC | #1
Thanks, Patrick. My comments are inline.

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of patrick.fu@intel.com
> Sent: Thursday, June 11, 2020 6:02 PM
> To: dev@dpdk.org; maxime.coquelin@redhat.com; Xia, Chenbo
> <chenbo.xia@intel.com>; Wang, Zhihong <zhihong.wang@intel.com>; Ye,
> Xiaolong <xiaolong.ye@intel.com>
> Cc: Fu, Patrick <patrick.fu@intel.com>; Jiang, Cheng1
> <cheng1.jiang@intel.com>; Liang, Cunming <cunming.liang@intel.com>
> Subject: [dpdk-dev] [PATCH v1 1/2] vhost: introduce async data path
> registration API
> 
> From: Patrick <patrick.fu@intel.com>
> 
> This patch introduces registration/un-registration APIs
> for the async data path, together with all required data
> structures and DMA callback function prototypes.
> 
> Signed-off-by: Patrick <patrick.fu@intel.com>
> ---
>  lib/librte_vhost/Makefile          |   3 +-
>  lib/librte_vhost/rte_vhost.h       |   1 +
>  lib/librte_vhost/rte_vhost_async.h | 134 +++++++++++++++++++++++++++++++++++++
>  lib/librte_vhost/socket.c          |  20 ++++++
>  lib/librte_vhost/vhost.c           |  74 +++++++++++++++++++-
>  lib/librte_vhost/vhost.h           |  30 ++++++++-
>  lib/librte_vhost/vhost_user.c      |  28 ++++++--
>  7 files changed, 283 insertions(+), 7 deletions(-)
>  create mode 100644 lib/librte_vhost/rte_vhost_async.h
> 
> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
> index e592795..3aed094 100644
> --- a/lib/librte_vhost/Makefile
> +++ b/lib/librte_vhost/Makefile
> @@ -41,7 +41,8 @@ SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \
>  					vhost_user.c virtio_net.c vdpa.c
> 
>  # install includes
> -SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h
> +SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h \
> +						rte_vhost_async.h
> 
Hi Patrick,
Please also update meson build for newly added file.

Thanks,
Marvin

>  # only compile vhost crypto when cryptodev is enabled
>  ifeq ($(CONFIG_RTE_LIBRTE_CRYPTODEV),y)
> diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
> index d43669f..cec4d07 100644
> --- a/lib/librte_vhost/rte_vhost.h
> +++ b/lib/librte_vhost/rte_vhost.h
> @@ -35,6 +35,7 @@
>  #define RTE_VHOST_USER_EXTBUF_SUPPORT	(1ULL << 5)
>  /* support only linear buffers (no chained mbufs) */
>  #define RTE_VHOST_USER_LINEARBUF_SUPPORT	(1ULL << 6)
> +#define RTE_VHOST_USER_ASYNC_COPY	(1ULL << 7)
> 
>  /** Protocol features. */
>  #ifndef VHOST_USER_PROTOCOL_F_MQ
> diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
> new file mode 100644
> index 0000000..82f2ebe
> --- /dev/null
> +++ b/lib/librte_vhost/rte_vhost_async.h
> @@ -0,0 +1,134 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2018 Intel Corporation
> + */

s/2018/2020/ 

> +
> +#ifndef _RTE_VHOST_ASYNC_H_
> +#define _RTE_VHOST_ASYNC_H_
> +
> +#include "rte_vhost.h"
> +
> +/**
> + * iovec iterator
> + */
> +struct iov_it {
> +	/** offset to the first byte of interesting data */
> +	size_t offset;
> +	/** total bytes of data in this iterator */
> +	size_t count;
> +	/** pointer to the iovec array */
> +	struct iovec *iov;
> +	/** number of iovec in this iterator */
> +	unsigned long nr_segs;
> +};

Patrick,
I think a structure named "it" is too generic for understanding; please use a more meaningful name like "iov_iter".

> +
> +/**
> + * dma transfer descriptor pair
> + */
> +struct dma_trans_desc {
> +	/** source memory iov_it */
> +	struct iov_it *src;
> +	/** destination memory iov_it */
> +	struct iov_it *dst;
> +};
> +

This patch series is named async copy, and DMA is just one async copy method, supplied by the underlying hardware.
IMHO, the structure is better named "async_copy_desc", which matches the overall concept.

> +/**
> + * dma transfer status
> + */
> +struct dma_trans_status {
> +	/** An array of application specific data for source memory */
> +	uintptr_t *src_opaque_data;
> +	/** An array of application specific data for destination memory */
> +	uintptr_t *dst_opaque_data;
> +};
> +
Same as previous comment.
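
For illustration, the rename might look like the following (the counterpart name "async_copy_status" is an assumption by analogy, not something settled in this thread):

struct async_copy_desc {
	/** source memory iov_iter */
	struct iov_iter *src;
	/** destination memory iov_iter */
	struct iov_iter *dst;
};

struct async_copy_status {
	/** An array of application specific data for source memory */
	uintptr_t *src_opaque_data;
	/** An array of application specific data for destination memory */
	uintptr_t *dst_opaque_data;
};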

> +/**
> + * dma operation callbacks to be implemented by applications
> + */
> +struct rte_vhost_async_channel_ops {
> +	/**
> +	 * instruct a DMA channel to perform copies for a batch of packets
> +	 *
> +	 * @param vid
> +	 *  id of vhost device to perform data copies
> +	 * @param queue_id
> +	 *  queue id to perform data copies
> +	 * @param descs
> +	 *  an array of DMA transfer memory descriptors
> +	 * @param opaque_data
> +	 *  opaque data pair sending to DMA engine
> +	 * @param count
> +	 *  number of elements in the "descs" array
> +	 * @return
> +	 *  -1 on failure, number of descs processed on success
> +	 */
> +	int (*transfer_data)(int vid, uint16_t queue_id,
> +		struct dma_trans_desc *descs,
> +		struct dma_trans_status *opaque_data,
> +		uint16_t count);
> +	/**
> +	 * check copy-completed packets from a DMA channel
> +	 * @param vid
> +	 *  id of vhost device to check copy completion
> +	 * @param queue_id
> +	 *  queue id to check copy completion
> +	 * @param opaque_data
> +	 *  buffer to receive the opaque data pair from DMA engine
> +	 * @param max_packets
> +	 *  max number of packets could be completed
> +	 * @return
> +	 *  -1 on failure, number of iov segments completed on success
> +	 */
> +	int (*check_completed_copies)(int vid, uint16_t queue_id,
> +		struct dma_trans_status *opaque_data,
> +		uint16_t max_packets);
> +};
> +
> +/**
> + *  dma channel feature bit definition
> + */
> +struct dma_channel_features {
> +	union {
> +		uint32_t intval;
> +		struct {
> +			uint32_t inorder:1;
> +			uint32_t resvd0115:15;
> +			uint32_t threshold:12;
> +			uint32_t resvd2831:4;
> +		};
> +	};
> +};
> +

Naming the feature bits "intval" may cause confusion; why not just use a name that conveys its meaning, like "engine_features"?
I'm not sure the format "resvd0115" matches the DPDK coding style. In my mind, DPDK would use resvd_0 and resvd_1 for the two reserved elements.
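
Applying both suggestions, the struct might be reshaped like this (illustrative only; the final names were left to the author):

struct dma_channel_features {
	union {
		uint32_t engine_features; /* raw 32-bit view of the bits */
		struct {
			uint32_t inorder:1;	/* b0 */
			uint32_t resvd_0:15;	/* b1  - b15 */
			uint32_t threshold:12;	/* b16 - b27 */
			uint32_t resvd_1:4;	/* b28 - b31 */
		};
	};
};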

> +/**
> + * register a dma channel for vhost
> + *
> + * @param vid
> + *  vhost device id DMA channel to be attached to
> + * @param queue_id
> + *  vhost queue id DMA channel to be attached to
> + * @param features
> + *  DMA channel feature bit
> + *    b0       : DMA supports inorder data transfer
> + *    b1  - b15: reserved
> + *    b16 - b27: Packet length threshold for DMA transfer
> + *    b28 - b31: reserved
> + * @param ops
> + *  DMA operation callbacks
> + * @return
> + *  0 on success, -1 on failures
> + */
> +int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
> +	uint32_t features, struct rte_vhost_async_channel_ops *ops);
> +
> +/**
> + * unregister a dma channel for vhost
> + *
> + * @param vid
> + *  vhost device id DMA channel to be detached
> + * @param queue_id
> + *  vhost queue id DMA channel to be detached
> + * @return
> + *  0 on success, -1 on failures
> + */
> +int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id);
> +
> +#endif /* _RTE_VHOST_ASYNC_H_ */
> diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
> index 0a66ef9..f817783 100644
> --- a/lib/librte_vhost/socket.c
> +++ b/lib/librte_vhost/socket.c
> @@ -42,6 +42,7 @@ struct vhost_user_socket {
>  	bool use_builtin_virtio_net;
>  	bool extbuf;
>  	bool linearbuf;
> +	bool async_copy;
> 
>  	/*
>  	 * The "supported_features" indicates the feature bits the
> @@ -210,6 +211,7 @@ struct vhost_user {
>  	size_t size;
>  	struct vhost_user_connection *conn;
>  	int ret;
> +	struct virtio_net *dev;
> 
>  	if (vsocket == NULL)
>  		return;
> @@ -241,6 +243,13 @@ struct vhost_user {
>  	if (vsocket->linearbuf)
>  		vhost_enable_linearbuf(vid);
> 
> +	if (vsocket->async_copy) {
> +		dev = get_device(vid);
> +
> +		if (dev)
> +			dev->async_copy = 1;
> +	}
> +

IMHO, the user may choose which queues utilize async copy, as backend hardware resources are limited.
So should the async_copy enable flag be saved in the virtqueue structure?

>  	VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
> 
>  	if (vsocket->notify_ops->new_connection) {
> @@ -891,6 +900,17 @@ struct vhost_user_reconnect_list {
>  		goto out_mutex;
>  	}
> 
> +	vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
> +
> +	if (vsocket->async_copy &&
> +		(flags & (RTE_VHOST_USER_IOMMU_SUPPORT |
> +		RTE_VHOST_USER_POSTCOPY_SUPPORT))) {
> +		VHOST_LOG_CONFIG(ERR, "error: enabling async copy and
> IOMMU "
> +			"or post-copy feature simultaneously is not "
> +			"supported\n");
> +		goto out_mutex;
> +	}
> +
>  	/*
>  	 * Set the supported features correctly for the builtin vhost-user
>  	 * net driver.
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index 0266318..e6b688a 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -332,8 +332,13 @@
>  {
>  	if (vq_is_packed(dev))
>  		rte_free(vq->shadow_used_packed);
> -	else
> +	else {
>  		rte_free(vq->shadow_used_split);
> +		if (vq->async_pkts_pending)
> +			rte_free(vq->async_pkts_pending);
> +		if (vq->async_pending_info)
> +			rte_free(vq->async_pending_info);
> +	}
>  	rte_free(vq->batch_copy_elems);
>  	rte_mempool_free(vq->iotlb_pool);
>  	rte_free(vq);
> @@ -1527,3 +1532,70 @@ int rte_vhost_extern_callback_register(int vid,
>  	if (vhost_data_log_level >= 0)
>  		rte_log_set_level(vhost_data_log_level,
>  				RTE_LOG_WARNING);
>  }
> +
> +int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
> +					uint32_t features,
> +					struct rte_vhost_async_channel_ops *ops)
> +{
> +	struct vhost_virtqueue *vq;
> +	struct virtio_net *dev = get_device(vid);
> +	struct dma_channel_features f;
> +
> +	if (dev == NULL || ops == NULL)
> +		return -1;
> +
> +	f.intval = features;
> +
> +	vq = dev->virtqueue[queue_id];
> +
> +	if (vq == NULL)
> +		return -1;
> +
> +	/** packed queue is not supported */
> +	if (vq_is_packed(dev) || !f.inorder)
> +		return -1;
> +
Virtio already has an in_order concept; these two names are so alike that they can easily be mixed up. Please consider how to distinguish them.

> +	if (ops->check_completed_copies == NULL ||
> +		ops->transfer_data == NULL)
> +		return -1;
> +

The previous error is unlikely to be true; the unlikely() macro may be helpful for understanding.
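
For example (unlikely() is the branch-prediction hint from DPDK's rte_branch_prediction.h, already used throughout the vhost library):

	if (unlikely(ops->check_completed_copies == NULL ||
			ops->transfer_data == NULL))
		return -1;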

> +	rte_spinlock_lock(&vq->access_lock);
> +
> +	vq->async_ops.check_completed_copies = ops->check_completed_copies;
> +	vq->async_ops.transfer_data = ops->transfer_data;
> +
> +	vq->async_inorder = f.inorder;
> +	vq->async_threshold = f.threshold;
> +
> +	vq->async_registered = true;
> +
> +	rte_spinlock_unlock(&vq->access_lock);
> +
> +	return 0;
> +}
> +
> +int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
> +{
> +	struct vhost_virtqueue *vq;
> +	struct virtio_net *dev = get_device(vid);
> +
> +	if (dev == NULL)
> +		return -1;
> +
> +	vq = dev->virtqueue[queue_id];
> +
> +	if (vq == NULL)
> +		return -1;
> +
> +	rte_spinlock_lock(&vq->access_lock);
> +
> +	vq->async_ops.transfer_data = NULL;
> +	vq->async_ops.check_completed_copies = NULL;
> +
> +	vq->async_registered = false;
> +
> +	rte_spinlock_unlock(&vq->access_lock);
> +
> +	return 0;
> +}
> +
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index df98d15..a7fbe23 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -23,6 +23,8 @@
>  #include "rte_vhost.h"
>  #include "rte_vdpa.h"
> 
> +#include "rte_vhost_async.h"
> +
>  /* Used to indicate that the device is running on a data core */
>  #define VIRTIO_DEV_RUNNING 1
>  /* Used to indicate that the device is ready to operate */
> @@ -39,6 +41,11 @@
> 
>  #define VHOST_LOG_CACHE_NR 32
> 
> +#define MAX_PKT_BURST 32
> +
> +#define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST * 2)
> +#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
> +
>  #define PACKED_DESC_ENQUEUE_USED_FLAG(w)	\
>  	((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \
>  		VRING_DESC_F_WRITE)
> @@ -200,6 +207,25 @@ struct vhost_virtqueue {
>  	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_list;
>  	int				iotlb_cache_nr;
>  	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_pending_list;
> +
> +	/* operation callbacks for async dma */
> +	struct rte_vhost_async_channel_ops	async_ops;
> +
> +	struct iov_it it_pool[VHOST_MAX_ASYNC_IT];
> +	struct iovec vec_pool[VHOST_MAX_ASYNC_VEC];
> +
> +	/* async data transfer status */
> +	uintptr_t	**async_pkts_pending;
> +	#define		ASYNC_PENDING_INFO_N_MSK 0xFFFF
> +	#define		ASYNC_PENDING_INFO_N_SFT 16
> +	uint64_t	*async_pending_info;
> +	uint16_t	async_pkts_idx;
> +	uint16_t	async_pkts_inflight_n;
> +
> +	/* vq async features */
> +	bool		async_inorder;
> +	bool		async_registered;
> +	uint16_t	async_threshold;
>  } __rte_cache_aligned;
> 
>  /* Old kernels have no such macros defined */
> @@ -353,6 +379,7 @@ struct virtio_net {
>  	int16_t			broadcast_rarp;
>  	uint32_t		nr_vring;
>  	int			dequeue_zero_copy;
> +	int			async_copy;
>  	int			extbuf;
>  	int			linearbuf;
>  	struct vhost_virtqueue	*virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> @@ -702,7 +729,8 @@ uint64_t translate_log_addr(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	/* Don't kick guest if we don't reach index specified by guest. */
>  	if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) {
>  		uint16_t old = vq->signalled_used;
> -		uint16_t new = vq->last_used_idx;
> +		uint16_t new = vq->async_pkts_inflight_n ?
> +					vq->used->idx:vq->last_used_idx;
>  		bool signalled_used_valid = vq->signalled_used_valid;
> 
>  		vq->signalled_used = new;
> diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> index 84bebad..d7600bf 100644
> --- a/lib/librte_vhost/vhost_user.c
> +++ b/lib/librte_vhost/vhost_user.c
> @@ -464,12 +464,25 @@
>  	} else {
>  		if (vq->shadow_used_split)
>  			rte_free(vq->shadow_used_split);
> +		if (vq->async_pkts_pending)
> +			rte_free(vq->async_pkts_pending);
> +		if (vq->async_pending_info)
> +			rte_free(vq->async_pending_info);
> +
>  		vq->shadow_used_split = rte_malloc(NULL,
>  				vq->size * sizeof(struct vring_used_elem),
>  				RTE_CACHE_LINE_SIZE);
> -		if (!vq->shadow_used_split) {
> +		vq->async_pkts_pending = rte_malloc(NULL,
> +				vq->size * sizeof(uintptr_t),
> +				RTE_CACHE_LINE_SIZE);
> +		vq->async_pending_info = rte_malloc(NULL,
> +				vq->size * sizeof(uint64_t),
> +				RTE_CACHE_LINE_SIZE);
> +		if (!vq->shadow_used_split ||
> +			!vq->async_pkts_pending ||
> +			!vq->async_pending_info) {
>  			VHOST_LOG_CONFIG(ERR,
> -					"failed to allocate memory for shadow used ring.\n");
> +					"failed to allocate memory for vq internal data.\n");

If async copy is not enabled, there is no need to allocate the related structures.
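
One possible shape of that change, sketched against the hunk above and assuming dev is in scope at this point, as it is elsewhere in vhost_user.c:

	vq->shadow_used_split = rte_malloc(NULL,
			vq->size * sizeof(struct vring_used_elem),
			RTE_CACHE_LINE_SIZE);

	/* allocate the async bookkeeping only when the device asked for it */
	if (dev->async_copy) {
		vq->async_pkts_pending = rte_malloc(NULL,
				vq->size * sizeof(uintptr_t),
				RTE_CACHE_LINE_SIZE);
		vq->async_pending_info = rte_malloc(NULL,
				vq->size * sizeof(uint64_t),
				RTE_CACHE_LINE_SIZE);
		/* NULL checks and the error path follow as in the patch */
	}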

>  			return RTE_VHOST_MSG_RESULT_ERR;
>  		}
>  	}
> @@ -1147,7 +1160,8 @@
>  			goto err_mmap;
>  		}
> 
> -		populate = (dev->dequeue_zero_copy) ? MAP_POPULATE : 0;
> +		populate = (dev->dequeue_zero_copy || dev->async_copy) ?
> +			MAP_POPULATE : 0;
>  		mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
>  				 MAP_SHARED | populate, fd, 0);
> 
> @@ -1162,7 +1176,7 @@
>  		reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
>  				      mmap_offset;
> 
> -		if (dev->dequeue_zero_copy)
> +		if (dev->dequeue_zero_copy || dev->async_copy)
>  			if (add_guest_pages(dev, reg, alignment) < 0) {
>  				VHOST_LOG_CONFIG(ERR,
>  					"adding guest pages to region %u
> failed.\n",
> @@ -1945,6 +1959,12 @@ static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused,
>  	} else {
>  		rte_free(vq->shadow_used_split);
>  		vq->shadow_used_split = NULL;
> +		if (vq->async_pkts_pending)
> +			rte_free(vq->async_pkts_pending);
> +		if (vq->async_pending_info)
> +			rte_free(vq->async_pending_info);
> +		vq->async_pkts_pending = NULL;
> +		vq->async_pending_info = NULL;
>  	}
> 
>  	rte_free(vq->batch_copy_elems);
> --
> 1.8.3.1
  
Patrick Fu June 18, 2020, 9:08 a.m. UTC | #2
> -----Original Message-----
> From: Liu, Yong <yong.liu@intel.com>
> Sent: Thursday, June 18, 2020 1:51 PM
> To: Fu, Patrick <patrick.fu@intel.com>
> Cc: Fu, Patrick <patrick.fu@intel.com>; Jiang, Cheng1
> <cheng1.jiang@intel.com>; Liang, Cunming <cunming.liang@intel.com>;
> dev@dpdk.org; maxime.coquelin@redhat.com; Xia, Chenbo
> <chenbo.xia@intel.com>; Wang, Zhihong <zhihong.wang@intel.com>; Ye,
> Xiaolong <xiaolong.ye@intel.com>
> Subject: RE: [dpdk-dev] [PATCH v1 1/2] vhost: introduce async data path
> registration API
> 
> Thanks, Patrick. My comments are inline.
> 
> > [...]
> >  # install includes
> > -SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h
> > +SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h \
> > +						rte_vhost_async.h
> >
> Hi Patrick,
> Please also update meson build for newly added file.
> 
> Thanks,
> Marvin
> 
> > [...]
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2018 Intel Corporation
> > + */
> 
> s/2018/2020/
> 
> > +
> > +#ifndef _RTE_VHOST_ASYNC_H_
> > +#define _RTE_VHOST_ASYNC_H_
> > +
> > +#include "rte_vhost.h"
> > +
> > +/**
> > + * iovec iterator
> > + */
> > +struct iov_it {
> > +	/** offset to the first byte of interesting data */
> > +	size_t offset;
> > +	/** total bytes of data in this iterator */
> > +	size_t count;
> > +	/** pointer to the iovec array */
> > +	struct iovec *iov;
> > +	/** number of iovec in this iterator */
> > +	unsigned long nr_segs;
> > +};
> 
> Patrick,
> I think a structure named "it" is too generic for understanding; please
> use a more meaningful name like "iov_iter".
> 
> > +
> > +/**
> > + * dma transfer descriptor pair
> > + */
> > +struct dma_trans_desc {
> > +	/** source memory iov_it */
> > +	struct iov_it *src;
> > +	/** destination memory iov_it */
> > +	struct iov_it *dst;
> > +};
> > +
> 
> This patch series is named async copy, and DMA is just one async copy
> method, supplied by the underlying hardware.
> IMHO, the structure is better named "async_copy_desc", which matches the
> overall concept.
> 
> > +/**
> > + * dma transfer status
> > + */
> > +struct dma_trans_status {
> > +	/** An array of application specific data for source memory */
> > +	uintptr_t *src_opaque_data;
> > +	/** An array of application specific data for destination memory */
> > +	uintptr_t *dst_opaque_data;
> > +};
> > +
> Same as previous comment.
> 
> > [...]
> > +/**
> > +/**
> > + *  dma channel feature bit definition
> > + */
> > +struct dma_channel_features {
> > +	union {
> > +		uint32_t intval;
> > +		struct {
> > +			uint32_t inorder:1;
> > +			uint32_t resvd0115:15;
> > +			uint32_t threshold:12;
> > +			uint32_t resvd2831:4;
> > +		};
> > +	};
> > +};
> > +
> 
> Naming the feature bits "intval" may cause confusion; why not just use a
> name that conveys its meaning, like "engine_features"?
> I'm not sure the format "resvd0115" matches the DPDK coding style. In my
> mind, DPDK would use resvd_0 and resvd_1 for the two reserved elements.
> 
For the comments above, I will make these changes in the v2 patch.

> > +		if (dev)
> > +			dev->async_copy = 1;
> > +	}
> > +
> 
> IMHO, the user may choose which queues utilize async copy, as backend
> hardware resources are limited.
> So should the async_copy enable flag be saved in the virtqueue structure?
> 
We have a per-queue flag to identify the enabling status of a specific queue.
The "async_copy" flag is a dev-level flag which identifies the async capability of a vhost device.
This is necessary because we rely on this flag to do initialization work if the vhost backend needs to support async mode on any of its queues.
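
To illustrate the split using the fields this patch adds (the helper name here is hypothetical, for illustration only):

/*
 * dev->async_copy: device-level capability, set once at connection time.
 * vq->async_registered: per-queue state, set only by
 * rte_vhost_async_channel_register().
 * Both must hold before a queue takes the async data path.
 */
static inline bool
vq_async_enabled(struct virtio_net *dev, struct vhost_virtqueue *vq)
{
	return dev->async_copy && vq->async_registered;
}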

> > [...]
> > +	/** packed queue is not supported */
> > +	if (vq_is_packed(dev) || !f.inorder)
> > +		return -1;
> > +
> Virtio already has an in_order concept; these two names are so alike that
> they can easily be mixed up. Please consider how to distinguish them.
> 
What about "async_inorder"

> > +	if (ops->check_completed_copies == NULL ||
> > +		ops->transfer_data == NULL)
> > +		return -1;
> > +
> 
> The previous error is unlikely to be true; the unlikely() macro may be
> helpful for understanding.
> 
Will update in v2 patch

> > [...]
> >  		vq->shadow_used_split = rte_malloc(NULL,
> >  				vq->size * sizeof(struct vring_used_elem),
> >  				RTE_CACHE_LINE_SIZE);
> > -		if (!vq->shadow_used_split) {
> > +		vq->async_pkts_pending = rte_malloc(NULL,
> > +				vq->size * sizeof(uintptr_t),
> > +				RTE_CACHE_LINE_SIZE);
> > +		vq->async_pending_info = rte_malloc(NULL,
> > +				vq->size * sizeof(uint64_t),
> > +				RTE_CACHE_LINE_SIZE);
> > +		if (!vq->shadow_used_split ||
> > +			!vq->async_pkts_pending ||
> > +			!vq->async_pending_info) {
> >  			VHOST_LOG_CONFIG(ERR,
> > -					"failed to allocate memory for
> > shadow used ring.\n");
> > +					"failed to allocate memory for vq
> > internal data.\n");
> 
> If async copy is not enabled, there is no need to allocate the related structures.
> 
Will update in v2 patch

Thanks,

Patrick
  
Marvin Liu June 19, 2020, 12:40 a.m. UTC | #3
> -----Original Message-----
> From: Fu, Patrick <patrick.fu@intel.com>
> Sent: Thursday, June 18, 2020 5:09 PM
> To: Liu, Yong <yong.liu@intel.com>
> Cc: Jiang, Cheng1 <cheng1.jiang@intel.com>; Liang, Cunming
> <cunming.liang@intel.com>; dev@dpdk.org; maxime.coquelin@redhat.com;
> Xia, Chenbo <chenbo.xia@intel.com>; Wang, Zhihong
> <zhihong.wang@intel.com>; Ye, Xiaolong <xiaolong.ye@intel.com>
> Subject: RE: [dpdk-dev] [PATCH v1 1/2] vhost: introduce async data path
> registration API
> 
> 
> 
> [...]
> > > +		if (dev)
> > > +			dev->async_copy = 1;
> > > +	}
> > > +
> >
> > IMHO, the user may choose which queues utilize async copy, as backend
> > hardware resources are limited.
> > So should the async_copy enable flag be saved in the virtqueue structure?
> >
> We have a per-queue flag to identify the enabling status of a specific queue.
> The "async_copy" flag is a dev-level flag which identifies the async
> capability of a vhost device.
> This is necessary because we rely on this flag to do initialization work if
> the vhost backend needs to support async mode on any of its queues.
> 

I got you. How about renaming this variable to "async_copy_enabled", which is more aligned with its real implication?

Thanks,
Marvin

> [...]
> > > +	/** packed queue is not supported */
> > > +	if (vq_is_packed(dev) || !f.inorder)
> > > +		return -1;
> > > +
> > Virtio already has an in_order concept; these two names are so alike that
> > they can easily be mixed up. Please consider how to distinguish them.
> >
> What about "async_inorder"

Looks great to me.

> [...]
  
Maxime Coquelin June 25, 2020, 1:42 p.m. UTC | #4
On 6/18/20 7:50 AM, Liu, Yong wrote:
> Thanks, Patrick. My comments are inline.
> 
>> [...]
>> +/**
>> + * iovec iterator
>> + */
>> +struct iov_it {
>> +	/** offset to the first byte of interesting data */
>> +	size_t offset;
>> +	/** total bytes of data in this iterator */
>> +	size_t count;
>> +	/** pointer to the iovec array */
>> +	struct iovec *iov;
>> +	/** number of iovec in this iterator */
>> +	unsigned long nr_segs;
>> +};
> 
> Patrick,
> I think a structure named "it" is too generic for understanding; please use a more meaningful name like "iov_iter".

I would also add that, being part of the Vhost API, it needs to be
prefixed with rte_vhost_.

This comment applies to all structures in this header.
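
Combining this with Marvin's earlier "iov_iter" suggestion, the public structures might end up looking like the sketch below; these names are assumptions for illustration, not the settled v2 API:

#include <sys/uio.h>	/* struct iovec */

struct rte_vhost_iov_iter {
	size_t offset;
	size_t count;
	struct iovec *iov;
	unsigned long nr_segs;
};

struct rte_vhost_async_desc {
	struct rte_vhost_iov_iter *src;
	struct rte_vhost_iov_iter *dst;
};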

>> +
>> +/**
>> + * dma transfer descriptor pair
>> + */
>> +struct dma_trans_desc {
>> +	/** source memory iov_it */
>> +	struct iov_it *src;
>> +	/** destination memory iov_it */
>> +	struct iov_it *dst;
>> +};
>> +
> 
> This series patch named as sync copy,  and dma is just one async copy method which underneath hardware supplied. 
> IMHO, structure is better to named as "async_copy_desc" which matched the overall concept. 
> 
>> +/**
>> + * dma transfer status
>> + */
>> +struct dma_trans_status {
>> +	/** An array of application specific data for source memory */
>> +	uintptr_t *src_opaque_data;
>> +	/** An array of application specific data for destination memory */
>> +	uintptr_t *dst_opaque_data;
>> +};
>> +
> Same as pervious comment.
> 
>> +/**
>> + * dma operation callbacks to be implemented by applications
>> + */
>> +struct rte_vhost_async_channel_ops {
>> +	/**
>> +	 * instruct a DMA channel to perform copies for a batch of packets
>> +	 *
>> +	 * @param vid
>> +	 *  id of vhost device to perform data copies
>> +	 * @param queue_id
>> +	 *  queue id to perform data copies
>> +	 * @param descs
>> +	 *  an array of DMA transfer memory descriptors
>> +	 * @param opaque_data
>> +	 *  opaque data pair sending to DMA engine
>> +	 * @param count
>> +	 *  number of elements in the "descs" array
>> +	 * @return
>> +	 *  -1 on failure, number of descs processed on success
>> +	 */
>> +	int (*transfer_data)(int vid, uint16_t queue_id,
>> +		struct dma_trans_desc *descs,
>> +		struct dma_trans_status *opaque_data,
>> +		uint16_t count);
>> +	/**
>> +	 * check copy-completed packets from a DMA channel
>> +	 * @param vid
>> +	 *  id of vhost device to check copy completion
>> +	 * @param queue_id
>> +	 *  queue id to check copyp completion
>> +	 * @param opaque_data
>> +	 *  buffer to receive the opaque data pair from DMA engine
>> +	 * @param max_packets
>> +	 *  max number of packets could be completed
>> +	 * @return
>> +	 *  -1 on failure, number of iov segments completed on success
>> +	 */
>> +	int (*check_completed_copies)(int vid, uint16_t queue_id,
>> +		struct dma_trans_status *opaque_data,
>> +		uint16_t max_packets);
>> +};
>> +
>> +/**
>> + *  dma channel feature bit definition
>> + */
>> +struct dma_channel_features {
>> +	union {
>> +		uint32_t intval;
>> +		struct {
>> +			uint32_t inorder:1;
>> +			uint32_t resvd0115:15;
>> +			uint32_t threshold:12;
>> +			uint32_t resvd2831:4;
>> +		};
>> +	};
>> +};
>> +
> 
> Naming feature bits as "intval" may cause confusion, why not just use its meaning like "engine_features"?
> I'm not sure whether format "resvd0115" match dpdk copy style. In my mind, dpdk will use resvd_0 and resvd_1 for two reserved elements.
> 
>> +/**
>> + * register a dma channel for vhost
>> + *
>> + * @param vid
>> + *  vhost device id DMA channel to be attached to
>> + * @param queue_id
>> + *  vhost queue id DMA channel to be attached to
>> + * @param features
>> + *  DMA channel feature bit
>> + *    b0       : DMA supports inorder data transfer
>> + *    b1  - b15: reserved
>> + *    b16 - b27: Packet length threshold for DMA transfer
>> + *    b28 - b31: reserved
>> + * @param ops
>> + *  DMA operation callbacks
>> + * @return
>> + *  0 on success, -1 on failures
>> + */
>> +int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
>> +	uint32_t features, struct rte_vhost_async_channel_ops *ops);
>> +
>> +/**
>> + * unregister a dma channel for vhost
>> + *
>> + * @param vid
>> + *  vhost device id DMA channel to be detached
>> + * @param queue_id
>> + *  vhost queue id DMA channel to be detached
>> + * @return
>> + *  0 on success, -1 on failures
>> + */
>> +int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id);
>> +
>> +#endif /* _RTE_VDPA_H_ */
>> diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
>> index 0a66ef9..f817783 100644
>> --- a/lib/librte_vhost/socket.c
>> +++ b/lib/librte_vhost/socket.c
>> @@ -42,6 +42,7 @@ struct vhost_user_socket {
>>  	bool use_builtin_virtio_net;
>>  	bool extbuf;
>>  	bool linearbuf;
>> +	bool async_copy;
>>
>>  	/*
>>  	 * The "supported_features" indicates the feature bits the
>> @@ -210,6 +211,7 @@ struct vhost_user {
>>  	size_t size;
>>  	struct vhost_user_connection *conn;
>>  	int ret;
>> +	struct virtio_net *dev;
>>
>>  	if (vsocket == NULL)
>>  		return;
>> @@ -241,6 +243,13 @@ struct vhost_user {
>>  	if (vsocket->linearbuf)
>>  		vhost_enable_linearbuf(vid);
>>
>> +	if (vsocket->async_copy) {
>> +		dev = get_device(vid);
>> +
>> +		if (dev)
>> +			dev->async_copy = 1;
>> +	}
>> +
> 
> IMHO, user can chose which queue utilize async copy as backend hardware resource is limited. 
> So should async_copy enable flag be saved in virtqueue structure? 
> 
>>  	VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
>>
>>  	if (vsocket->notify_ops->new_connection) {
>> @@ -891,6 +900,17 @@ struct vhost_user_reconnect_list {
>>  		goto out_mutex;
>>  	}
>>
>> +	vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
>> +
>> +	if (vsocket->async_copy &&
>> +		(flags & (RTE_VHOST_USER_IOMMU_SUPPORT |
>> +		RTE_VHOST_USER_POSTCOPY_SUPPORT))) {
>> +		VHOST_LOG_CONFIG(ERR, "error: enabling async copy and IOMMU "
>> +			"or post-copy feature simultaneously is not "
>> +			"supported\n");
>> +		goto out_mutex;
>> +	}
>> +
>>  	/*
>>  	 * Set the supported features correctly for the builtin vhost-user
>>  	 * net driver.
>> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
>> index 0266318..e6b688a 100644
>> --- a/lib/librte_vhost/vhost.c
>> +++ b/lib/librte_vhost/vhost.c
>> @@ -332,8 +332,13 @@
>>  {
>>  	if (vq_is_packed(dev))
>>  		rte_free(vq->shadow_used_packed);
>> -	else
>> +	else {
>>  		rte_free(vq->shadow_used_split);
>> +		if (vq->async_pkts_pending)
>> +			rte_free(vq->async_pkts_pending);
>> +		if (vq->async_pending_info)
>> +			rte_free(vq->async_pending_info);
>> +	}
>>  	rte_free(vq->batch_copy_elems);
>>  	rte_mempool_free(vq->iotlb_pool);
>>  	rte_free(vq);
>> @@ -1527,3 +1532,70 @@ int rte_vhost_extern_callback_register(int vid,
>>  	if (vhost_data_log_level >= 0)
>>  		rte_log_set_level(vhost_data_log_level,
>> RTE_LOG_WARNING);
>>  }
>> +
>> +int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
>> +					uint32_t features,
>> +					struct rte_vhost_async_channel_ops *ops)
>> +{
>> +	struct vhost_virtqueue *vq;
>> +	struct virtio_net *dev = get_device(vid);
>> +	struct dma_channel_features f;
>> +
>> +	if (dev == NULL || ops == NULL)
>> +		return -1;
>> +
>> +	f.intval = features;
>> +
>> +	vq = dev->virtqueue[queue_id];
>> +
>> +	if (vq == NULL)
>> +		return -1;
>> +
>> +	/** packed queue is not supported */
>> +	if (vq_is_packed(dev) || !f.inorder)
>> +		return -1;
>> +
> Virtio already has an in_order concept; the two names are so alike that they can easily be confused. Please consider how to distinguish them.
> 
>> +	if (ops->check_completed_copies == NULL ||
>> +		ops->transfer_data == NULL)
>> +		return -1;
>> +
> 
> The previous error condition is unlikely to be true; the unlikely() macro may be helpful for readability.
> 
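
With that suggestion the check would read roughly as follows (unlikely() is
provided by rte_branch_prediction.h):

	if (unlikely(ops->check_completed_copies == NULL ||
			ops->transfer_data == NULL))
		return -1;
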
>> +	rte_spinlock_lock(&vq->access_lock);
>> +
>> +	vq->async_ops.check_completed_copies = ops->check_completed_copies;
>> +	vq->async_ops.transfer_data = ops->transfer_data;
>> +
>> +	vq->async_inorder = f.inorder;
>> +	vq->async_threshold = f.threshold;
>> +
>> +	vq->async_registered = true;
>> +
>> +	rte_spinlock_unlock(&vq->access_lock);
>> +
>> +	return 0;
>> +}
>> +
>> +int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
>> +{
>> +	struct vhost_virtqueue *vq;
>> +	struct virtio_net *dev = get_device(vid);
>> +
>> +	if (dev == NULL)
>> +		return -1;
>> +
>> +	vq = dev->virtqueue[queue_id];
>> +
>> +	if (vq == NULL)
>> +		return -1;
>> +
>> +	rte_spinlock_lock(&vq->access_lock);
>> +
>> +	vq->async_ops.transfer_data = NULL;
>> +	vq->async_ops.check_completed_copies = NULL;
>> +
>> +	vq->async_registered = false;
>> +
>> +	rte_spinlock_unlock(&vq->access_lock);
>> +
>> +	return 0;
>> +}
>> +
>> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
>> index df98d15..a7fbe23 100644
>> --- a/lib/librte_vhost/vhost.h
>> +++ b/lib/librte_vhost/vhost.h
>> @@ -23,6 +23,8 @@
>>  #include "rte_vhost.h"
>>  #include "rte_vdpa.h"
>>
>> +#include "rte_vhost_async.h"
>> +
>>  /* Used to indicate that the device is running on a data core */
>>  #define VIRTIO_DEV_RUNNING 1
>>  /* Used to indicate that the device is ready to operate */
>> @@ -39,6 +41,11 @@
>>
>>  #define VHOST_LOG_CACHE_NR 32
>>
>> +#define MAX_PKT_BURST 32
>> +
>> +#define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST * 2)
>> +#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
>> +
>>  #define PACKED_DESC_ENQUEUE_USED_FLAG(w)	\
>>  	((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \
>>  		VRING_DESC_F_WRITE)
>> @@ -200,6 +207,25 @@ struct vhost_virtqueue {
>>  	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_list;
>>  	int				iotlb_cache_nr;
>>  	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_pending_list;
>> +
>> +	/* operation callbacks for async dma */
>> +	struct rte_vhost_async_channel_ops	async_ops;
>> +
>> +	struct iov_it it_pool[VHOST_MAX_ASYNC_IT];
>> +	struct iovec vec_pool[VHOST_MAX_ASYNC_VEC];
>> +
>> +	/* async data transfer status */
>> +	uintptr_t	**async_pkts_pending;
>> +	#define		ASYNC_PENDING_INFO_N_MSK 0xFFFF
>> +	#define		ASYNC_PENDING_INFO_N_SFT 16
>> +	uint64_t	*async_pending_info;
>> +	uint16_t	async_pkts_idx;
>> +	uint16_t	async_pkts_inflight_n;
>> +
>> +	/* vq async features */
>> +	bool		async_inorder;
>> +	bool		async_registered;
>> +	uint16_t	async_threshold;
>>  } __rte_cache_aligned;
>>
>>  /* Old kernels have no such macros defined */
>> @@ -353,6 +379,7 @@ struct virtio_net {
>>  	int16_t			broadcast_rarp;
>>  	uint32_t		nr_vring;
>>  	int			dequeue_zero_copy;
>> +	int			async_copy;
>>  	int			extbuf;
>>  	int			linearbuf;
>>  	struct vhost_virtqueue	*virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
>> @@ -702,7 +729,8 @@ uint64_t translate_log_addr(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>  	/* Don't kick guest if we don't reach index specified by guest. */
>>  	if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) {
>>  		uint16_t old = vq->signalled_used;
>> -		uint16_t new = vq->last_used_idx;
>> +		uint16_t new = vq->async_pkts_inflight_n ?
>> +					vq->used->idx:vq->last_used_idx;
>>  		bool signalled_used_valid = vq->signalled_used_valid;
>>
>>  		vq->signalled_used = new;
>> diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
>> index 84bebad..d7600bf 100644
>> --- a/lib/librte_vhost/vhost_user.c
>> +++ b/lib/librte_vhost/vhost_user.c
>> @@ -464,12 +464,25 @@
>>  	} else {
>>  		if (vq->shadow_used_split)
>>  			rte_free(vq->shadow_used_split);
>> +		if (vq->async_pkts_pending)
>> +			rte_free(vq->async_pkts_pending);
>> +		if (vq->async_pending_info)
>> +			rte_free(vq->async_pending_info);
>> +
>>  		vq->shadow_used_split = rte_malloc(NULL,
>>  				vq->size * sizeof(struct vring_used_elem),
>>  				RTE_CACHE_LINE_SIZE);
>> -		if (!vq->shadow_used_split) {
>> +		vq->async_pkts_pending = rte_malloc(NULL,
>> +				vq->size * sizeof(uintptr_t),
>> +				RTE_CACHE_LINE_SIZE);
>> +		vq->async_pending_info = rte_malloc(NULL,
>> +				vq->size * sizeof(uint64_t),
>> +				RTE_CACHE_LINE_SIZE);
>> +		if (!vq->shadow_used_split ||
>> +			!vq->async_pkts_pending ||
>> +			!vq->async_pending_info) {
>>  			VHOST_LOG_CONFIG(ERR,
>> -					"failed to allocate memory for shadow used ring.\n");
>> +					"failed to allocate memory for vq internal data.\n");
> 
> If async copy is not enabled, there is no need to allocate the related structures.
> 
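
That would mean gating the new allocations on the device flag, e.g. (a
sketch based on the code above; the later NULL check would then also need
to skip the async buffers when the feature is off):

	if (dev->async_copy) {
		vq->async_pkts_pending = rte_malloc(NULL,
				vq->size * sizeof(uintptr_t),
				RTE_CACHE_LINE_SIZE);
		vq->async_pending_info = rte_malloc(NULL,
				vq->size * sizeof(uint64_t),
				RTE_CACHE_LINE_SIZE);
	}
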
>>  			return RTE_VHOST_MSG_RESULT_ERR;
>>  		}
>>  	}
>> @@ -1147,7 +1160,8 @@
>>  			goto err_mmap;
>>  		}
>>
>> -		populate = (dev->dequeue_zero_copy) ? MAP_POPULATE : 0;
>> +		populate = (dev->dequeue_zero_copy || dev->async_copy) ?
>> +			MAP_POPULATE : 0;
>>  		mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
>>  				 MAP_SHARED | populate, fd, 0);
>>
>> @@ -1162,7 +1176,7 @@
>>  		reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
>>  				      mmap_offset;
>>
>> -		if (dev->dequeue_zero_copy)
>> +		if (dev->dequeue_zero_copy || dev->async_copy)
>>  			if (add_guest_pages(dev, reg, alignment) < 0) {
>>  				VHOST_LOG_CONFIG(ERR,
>>  					"adding guest pages to region %u failed.\n",
>> @@ -1945,6 +1959,12 @@ static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused,
>>  	} else {
>>  		rte_free(vq->shadow_used_split);
>>  		vq->shadow_used_split = NULL;
>> +		if (vq->async_pkts_pending)
>> +			rte_free(vq->async_pkts_pending);
>> +		if (vq->async_pending_info)
>> +			rte_free(vq->async_pending_info);
>> +		vq->async_pkts_pending = NULL;
>> +		vq->async_pending_info = NULL;
>>  	}
>>
>>  	rte_free(vq->batch_copy_elems);
>> --
>> 1.8.3.1
>
  
Maxime Coquelin June 26, 2020, 2:28 p.m. UTC | #5
On 6/11/20 12:02 PM, patrick.fu@intel.com wrote:
> From: Patrick <patrick.fu@intel.com>
> 
> This patch introduces registration/un-registration APIs
> for async data path together with all required data
> structures and DMA callback function proto-types.
> 
> Signed-off-by: Patrick <patrick.fu@intel.com>
> ---
>  lib/librte_vhost/Makefile          |   3 +-
>  lib/librte_vhost/rte_vhost.h       |   1 +
>  lib/librte_vhost/rte_vhost_async.h | 134 +++++++++++++++++++++++++++++++++++++
>  lib/librte_vhost/socket.c          |  20 ++++++
>  lib/librte_vhost/vhost.c           |  74 +++++++++++++++++++-
>  lib/librte_vhost/vhost.h           |  30 ++++++++-
>  lib/librte_vhost/vhost_user.c      |  28 ++++++--
>  7 files changed, 283 insertions(+), 7 deletions(-)
>  create mode 100644 lib/librte_vhost/rte_vhost_async.h
> 
> diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
> index e592795..3aed094 100644
> --- a/lib/librte_vhost/Makefile
> +++ b/lib/librte_vhost/Makefile
> @@ -41,7 +41,8 @@ SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \
>  					vhost_user.c virtio_net.c vdpa.c
>  
>  # install includes
> -SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h
> +SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h \
> +						rte_vhost_async.h
>  
>  # only compile vhost crypto when cryptodev is enabled
>  ifeq ($(CONFIG_RTE_LIBRTE_CRYPTODEV),y)
> diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
> index d43669f..cec4d07 100644
> --- a/lib/librte_vhost/rte_vhost.h
> +++ b/lib/librte_vhost/rte_vhost.h
> @@ -35,6 +35,7 @@
>  #define RTE_VHOST_USER_EXTBUF_SUPPORT	(1ULL << 5)
>  /* support only linear buffers (no chained mbufs) */
>  #define RTE_VHOST_USER_LINEARBUF_SUPPORT	(1ULL << 6)
> +#define RTE_VHOST_USER_ASYNC_COPY	(1ULL << 7)
>  
>  /** Protocol features. */
>  #ifndef VHOST_USER_PROTOCOL_F_MQ
> diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
> new file mode 100644
> index 0000000..82f2ebe
> --- /dev/null
> +++ b/lib/librte_vhost/rte_vhost_async.h
> @@ -0,0 +1,134 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2018 Intel Corporation
> + */
> +
> +#ifndef _RTE_VHOST_ASYNC_H_
> +#define _RTE_VHOST_ASYNC_H_
> +
> +#include "rte_vhost.h"
> +
> +/**
> + * iovec iterator
> + */
> +struct iov_it {
> +	/** offset to the first byte of interesting data */
> +	size_t offset;
> +	/** total bytes of data in this iterator */
> +	size_t count;
> +	/** pointer to the iovec array */
> +	struct iovec *iov;
> +	/** number of iovec in this iterator */
> +	unsigned long nr_segs;
> +};
> +
> +/**
> + * dma transfer descriptor pair
> + */
> +struct dma_trans_desc {
> +	/** source memory iov_it */
> +	struct iov_it *src;
> +	/** destination memory iov_it */
> +	struct iov_it *dst;
> +};
> +
> +/**
> + * dma transfer status
> + */
> +struct dma_trans_status {
> +	/** An array of application specific data for source memory */
> +	uintptr_t *src_opaque_data;
> +	/** An array of application specific data for destination memory */
> +	uintptr_t *dst_opaque_data;
> +};
> +
> +/**
> + * dma operation callbacks to be implemented by applications
> + */
> +struct rte_vhost_async_channel_ops {
> +	/**
> +	 * instruct a DMA channel to perform copies for a batch of packets
> +	 *
> +	 * @param vid
> +	 *  id of vhost device to perform data copies
> +	 * @param queue_id
> +	 *  queue id to perform data copies
> +	 * @param descs
> +	 *  an array of DMA transfer memory descriptors
> +	 * @param opaque_data
> +	 *  opaque data pair sending to DMA engine
> +	 * @param count
> +	 *  number of elements in the "descs" array
> +	 * @return
> +	 *  -1 on failure, number of descs processed on success
> +	 */
> +	int (*transfer_data)(int vid, uint16_t queue_id,
> +		struct dma_trans_desc *descs,
> +		struct dma_trans_status *opaque_data,
> +		uint16_t count);
> +	/**
> +	 * check copy-completed packets from a DMA channel
> +	 * @param vid
> +	 *  id of vhost device to check copy completion
> +	 * @param queue_id
> +	 *  queue id to check copy completion
> +	 * @param opaque_data
> +	 *  buffer to receive the opaque data pair from DMA engine
> +	 * @param max_packets
> +	 *  max number of packets could be completed
> +	 * @return
> +	 *  -1 on failure, number of iov segments completed on success
> +	 */
> +	int (*check_completed_copies)(int vid, uint16_t queue_id,
> +		struct dma_trans_status *opaque_data,
> +		uint16_t max_packets);
> +};
> +
> +/**
> + *  dma channel feature bit definition
> + */
> +struct dma_channel_features {
> +	union {
> +		uint32_t intval;
> +		struct {
> +			uint32_t inorder:1;
> +			uint32_t resvd0115:15;
> +			uint32_t threshold:12;
> +			uint32_t resvd2831:4;
> +		};
> +	};
> +};
> +
> +/**
> + * register a dma channel for vhost
> + *
> + * @param vid
> + *  vhost device id DMA channel to be attached to
> + * @param queue_id
> + *  vhost queue id DMA channel to be attached to
> + * @param features
> + *  DMA channel feature bit
> + *    b0       : DMA supports inorder data transfer
> + *    b1  - b15: reserved
> + *    b16 - b27: Packet length threshold for DMA transfer
> + *    b28 - b31: reserved
> + * @param ops
> + *  DMA operation callbacks
> + * @return
> + *  0 on success, -1 on failures
> + */
> +int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
> +	uint32_t features, struct rte_vhost_async_channel_ops *ops);
> +
> +/**
> + * unregister a dma channel for vhost
> + *
> + * @param vid
> + *  vhost device id DMA channel to be detached
> + * @param queue_id
> + *  vhost queue id DMA channel to be detached
> + * @return
> + *  0 on success, -1 on failures
> + */
> +int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id);
> +
> +#endif /* _RTE_VHOST_ASYNC_H_ */
> diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
> index 0a66ef9..f817783 100644
> --- a/lib/librte_vhost/socket.c
> +++ b/lib/librte_vhost/socket.c
> @@ -42,6 +42,7 @@ struct vhost_user_socket {
>  	bool use_builtin_virtio_net;
>  	bool extbuf;
>  	bool linearbuf;
> +	bool async_copy;
>  
>  	/*
>  	 * The "supported_features" indicates the feature bits the
> @@ -210,6 +211,7 @@ struct vhost_user {
>  	size_t size;
>  	struct vhost_user_connection *conn;
>  	int ret;
> +	struct virtio_net *dev;
>  
>  	if (vsocket == NULL)
>  		return;
> @@ -241,6 +243,13 @@ struct vhost_user {
>  	if (vsocket->linearbuf)
>  		vhost_enable_linearbuf(vid);
>  
> +	if (vsocket->async_copy) {
> +		dev = get_device(vid);
> +
> +		if (dev)
> +			dev->async_copy = 1;
> +	}
> +
>  	VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
>  
>  	if (vsocket->notify_ops->new_connection) {
> @@ -891,6 +900,17 @@ struct vhost_user_reconnect_list {
>  		goto out_mutex;
>  	}
>  
> +	vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
> +
> +	if (vsocket->async_copy &&
> +		(flags & (RTE_VHOST_USER_IOMMU_SUPPORT |
> +		RTE_VHOST_USER_POSTCOPY_SUPPORT))) {
> +		VHOST_LOG_CONFIG(ERR, "error: enabling async copy and IOMMU "
> +			"or post-copy feature simultaneously is not "
> +			"supported\n");
> +		goto out_mutex;
> +	}
> +

Have you ensured compatibility with the linearbuf feature (TSO)?
You will want to do the same kind of check here if it is not compatible.
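
If the two turn out to be incompatible, the check could mirror the
IOMMU/post-copy one above, e.g. (a sketch only, assuming incompatibility):

	if (vsocket->async_copy &&
		(flags & RTE_VHOST_USER_LINEARBUF_SUPPORT)) {
		VHOST_LOG_CONFIG(ERR, "error: enabling async copy and "
			"linear buffers simultaneously is not supported\n");
		goto out_mutex;
	}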

>  	/*
>  	 * Set the supported features correctly for the builtin vhost-user
>  	 * net driver.
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index 0266318..e6b688a 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -332,8 +332,13 @@
>  {
>  	if (vq_is_packed(dev))
>  		rte_free(vq->shadow_used_packed);
> -	else
> +	else {
>  		rte_free(vq->shadow_used_split);
> +		if (vq->async_pkts_pending)
> +			rte_free(vq->async_pkts_pending);
> +		if (vq->async_pending_info)
> +			rte_free(vq->async_pending_info);
> +	}
>  	rte_free(vq->batch_copy_elems);
>  	rte_mempool_free(vq->iotlb_pool);
>  	rte_free(vq);
> @@ -1527,3 +1532,70 @@ int rte_vhost_extern_callback_register(int vid,
>  	if (vhost_data_log_level >= 0)
>  		rte_log_set_level(vhost_data_log_level, RTE_LOG_WARNING);
>  }
> +
> +int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
> +					uint32_t features,
> +					struct rte_vhost_async_channel_ops *ops)
> +{
> +	struct vhost_virtqueue *vq;
> +	struct virtio_net *dev = get_device(vid);
> +	struct dma_channel_features f;
> +
> +	if (dev == NULL || ops == NULL)
> +		return -1;
> +
> +	f.intval = features;
> +
> +	vq = dev->virtqueue[queue_id];
> +
> +	if (vq == NULL)
> +		return -1;
> +
> +	/** packed queue is not supported */
> +	if (vq_is_packed(dev) || !f.inorder)
> +		return -1;

You might want to print an error message to help the user understand
why it fails.
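
For example (a sketch reusing the existing config log macro):

	/** packed queue is not supported */
	if (vq_is_packed(dev) || !f.inorder) {
		VHOST_LOG_CONFIG(ERR,
			"async copy is not supported on packed queues or "
			"with out-of-order DMA channels (vid %d, qid %d)\n",
			vid, queue_id);
		return -1;
	}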

> +
> +	if (ops->check_completed_copies == NULL ||
> +		ops->transfer_data == NULL)
> +		return -1;
> +
> +	rte_spinlock_lock(&vq->access_lock);
> +
> +	vq->async_ops.check_completed_copies = ops->check_completed_copies;
> +	vq->async_ops.transfer_data = ops->transfer_data;
> +
> +	vq->async_inorder = f.inorder;
> +	vq->async_threshold = f.threshold;
> +
> +	vq->async_registered = true;
> +
> +	rte_spinlock_unlock(&vq->access_lock);
> +
> +	return 0;
> +}
> +
> +int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
> +{
> +	struct vhost_virtqueue *vq;
> +	struct virtio_net *dev = get_device(vid);
> +
> +	if (dev == NULL)
> +		return -1;
> +
> +	vq = dev->virtqueue[queue_id];
> +
> +	if (vq == NULL)
> +		return -1;
> +
> +	rte_spinlock_lock(&vq->access_lock);

We might want to wait until all async transfers are done before
unregistering?

> +
> +	vq->async_ops.transfer_data = NULL;
> +	vq->async_ops.check_completed_copies = NULL;
> +
> +	vq->async_registered = false;
> +
> +	rte_spinlock_unlock(&vq->access_lock);
> +
> +	return 0;
> +}
> +
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index df98d15..a7fbe23 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -23,6 +23,8 @@
>  #include "rte_vhost.h"
>  #include "rte_vdpa.h"
>  
> +#include "rte_vhost_async.h"
> +
>  /* Used to indicate that the device is running on a data core */
>  #define VIRTIO_DEV_RUNNING 1
>  /* Used to indicate that the device is ready to operate */
> @@ -39,6 +41,11 @@
>  
>  #define VHOST_LOG_CACHE_NR 32
>  
> +#define MAX_PKT_BURST 32
> +
> +#define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST * 2)
> +#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
> +
>  #define PACKED_DESC_ENQUEUE_USED_FLAG(w)	\
>  	((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \
>  		VRING_DESC_F_WRITE)
> @@ -200,6 +207,25 @@ struct vhost_virtqueue {
>  	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_list;
>  	int				iotlb_cache_nr;
>  	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_pending_list;
> +
> +	/* operation callbacks for async dma */
> +	struct rte_vhost_async_channel_ops	async_ops;
> +
> +	struct iov_it it_pool[VHOST_MAX_ASYNC_IT];
> +	struct iovec vec_pool[VHOST_MAX_ASYNC_VEC];
> +
> +	/* async data transfer status */
> +	uintptr_t	**async_pkts_pending;
> +	#define		ASYNC_PENDING_INFO_N_MSK 0xFFFF
> +	#define		ASYNC_PENDING_INFO_N_SFT 16
> +	uint64_t	*async_pending_info;
> +	uint16_t	async_pkts_idx;
> +	uint16_t	async_pkts_inflight_n;
> +
> +	/* vq async features */
> +	bool		async_inorder;
> +	bool		async_registered;
> +	uint16_t	async_threshold;
>  } __rte_cache_aligned;
>  
>  /* Old kernels have no such macros defined */
> @@ -353,6 +379,7 @@ struct virtio_net {
>  	int16_t			broadcast_rarp;
>  	uint32_t		nr_vring;
>  	int			dequeue_zero_copy;
> +	int			async_copy;
>  	int			extbuf;
>  	int			linearbuf;
>  	struct vhost_virtqueue	*virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
> @@ -702,7 +729,8 @@ uint64_t translate_log_addr(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	/* Don't kick guest if we don't reach index specified by guest. */
>  	if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) {
>  		uint16_t old = vq->signalled_used;
> -		uint16_t new = vq->last_used_idx;
> +		uint16_t new = vq->async_pkts_inflight_n ?
> +					vq->used->idx:vq->last_used_idx;
>  		bool signalled_used_valid = vq->signalled_used_valid;
>  
>  		vq->signalled_used = new;
> diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> index 84bebad..d7600bf 100644
> --- a/lib/librte_vhost/vhost_user.c
> +++ b/lib/librte_vhost/vhost_user.c
> @@ -464,12 +464,25 @@
>  	} else {
>  		if (vq->shadow_used_split)
>  			rte_free(vq->shadow_used_split);
> +		if (vq->async_pkts_pending)
> +			rte_free(vq->async_pkts_pending);
> +		if (vq->async_pending_info)
> +			rte_free(vq->async_pending_info);
> +
>  		vq->shadow_used_split = rte_malloc(NULL,
>  				vq->size * sizeof(struct vring_used_elem),
>  				RTE_CACHE_LINE_SIZE);
> -		if (!vq->shadow_used_split) {
> +		vq->async_pkts_pending = rte_malloc(NULL,
> +				vq->size * sizeof(uintptr_t),
> +				RTE_CACHE_LINE_SIZE);
> +		vq->async_pending_info = rte_malloc(NULL,
> +				vq->size * sizeof(uint64_t),
> +				RTE_CACHE_LINE_SIZE);
> +		if (!vq->shadow_used_split ||
> +			!vq->async_pkts_pending ||
> +			!vq->async_pending_info) {
>  			VHOST_LOG_CONFIG(ERR,
> -					"failed to allocate memory for shadow used ring.\n");
> +					"failed to allocate memory for vq internal data.\n");
>  			return RTE_VHOST_MSG_RESULT_ERR;
>  		}
>  	}
> @@ -1147,7 +1160,8 @@
>  			goto err_mmap;
>  		}
>  
> -		populate = (dev->dequeue_zero_copy) ? MAP_POPULATE : 0;
> +		populate = (dev->dequeue_zero_copy || dev->async_copy) ?
> +			MAP_POPULATE : 0;
>  		mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
>  				 MAP_SHARED | populate, fd, 0);
>  
> @@ -1162,7 +1176,7 @@
>  		reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
>  				      mmap_offset;
>  
> -		if (dev->dequeue_zero_copy)
> +		if (dev->dequeue_zero_copy || dev->async_copy)
>  			if (add_guest_pages(dev, reg, alignment) < 0) {
>  				VHOST_LOG_CONFIG(ERR,
>  					"adding guest pages to region %u failed.\n",
> @@ -1945,6 +1959,12 @@ static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused,
>  	} else {
>  		rte_free(vq->shadow_used_split);
>  		vq->shadow_used_split = NULL;
> +		if (vq->async_pkts_pending)
> +			rte_free(vq->async_pkts_pending);
> +		if (vq->async_pending_info)
> +			rte_free(vq->async_pending_info);
> +		vq->async_pkts_pending = NULL;
> +		vq->async_pending_info = NULL;
>  	}
>  
>  	rte_free(vq->batch_copy_elems);
>
  
Maxime Coquelin June 26, 2020, 2:44 p.m. UTC | #6
On 6/11/20 12:02 PM, patrick.fu@intel.com wrote:
> From: Patrick <patrick.fu@intel.com>
> 
> This patch introduces registration/un-registration APIs
> for async data path together with all required data
> structures and DMA callback function proto-types.
> 
> Signed-off-by: Patrick <patrick.fu@intel.com>
> ---
>  lib/librte_vhost/Makefile          |   3 +-
>  lib/librte_vhost/rte_vhost.h       |   1 +
>  lib/librte_vhost/rte_vhost_async.h | 134 +++++++++++++++++++++++++++++++++++++
>  lib/librte_vhost/socket.c          |  20 ++++++
>  lib/librte_vhost/vhost.c           |  74 +++++++++++++++++++-
>  lib/librte_vhost/vhost.h           |  30 ++++++++-
>  lib/librte_vhost/vhost_user.c      |  28 ++++++--
>  7 files changed, 283 insertions(+), 7 deletions(-)
>  create mode 100644 lib/librte_vhost/rte_vhost_async.h
> 

> +/**
> + * register a dma channel for vhost
> + *
> + * @param vid
> + *  vhost device id DMA channel to be attached to
> + * @param queue_id
> + *  vhost queue id DMA channel to be attached to
> + * @param features
> + *  DMA channel feature bit
> + *    b0       : DMA supports inorder data transfer
> + *    b1  - b15: reserved
> + *    b16 - b27: Packet length threshold for DMA transfer
> + *    b28 - b31: reserved
> + * @param ops
> + *  DMA operation callbacks
> + * @return
> + *  0 on success, -1 on failures
> + */
> +int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
> +	uint32_t features, struct rte_vhost_async_channel_ops *ops);
> +
> +/**
> + * unregister a dma channel for vhost
> + *
> + * @param vid
> + *  vhost device id DMA channel to be detached
> + * @param queue_id
> + *  vhost queue id DMA channel to be detached
> + * @return
> + *  0 on success, -1 on failures
> + */
> +int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id);

These new APIs need to be tagged as experimental. We'll need a few
releases before considering them stable.

You need to add them to rte_vhost_version.map too.
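
That would look roughly like this (a sketch; the map file section follows
the usual DPDK EXPERIMENTAL convention):

/* rte_vhost_async.h */
__rte_experimental
int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
	uint32_t features, struct rte_vhost_async_channel_ops *ops);

__rte_experimental
int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id);

# rte_vhost_version.map
EXPERIMENTAL {
	global:

	rte_vhost_async_channel_register;
	rte_vhost_async_channel_unregister;
};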
  
Patrick Fu June 29, 2020, 1:15 a.m. UTC | #7
Hi Maxime,

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Friday, June 26, 2020 10:29 PM
> To: Fu, Patrick <patrick.fu@intel.com>; dev@dpdk.org; Xia, Chenbo
> <chenbo.xia@intel.com>; Wang, Zhihong <zhihong.wang@intel.com>; Ye,
> Xiaolong <xiaolong.ye@intel.com>
> Cc: Jiang, Cheng1 <cheng1.jiang@intel.com>; Liang, Cunming
> <cunming.liang@intel.com>
> Subject: Re: [PATCH v1 1/2] vhost: introduce async data path registration API
> 
> 
> 
> On 6/11/20 12:02 PM, patrick.fu@intel.com wrote:
> > From: Patrick <patrick.fu@intel.com>
> >
> > diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
> > index 0a66ef9..f817783 100644
> > --- a/lib/librte_vhost/socket.c
> > +++ b/lib/librte_vhost/socket.c
> > @@ -42,6 +42,7 @@ struct vhost_user_socket {
> >  	bool use_builtin_virtio_net;
> >  	bool extbuf;
> >  	bool linearbuf;
> > +	bool async_copy;
> >
> >  	/*
> >  	 * The "supported_features" indicates the feature bits the
> > @@ -210,6 +211,7 @@ struct vhost_user {
> >  	size_t size;
> >  	struct vhost_user_connection *conn;
> >  	int ret;
> > +	struct virtio_net *dev;
> >
> >  	if (vsocket == NULL)
> >  		return;
> > @@ -241,6 +243,13 @@ struct vhost_user {
> >  	if (vsocket->linearbuf)
> >  		vhost_enable_linearbuf(vid);
> >
> > +	if (vsocket->async_copy) {
> > +		dev = get_device(vid);
> > +
> > +		if (dev)
> > +			dev->async_copy = 1;
> > +	}
> > +
> >  	VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
> >
> >  	if (vsocket->notify_ops->new_connection) {
> > @@ -891,6 +900,17 @@ struct vhost_user_reconnect_list {
> >  		goto out_mutex;
> >  	}
> >
> > +	vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
> > +
> > +	if (vsocket->async_copy &&
> > +		(flags & (RTE_VHOST_USER_IOMMU_SUPPORT |
> > +		RTE_VHOST_USER_POSTCOPY_SUPPORT))) {
> > +		VHOST_LOG_CONFIG(ERR, "error: enabling async copy and IOMMU "
> > +			"or post-copy feature simultaneously is not "
> > +			"supported\n");
> > +		goto out_mutex;
> > +	}
> > +
> 
> Have you ensured compatibility with the linearbuf feature (TSO)?
> You will want to do the same kind of check here if it is not compatible.
> 
I think this concern comes primarily from the dequeue side. As the current patch is for enqueue only,
a linearbuf check doesn't seem necessary. For a future dequeue implementation, we may
need to add an additional feature bit to let the application decide whether the async callback is
compatible with linearbuf or not. A real hardware DMA engine should usually support
linearbuf; a pure software backend (something like dequeue-zero-copy) may not.
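
If that future bit materializes, one of the reserved fields could carry it,
e.g. (purely hypothetical, not part of this series):

struct dma_channel_features {
	union {
		uint32_t intval;
		struct {
			uint32_t inorder:1;
			uint32_t linearbuf:1; /* callback supports linear buffers */
			uint32_t resvd0215:14;
			uint32_t threshold:12;
			uint32_t resvd2831:4;
		};
	};
};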

> >  	/*
> >  	 * Set the supported features correctly for the builtin vhost-user
> >  	 * net driver.
> > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> > index 0266318..e6b688a 100644
> > --- a/lib/librte_vhost/vhost.c
> > +++ b/lib/librte_vhost/vhost.c
> > @@ -332,8 +332,13 @@
> >  {
> >  	if (vq_is_packed(dev))
> >  		rte_free(vq->shadow_used_packed);
> > -	else
> > +	else {
> >  		rte_free(vq->shadow_used_split);
> > +		if (vq->async_pkts_pending)
> > +			rte_free(vq->async_pkts_pending);
> > +		if (vq->async_pending_info)
> > +			rte_free(vq->async_pending_info);
> > +	}
> >  	rte_free(vq->batch_copy_elems);
> >  	rte_mempool_free(vq->iotlb_pool);
> >  	rte_free(vq);
> > @@ -1527,3 +1532,70 @@ int rte_vhost_extern_callback_register(int vid,
> >  	if (vhost_data_log_level >= 0)
> >  		rte_log_set_level(vhost_data_log_level, RTE_LOG_WARNING);
> >  }
> > +
> > +int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
> > +					uint32_t features,
> > +					struct rte_vhost_async_channel_ops *ops)
> > +{
> > +	struct vhost_virtqueue *vq;
> > +	struct virtio_net *dev = get_device(vid);
> > +	struct dma_channel_features f;
> > +
> > +	if (dev == NULL || ops == NULL)
> > +		return -1;
> > +
> > +	f.intval = features;
> > +
> > +	vq = dev->virtqueue[queue_id];
> > +
> > +	if (vq == NULL)
> > +		return -1;
> > +
> > +	/** packed queue is not supported */
> > +	if (vq_is_packed(dev) || !f.inorder)
> > +		return -1;
> 
> You might want to print an error message to help the user understand
> why it fails.
> 
Will update in the v2 patch.

> > +
> > +	if (ops->check_completed_copies == NULL ||
> > +		ops->transfer_data == NULL)
> > +		return -1;
> > +
> > +	rte_spinlock_lock(&vq->access_lock);
> > +
> > +	vq->async_ops.check_completed_copies = ops->check_completed_copies;
> > +	vq->async_ops.transfer_data = ops->transfer_data;
> > +
> > +	vq->async_inorder = f.inorder;
> > +	vq->async_threshold = f.threshold;
> > +
> > +	vq->async_registered = true;
> > +
> > +	rte_spinlock_unlock(&vq->access_lock);
> > +
> > +	return 0;
> > +}
> > +
> > +int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
> > +{
> > +	struct vhost_virtqueue *vq;
> > +	struct virtio_net *dev = get_device(vid);
> > +
> > +	if (dev == NULL)
> > +		return -1;
> > +
> > +	vq = dev->virtqueue[queue_id];
> > +
> > +	if (vq == NULL)
> > +		return -1;
> > +
> > +	rte_spinlock_lock(&vq->access_lock);
> 
> We might want to wait until all async transfers are done before unregistering?
> 
This could be a little bit tricky. In our async enqueue API design, applications send mbufs to the DMA engine
through one API call and get the finished mbufs back from another API call. If we want to drain the DMA buffer in this
unregistration API, we need to either return the mbufs from unregistration, or save them and return them somewhere else.
I'm wondering whether it's better to just report an error from unregistration when in-flight packets exist, and rely on the
application to poll out all in-flight packets first.
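
A sketch of that error-out alternative in the unregister path (using only
fields already defined in this patch):

	rte_spinlock_lock(&vq->access_lock);

	if (vq->async_pkts_inflight_n) {
		VHOST_LOG_CONFIG(ERR, "failed to unregister async channel: "
			"%u packets are still in flight, poll completions "
			"before unregistering\n", vq->async_pkts_inflight_n);
		rte_spinlock_unlock(&vq->access_lock);
		return -1;
	}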
  

Patch

diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
index e592795..3aed094 100644
--- a/lib/librte_vhost/Makefile
+++ b/lib/librte_vhost/Makefile
@@ -41,7 +41,8 @@  SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \
 					vhost_user.c virtio_net.c vdpa.c
 
 # install includes
-SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h
+SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vdpa.h \
+						rte_vhost_async.h
 
 # only compile vhost crypto when cryptodev is enabled
 ifeq ($(CONFIG_RTE_LIBRTE_CRYPTODEV),y)
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index d43669f..cec4d07 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -35,6 +35,7 @@ 
 #define RTE_VHOST_USER_EXTBUF_SUPPORT	(1ULL << 5)
 /* support only linear buffers (no chained mbufs) */
 #define RTE_VHOST_USER_LINEARBUF_SUPPORT	(1ULL << 6)
+#define RTE_VHOST_USER_ASYNC_COPY	(1ULL << 7)
 
 /** Protocol features. */
 #ifndef VHOST_USER_PROTOCOL_F_MQ
diff --git a/lib/librte_vhost/rte_vhost_async.h b/lib/librte_vhost/rte_vhost_async.h
new file mode 100644
index 0000000..82f2ebe
--- /dev/null
+++ b/lib/librte_vhost/rte_vhost_async.h
@@ -0,0 +1,134 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef _RTE_VHOST_ASYNC_H_
+#define _RTE_VHOST_ASYNC_H_
+
+#include "rte_vhost.h"
+
+/**
+ * iovec iterator
+ */
+struct iov_it {
+	/** offset to the first byte of interesting data */
+	size_t offset;
+	/** total bytes of data in this iterator */
+	size_t count;
+	/** pointer to the iovec array */
+	struct iovec *iov;
+	/** number of iovec in this iterator */
+	unsigned long nr_segs;
+};
+
+/**
+ * dma transfer descriptor pair
+ */
+struct dma_trans_desc {
+	/** source memory iov_it */
+	struct iov_it *src;
+	/** destination memory iov_it */
+	struct iov_it *dst;
+};
+
+/**
+ * dma transfer status
+ */
+struct dma_trans_status {
+	/** An array of application specific data for source memory */
+	uintptr_t *src_opaque_data;
+	/** An array of application specific data for destination memory */
+	uintptr_t *dst_opaque_data;
+};
+
+/**
+ * dma operation callbacks to be implemented by applications
+ */
+struct rte_vhost_async_channel_ops {
+	/**
+	 * instruct a DMA channel to perform copies for a batch of packets
+	 *
+	 * @param vid
+	 *  id of vhost device to perform data copies
+	 * @param queue_id
+	 *  queue id to perform data copies
+	 * @param descs
+	 *  an array of DMA transfer memory descriptors
+	 * @param opaque_data
+	 *  opaque data pair sending to DMA engine
+	 * @param count
+	 *  number of elements in the "descs" array
+	 * @return
+	 *  -1 on failure, number of descs processed on success
+	 */
+	int (*transfer_data)(int vid, uint16_t queue_id,
+		struct dma_trans_desc *descs,
+		struct dma_trans_status *opaque_data,
+		uint16_t count);
+	/**
+	 * check copy-completed packets from a DMA channel
+	 * @param vid
+	 *  id of vhost device to check copy completion
+	 * @param queue_id
+	 *  queue id to check copy completion
+	 * @param opaque_data
+	 *  buffer to receive the opaque data pair from DMA engine
+	 * @param max_packets
+	 *  max number of packets could be completed
+	 * @return
+	 *  -1 on failure, number of iov segments completed on success
+	 */
+	int (*check_completed_copies)(int vid, uint16_t queue_id,
+		struct dma_trans_status *opaque_data,
+		uint16_t max_packets);
+};
+
+/**
+ *  dma channel feature bit definition
+ */
+struct dma_channel_features {
+	union {
+		uint32_t intval;
+		struct {
+			uint32_t inorder:1;
+			uint32_t resvd0115:15;
+			uint32_t threshold:12;
+			uint32_t resvd2831:4;
+		};
+	};
+};
+
+/**
+ * register a dma channel for vhost
+ *
+ * @param vid
+ *  vhost device id DMA channel to be attached to
+ * @param queue_id
+ *  vhost queue id DMA channel to be attached to
+ * @param features
+ *  DMA channel feature bit
+ *    b0       : DMA supports inorder data transfer
+ *    b1  - b15: reserved
+ *    b16 - b27: Packet length threshold for DMA transfer
+ *    b28 - b31: reserved
+ * @param ops
+ *  DMA operation callbacks
+ * @return
+ *  0 on success, -1 on failures
+ */
+int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
+	uint32_t features, struct rte_vhost_async_channel_ops *ops);
+
+/**
+ * unregister a dma channel for vhost
+ *
+ * @param vid
+ *  vhost device id DMA channel to be detached
+ * @param queue_id
+ *  vhost queue id DMA channel to be detached
+ * @return
+ *  0 on success, -1 on failures
+ */
+int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id);
+
+#endif /* _RTE_VHOST_ASYNC_H_ */
diff --git a/lib/librte_vhost/socket.c b/lib/librte_vhost/socket.c
index 0a66ef9..f817783 100644
--- a/lib/librte_vhost/socket.c
+++ b/lib/librte_vhost/socket.c
@@ -42,6 +42,7 @@  struct vhost_user_socket {
 	bool use_builtin_virtio_net;
 	bool extbuf;
 	bool linearbuf;
+	bool async_copy;
 
 	/*
 	 * The "supported_features" indicates the feature bits the
@@ -210,6 +211,7 @@  struct vhost_user {
 	size_t size;
 	struct vhost_user_connection *conn;
 	int ret;
+	struct virtio_net *dev;
 
 	if (vsocket == NULL)
 		return;
@@ -241,6 +243,13 @@  struct vhost_user {
 	if (vsocket->linearbuf)
 		vhost_enable_linearbuf(vid);
 
+	if (vsocket->async_copy) {
+		dev = get_device(vid);
+
+		if (dev)
+			dev->async_copy = 1;
+	}
+
 	VHOST_LOG_CONFIG(INFO, "new device, handle is %d\n", vid);
 
 	if (vsocket->notify_ops->new_connection) {
@@ -891,6 +900,17 @@  struct vhost_user_reconnect_list {
 		goto out_mutex;
 	}
 
+	vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
+
+	if (vsocket->async_copy &&
+		(flags & (RTE_VHOST_USER_IOMMU_SUPPORT |
+		RTE_VHOST_USER_POSTCOPY_SUPPORT))) {
+		VHOST_LOG_CONFIG(ERR, "error: enabling async copy and IOMMU "
+			"or post-copy feature simultaneously is not "
+			"supported\n");
+		goto out_mutex;
+	}
+
 	/*
 	 * Set the supported features correctly for the builtin vhost-user
 	 * net driver.
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 0266318..e6b688a 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -332,8 +332,13 @@ 
 {
 	if (vq_is_packed(dev))
 		rte_free(vq->shadow_used_packed);
-	else
+	else {
 		rte_free(vq->shadow_used_split);
+		if (vq->async_pkts_pending)
+			rte_free(vq->async_pkts_pending);
+		if (vq->async_pending_info)
+			rte_free(vq->async_pending_info);
+	}
 	rte_free(vq->batch_copy_elems);
 	rte_mempool_free(vq->iotlb_pool);
 	rte_free(vq);
@@ -1527,3 +1532,70 @@  int rte_vhost_extern_callback_register(int vid,
 	if (vhost_data_log_level >= 0)
 		rte_log_set_level(vhost_data_log_level, RTE_LOG_WARNING);
 }
+
+int rte_vhost_async_channel_register(int vid, uint16_t queue_id,
+					uint32_t features,
+					struct rte_vhost_async_channel_ops *ops)
+{
+	struct vhost_virtqueue *vq;
+	struct virtio_net *dev = get_device(vid);
+	struct dma_channel_features f;
+
+	if (dev == NULL || ops == NULL)
+		return -1;
+
+	f.intval = features;
+
+	vq = dev->virtqueue[queue_id];
+
+	if (vq == NULL)
+		return -1;
+
+	/** packed queue is not supported */
+	if (vq_is_packed(dev) || !f.inorder)
+		return -1;
+
+	if (ops->check_completed_copies == NULL ||
+		ops->transfer_data == NULL)
+		return -1;
+
+	rte_spinlock_lock(&vq->access_lock);
+
+	vq->async_ops.check_completed_copies = ops->check_completed_copies;
+	vq->async_ops.transfer_data = ops->transfer_data;
+
+	vq->async_inorder = f.inorder;
+	vq->async_threshold = f.threshold;
+
+	vq->async_registered = true;
+
+	rte_spinlock_unlock(&vq->access_lock);
+
+	return 0;
+}
+
+int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
+{
+	struct vhost_virtqueue *vq;
+	struct virtio_net *dev = get_device(vid);
+
+	if (dev == NULL)
+		return -1;
+
+	vq = dev->virtqueue[queue_id];
+
+	if (vq == NULL)
+		return -1;
+
+	rte_spinlock_lock(&vq->access_lock);
+
+	vq->async_ops.transfer_data = NULL;
+	vq->async_ops.check_completed_copies = NULL;
+
+	vq->async_registered = false;
+
+	rte_spinlock_unlock(&vq->access_lock);
+
+	return 0;
+}
+
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index df98d15..a7fbe23 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -23,6 +23,8 @@ 
 #include "rte_vhost.h"
 #include "rte_vdpa.h"
 
+#include "rte_vhost_async.h"
+
 /* Used to indicate that the device is running on a data core */
 #define VIRTIO_DEV_RUNNING 1
 /* Used to indicate that the device is ready to operate */
@@ -39,6 +41,11 @@ 
 
 #define VHOST_LOG_CACHE_NR 32
 
+#define MAX_PKT_BURST 32
+
+#define VHOST_MAX_ASYNC_IT (MAX_PKT_BURST * 2)
+#define VHOST_MAX_ASYNC_VEC (BUF_VECTOR_MAX * 2)
+
 #define PACKED_DESC_ENQUEUE_USED_FLAG(w)	\
 	((w) ? (VRING_DESC_F_AVAIL | VRING_DESC_F_USED | VRING_DESC_F_WRITE) : \
 		VRING_DESC_F_WRITE)
@@ -200,6 +207,25 @@  struct vhost_virtqueue {
 	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_list;
 	int				iotlb_cache_nr;
 	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_pending_list;
+
+	/* operation callbacks for async dma */
+	struct rte_vhost_async_channel_ops	async_ops;
+
+	struct iov_it it_pool[VHOST_MAX_ASYNC_IT];
+	struct iovec vec_pool[VHOST_MAX_ASYNC_VEC];
+
+	/* async data transfer status */
+	uintptr_t	**async_pkts_pending;
+	#define		ASYNC_PENDING_INFO_N_MSK 0xFFFF
+	#define		ASYNC_PENDING_INFO_N_SFT 16
+	uint64_t	*async_pending_info;
+	uint16_t	async_pkts_idx;
+	uint16_t	async_pkts_inflight_n;
+
+	/* vq async features */
+	bool		async_inorder;
+	bool		async_registered;
+	uint16_t	async_threshold;
 } __rte_cache_aligned;
 
 /* Old kernels have no such macros defined */
@@ -353,6 +379,7 @@  struct virtio_net {
 	int16_t			broadcast_rarp;
 	uint32_t		nr_vring;
 	int			dequeue_zero_copy;
+	int			async_copy;
 	int			extbuf;
 	int			linearbuf;
 	struct vhost_virtqueue	*virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];
@@ -702,7 +729,8 @@  uint64_t translate_log_addr(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	/* Don't kick guest if we don't reach index specified by guest. */
 	if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) {
 		uint16_t old = vq->signalled_used;
-		uint16_t new = vq->last_used_idx;
+		uint16_t new = vq->async_pkts_inflight_n ?
+					vq->used->idx:vq->last_used_idx;
 		bool signalled_used_valid = vq->signalled_used_valid;
 
 		vq->signalled_used = new;
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index 84bebad..d7600bf 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -464,12 +464,25 @@ 
 	} else {
 		if (vq->shadow_used_split)
 			rte_free(vq->shadow_used_split);
+		if (vq->async_pkts_pending)
+			rte_free(vq->async_pkts_pending);
+		if (vq->async_pending_info)
+			rte_free(vq->async_pending_info);
+
 		vq->shadow_used_split = rte_malloc(NULL,
 				vq->size * sizeof(struct vring_used_elem),
 				RTE_CACHE_LINE_SIZE);
-		if (!vq->shadow_used_split) {
+		vq->async_pkts_pending = rte_malloc(NULL,
+				vq->size * sizeof(uintptr_t),
+				RTE_CACHE_LINE_SIZE);
+		vq->async_pending_info = rte_malloc(NULL,
+				vq->size * sizeof(uint64_t),
+				RTE_CACHE_LINE_SIZE);
+		if (!vq->shadow_used_split ||
+			!vq->async_pkts_pending ||
+			!vq->async_pending_info) {
 			VHOST_LOG_CONFIG(ERR,
-					"failed to allocate memory for shadow used ring.\n");
+					"failed to allocate memory for vq internal data.\n");
 			return RTE_VHOST_MSG_RESULT_ERR;
 		}
 	}
@@ -1147,7 +1160,8 @@ 
 			goto err_mmap;
 		}
 
-		populate = (dev->dequeue_zero_copy) ? MAP_POPULATE : 0;
+		populate = (dev->dequeue_zero_copy || dev->async_copy) ?
+			MAP_POPULATE : 0;
 		mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
 				 MAP_SHARED | populate, fd, 0);
 
@@ -1162,7 +1176,7 @@ 
 		reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr +
 				      mmap_offset;
 
-		if (dev->dequeue_zero_copy)
+		if (dev->dequeue_zero_copy || dev->async_copy)
 			if (add_guest_pages(dev, reg, alignment) < 0) {
 				VHOST_LOG_CONFIG(ERR,
 					"adding guest pages to region %u failed.\n",
@@ -1945,6 +1959,12 @@  static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused,
 	} else {
 		rte_free(vq->shadow_used_split);
 		vq->shadow_used_split = NULL;
+		if (vq->async_pkts_pending)
+			rte_free(vq->async_pkts_pending);
+		if (vq->async_pending_info)
+			rte_free(vq->async_pending_info);
+		vq->async_pkts_pending = NULL;
+		vq->async_pending_info = NULL;
 	}
 
 	rte_free(vq->batch_copy_elems);