From patchwork Mon Aug 23 09:53:55 2021
X-Patchwork-Submitter: Sunil Pai G
X-Patchwork-Id: 97222
X-Patchwork-Delegate: maxime.coquelin@redhat.com
From: Sunil Pai G <sunil.pai.g@intel.com>
To: dev@dpdk.org
Cc: harry.van.haaren@intel.com, bruce.richardson@intel.com, Jiayu.Hu@intel.com,
 Cian.Ferriter@intel.com, john.mcnamara@intel.com, qian.q.xu@intel.com,
 ian.stokes@intel.com, sunil.pai.g@intel.com
Date: Mon, 23 Aug 2021 09:53:55 +0000
Message-Id: <20210823095355.2478423-2-sunil.pai.g@intel.com>
X-Mailer: git-send-email 2.25.1
In-Reply-To: <20210823095355.2478423-1-sunil.pai.g@intel.com>
References: <20210823095355.2478423-1-sunil.pai.g@intel.com>
Subject: [dpdk-dev] [PATCH RFC 1/1] vhost: add DMADEV support for async datapath

This patch simplifies async data path enablement for applications: instead of
implementing their own logic to enable DMA offload, applications now only pass
the DMADEV ID as a parameter.

Remove the transfer_data and check_completed_copies callbacks and use the
generic DMADEV APIs to perform packet copies for the vhost async datapath.
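Purely for illustration, a minimal sketch of how an application might drive the
reworked API (not part of this patch). It assumes a dmadev identified by
dmadev_id that the application has already configured and started; the names
app_enqueue_sketch, APP_MAX_BURST and the 256-byte threshold are arbitrary
placeholders.

#include <stdio.h>
#include <rte_mbuf.h>
#include <rte_vhost.h>
#include <rte_vhost_async.h>

#define APP_MAX_BURST 32	/* arbitrary example burst size */

/* Enqueue-path sketch; channel registration would normally happen once at
 * setup time and is shown inline only to keep the example short. */
static void
app_enqueue_sketch(int vid, uint16_t queue_id, int dmadev_id,
		struct rte_mbuf **pkts, uint16_t nb_pkts)
{
	struct rte_mbuf *comp_pkts[APP_MAX_BURST];
	struct rte_mbuf *done_pkts[APP_MAX_BURST];
	uint32_t comp_count = 0;

	/* No DMA callbacks any more: only the config is passed. */
	struct rte_vhost_async_config config = { .async_threshold = 256 };

	if (rte_vhost_async_channel_register(vid, queue_id, config) < 0)
		return;

	/* Copies above the threshold are offloaded to dmadev_id; packets
	 * finished by SW copy are returned immediately in comp_pkts. */
	uint16_t n_enq = rte_vhost_submit_enqueue_burst(vid, queue_id,
			pkts, nb_pkts, comp_pkts, &comp_count, dmadev_id);

	/* Poll the same dmadev later to reap DMA-completed packets. */
	uint16_t n_done = rte_vhost_poll_enqueue_completed(vid, queue_id,
			done_pkts, APP_MAX_BURST, dmadev_id);

	printf("enqueued %u (%u done in SW), reaped %u\n",
			(unsigned)n_enq, (unsigned)comp_count, (unsigned)n_done);
}

Compared with the current API, the transfer_data and check_completed_copies
callbacks disappear: DMA submission and completion polling now happen inside
vhost against the dmadev selected per call.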
Signed-off-by: Sunil Pai G --- lib/vhost/meson.build | 2 +- lib/vhost/rte_vhost_async.h | 55 +------ lib/vhost/vhost.c | 46 +++--- lib/vhost/vhost.h | 24 ++- lib/vhost/virtio_net.c | 311 +++++++++++++++++++++++++++++++----- 5 files changed, 316 insertions(+), 122 deletions(-) diff --git a/lib/vhost/meson.build b/lib/vhost/meson.build index 2d8fe0239f..bea17ed4f5 100644 --- a/lib/vhost/meson.build +++ b/lib/vhost/meson.build @@ -34,4 +34,4 @@ headers = files( 'rte_vhost_async.h', 'rte_vhost_crypto.h', ) -deps += ['ethdev', 'cryptodev', 'hash', 'pci'] +deps += ['ethdev', 'cryptodev', 'hash', 'pci', 'dmadev'] diff --git a/lib/vhost/rte_vhost_async.h b/lib/vhost/rte_vhost_async.h index a37588188c..0d9706d52b 100644 --- a/lib/vhost/rte_vhost_async.h +++ b/lib/vhost/rte_vhost_async.h @@ -42,47 +42,8 @@ struct rte_vhost_async_status { }; /** - * dma operation callbacks to be implemented by applications + * in-flight async packet information */ -struct rte_vhost_async_channel_ops { - /** - * instruct async engines to perform copies for a batch of packets - * - * @param vid - * id of vhost device to perform data copies - * @param queue_id - * queue id to perform data copies - * @param descs - * an array of DMA transfer memory descriptors - * @param opaque_data - * opaque data pair sending to DMA engine - * @param count - * number of elements in the "descs" array - * @return - * number of descs processed, negative value means error - */ - int32_t (*transfer_data)(int vid, uint16_t queue_id, - struct rte_vhost_async_desc *descs, - struct rte_vhost_async_status *opaque_data, - uint16_t count); - /** - * check copy-completed packets from the async engine - * @param vid - * id of vhost device to check copy completion - * @param queue_id - * queue id to check copy completion - * @param opaque_data - * buffer to receive the opaque data pair from DMA engine - * @param max_packets - * max number of packets could be completed - * @return - * number of async descs completed, negative value means error - */ - int32_t (*check_completed_copies)(int vid, uint16_t queue_id, - struct rte_vhost_async_status *opaque_data, - uint16_t max_packets); -}; - struct async_nethdr { struct virtio_net_hdr hdr; bool valid; @@ -132,8 +93,7 @@ struct rte_vhost_async_config { */ __rte_experimental int rte_vhost_async_channel_register(int vid, uint16_t queue_id, - struct rte_vhost_async_config config, - struct rte_vhost_async_channel_ops *ops); + struct rte_vhost_async_config config); /** * Unregister an async channel for a vhost queue @@ -168,8 +128,7 @@ int rte_vhost_async_channel_unregister(int vid, uint16_t queue_id); */ __rte_experimental int rte_vhost_async_channel_register_thread_unsafe(int vid, uint16_t queue_id, - struct rte_vhost_async_config config, - struct rte_vhost_async_channel_ops *ops); + struct rte_vhost_async_config config); /** * Unregister an async channel for a vhost queue without performing any @@ -218,7 +177,7 @@ int rte_vhost_async_channel_unregister_thread_unsafe(int vid, __rte_experimental uint16_t rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id, struct rte_mbuf **pkts, uint16_t count, - struct rte_mbuf **comp_pkts, uint32_t *comp_count); + struct rte_mbuf **comp_pkts, uint32_t *comp_count, int dmadev_id); /** * This function checks async completion status for a specific vhost @@ -238,7 +197,7 @@ uint16_t rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id, */ __rte_experimental uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id, - struct rte_mbuf **pkts, uint16_t count); 
+ struct rte_mbuf **pkts, uint16_t count, int dmadev_id); /** * This function returns the amount of in-flight packets for the vhost @@ -274,7 +233,7 @@ int rte_vhost_async_get_inflight(int vid, uint16_t queue_id); */ __rte_experimental uint16_t rte_vhost_clear_queue_thread_unsafe(int vid, uint16_t queue_id, - struct rte_mbuf **pkts, uint16_t count); + struct rte_mbuf **pkts, uint16_t count, int dmadev_id); /** * This function tries to receive packets from the guest with offloading * large copies to the async channel. The packets that are transfer completed @@ -300,6 +259,6 @@ __rte_experimental uint16_t rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count, - int *nr_inflight); + int *nr_inflight, int dmadev_id); #endif /* _RTE_VHOST_ASYNC_H_ */ diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c index 355ff37651..3fdba5949a 100644 --- a/lib/vhost/vhost.c +++ b/lib/vhost/vhost.c @@ -340,6 +340,7 @@ cleanup_device(struct virtio_net *dev, int destroy) static void vhost_free_async_mem(struct vhost_virtqueue *vq) { + rte_free(vq->dma_completions); rte_free(vq->async_pkts_info); rte_free(vq->async_buffers_packed); @@ -350,6 +351,7 @@ vhost_free_async_mem(struct vhost_virtqueue *vq) rte_free(vq->it_pool); rte_free(vq->vec_pool); + vq->dma_completions = NULL; vq->async_pkts_info = NULL; vq->it_pool = NULL; vq->vec_pool = NULL; @@ -1621,8 +1623,7 @@ int rte_vhost_extern_callback_register(int vid, static __rte_always_inline int async_channel_register(int vid, uint16_t queue_id, - struct rte_vhost_async_config config, - struct rte_vhost_async_channel_ops *ops) + struct rte_vhost_async_config config) { struct virtio_net *dev = get_device(vid); struct vhost_virtqueue *vq = dev->virtqueue[queue_id]; @@ -1691,8 +1692,17 @@ async_channel_register(int vid, uint16_t queue_id, } } - vq->async_ops.check_completed_copies = ops->check_completed_copies; - vq->async_ops.transfer_data = ops->transfer_data; + vq->dma_completions = rte_malloc_socket(NULL, + sizeof(struct dma_completions_t), + RTE_CACHE_LINE_SIZE, vq->numa_node); + if (!vq->dma_completions) { + vhost_free_async_mem(vq); + VHOST_LOG_CONFIG(ERR, + "async register failed: cannot allocate memory dma_completions ring " + "(vid %d, qid: %d)\n", vid, queue_id); + return -1; + } + vq->async_threshold = config.async_threshold; vq->async_registered = true; @@ -1702,14 +1712,13 @@ async_channel_register(int vid, uint16_t queue_id, int rte_vhost_async_channel_register(int vid, uint16_t queue_id, - struct rte_vhost_async_config config, - struct rte_vhost_async_channel_ops *ops) + struct rte_vhost_async_config config) { struct vhost_virtqueue *vq; struct virtio_net *dev = get_device(vid); int ret; - if (dev == NULL || ops == NULL) + if (dev == NULL) return -1; if (queue_id >= VHOST_MAX_VRING) @@ -1727,12 +1736,8 @@ rte_vhost_async_channel_register(int vid, uint16_t queue_id, return -1; } - if (unlikely(ops->check_completed_copies == NULL || - ops->transfer_data == NULL)) - return -1; - rte_spinlock_lock(&vq->access_lock); - ret = async_channel_register(vid, queue_id, config, ops); + ret = async_channel_register(vid, queue_id, config); rte_spinlock_unlock(&vq->access_lock); return ret; @@ -1740,13 +1745,12 @@ rte_vhost_async_channel_register(int vid, uint16_t queue_id, int rte_vhost_async_channel_register_thread_unsafe(int vid, uint16_t queue_id, - struct rte_vhost_async_config config, - struct rte_vhost_async_channel_ops *ops) + struct rte_vhost_async_config config) { struct 
vhost_virtqueue *vq;
 	struct virtio_net *dev = get_device(vid);
 
-	if (dev == NULL || ops == NULL)
+	if (dev == NULL)
 		return -1;
 
 	if (queue_id >= VHOST_MAX_VRING)
@@ -1764,11 +1768,7 @@ rte_vhost_async_channel_register_thread_unsafe(int vid, uint16_t queue_id,
 		return -1;
 	}
 
-	if (unlikely(ops->check_completed_copies == NULL ||
-		ops->transfer_data == NULL))
-		return -1;
-
-	return async_channel_register(vid, queue_id, config, ops);
+	return async_channel_register(vid, queue_id, config);
 }
 
 int
@@ -1808,9 +1808,6 @@ rte_vhost_async_channel_unregister(int vid, uint16_t queue_id)
 	}
 
 	vhost_free_async_mem(vq);
-
-	vq->async_ops.transfer_data = NULL;
-	vq->async_ops.check_completed_copies = NULL;
 	vq->async_registered = false;
 
 out:
@@ -1846,9 +1843,6 @@ rte_vhost_async_channel_unregister_thread_unsafe(int vid, uint16_t queue_id)
 	}
 
 	vhost_free_async_mem(vq);
-
-	vq->async_ops.transfer_data = NULL;
-	vq->async_ops.check_completed_copies = NULL;
 	vq->async_registered = false;
 
 	return 0;
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index a2309b06cd..2c996c4414 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -120,6 +120,26 @@ struct vring_used_elem_packed {
 	uint32_t count;
 };
 
+
+/* vHost async DMADEV ring size. */
+#define VHOST_ASYNC_DMADEV_RING_SIZE 4096
+
+#define DMA_COMPLETION_RING_SIZE VHOST_ASYNC_DMADEV_RING_SIZE
+
+struct enq_info_t {
+	uint8_t pkt_rcvd; /* Make this atomic. */
+};
+
+/* DMA completion tracking ring to reorder the packets.
+ * The writes to the enq_info array should be atomic
+ * to guarantee correct behaviour. */
+struct dma_completions_t {
+	struct enq_info_t enq_info[DMA_COMPLETION_RING_SIZE];
+	uint16_t count;
+	uint16_t read_idx;
+	uint16_t write_idx;
+};
+
 /**
  * Structure contains variables relevant to RX/TX virtqueues.
  */
@@ -194,9 +214,6 @@ struct vhost_virtqueue {
 	struct rte_vhost_resubmit_info *resubmit_inflight;
 	uint64_t global_counter;
 
-	/* operation callbacks for async dma */
-	struct rte_vhost_async_channel_ops async_ops;
-
 	struct rte_vhost_iov_iter *it_pool;
 	struct iovec *vec_pool;
 
@@ -221,6 +238,7 @@ struct vhost_virtqueue {
 	/* vq async features */
 	bool async_registered;
 	uint32_t async_threshold;
+	struct dma_completions_t *dma_completions;
 
 	int notif_enable;
 #define VIRTIO_UNINITIALIZED_NOTIF (-1)
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index c69dc35988..5b1209bb91 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -1588,6 +1589,227 @@ rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
 	return virtio_dev_rx(dev, queue_id, pkts, count);
 }
 
+
+/* Checks if the dma_completion ring is full. */
+static inline bool
+is_compl_ring_full(struct dma_completions_t *dma_compl)
+{
+	return dma_compl->count == DMA_COMPLETION_RING_SIZE;
+}
+
+/* Checks if the dma_completion ring is empty. */
+static inline bool
+is_compl_ring_empty(struct dma_completions_t *dma_compl)
+{
+	return dma_compl->count == 0;
+}
+
+static void *dmadev_enq_track[RTE_DMADEV_MAX_DEVS][VHOST_ASYNC_DMADEV_RING_SIZE];
+
+/* Enqueue a packet via DMA. */
+static inline void
+dmadev_enqueue_packet(const uint16_t dev_id,
+		const struct rte_vhost_iov_iter *src_ptr,
+		const struct rte_vhost_iov_iter *dst_ptr,
+		const uint16_t nr_segs,
+		struct enq_info_t *slot_addr)
+{
+	uint16_t seg_idx = 0;
+	struct enq_info_t *addr = NULL;
+	uint64_t dma_flags = RTE_DMA_OP_FLAG_LLC;
+	const uint16_t dmadev_ring_mask = VHOST_ASYNC_DMADEV_RING_SIZE - 1;
+
+	while (likely(seg_idx < nr_segs)) {
+		/* Fetch DMA source start addr. */
+		const rte_iova_t s_base = (uintptr_t)(src_ptr->iov[seg_idx].iov_base);
+		const rte_iova_t dma_src_start_addr = src_ptr->offset + s_base;
+		/* Fetch DMA destination start addr. */
+		const rte_iova_t d_base = (uintptr_t)(dst_ptr->iov[seg_idx].iov_base);
+		const rte_iova_t dma_dst_start_addr = dst_ptr->offset + d_base;
+		/* Fetch packet segment length. */
+		const uint32_t dma_src_len = src_ptr->iov[seg_idx].iov_len;
+		/* Check if this segment is the last. */
+		if (seg_idx == nr_segs - 1) {
+			addr = slot_addr;
+		}
+
+		int enq_index = rte_dmadev_copy(dev_id,
+				0,
+				dma_src_start_addr,
+				dma_dst_start_addr,
+				dma_src_len,
+				dma_flags);
+		if (enq_index < 0)
+			break;
+		dmadev_enq_track[dev_id][enq_index & dmadev_ring_mask] = (void *)addr;
+		seg_idx++;
+	}
+}
+
+/* Enqueue a packet through SW copy. */
+static inline void
+sw_enqueue_packet(const struct rte_vhost_iov_iter *src_ptr,
+		const struct rte_vhost_iov_iter *dst_ptr,
+		const uint16_t nr_segs)
+{
+	uint16_t seg_idx = 0;
+
+	while (likely(seg_idx < nr_segs)) {
+		/* Fetch source start addr. */
+		const uintptr_t s_base = (uintptr_t)(src_ptr->iov[seg_idx].iov_base);
+		const uintptr_t src_start_addr = src_ptr->offset + s_base;
+		/* Fetch destination start addr. */
+		const uintptr_t d_base = (uintptr_t)(dst_ptr->iov[seg_idx].iov_base);
+		const uintptr_t dst_start_addr = dst_ptr->offset + d_base;
+		/* Fetch segment length. */
+		const size_t src_len = src_ptr->iov[seg_idx].iov_len;
+
+		rte_memcpy((void *) dst_start_addr,
+				(void *) src_start_addr,
+				src_len);
+		seg_idx++;
+	}
+}
+
+/* Fetch the slot address for a packet. */
+static inline struct enq_info_t *
+compl_slot_get_and_inc(struct dma_completions_t *dma_compl)
+{
+	struct enq_info_t *slot_addr
+		= &(dma_compl->enq_info[dma_compl->write_idx]);
+	const uint16_t ring_mask = DMA_COMPLETION_RING_SIZE - 1;
+
+	dma_compl->write_idx++;
+	dma_compl->write_idx &= ring_mask;
+	dma_compl->count++;
+	return slot_addr;
+}
+
+/* Calculate packets sent for a txq by parsing dma_completion ring. */
+static inline uint32_t
+count_completed_packets(struct dma_completions_t *dma_compl,
+		const int max_pkts)
+{
+	uint32_t pkts;
+	int count = dma_compl->count;
+	int read_idx = dma_compl->read_idx;
+	uint8_t pkt_rcvd = 0;
+	const uint16_t ring_mask = DMA_COMPLETION_RING_SIZE - 1;
+
+	for (pkts = 0; (pkts < (uint32_t)max_pkts) && (count > 0); pkts++) {
+		read_idx &= ring_mask;
+		pkt_rcvd = dma_compl->enq_info[read_idx].pkt_rcvd;
+		if (!pkt_rcvd) {
+			break;
+		}
+
+		dma_compl->enq_info[read_idx].pkt_rcvd = 0;
+		count--;
+		read_idx++;
+	}
+	dma_compl->count = count;
+	dma_compl->read_idx = read_idx;
+	return pkts;
+}
+
+/* Offload enqueue via DMA. */
+static int32_t
+dmadev_transfer_data(int dev_id,
+		struct dma_completions_t *compl,
+		struct rte_vhost_async_desc *descs,
+		uint16_t count)
+{
+	uint16_t desc_idx = 0;
+	struct enq_info_t *slot_addr = NULL;
+
+	if (is_compl_ring_full(compl)) {
+		goto out;
+	}
+
+	/* Cache space left in DMA ring to avoid driver call for every packet. */
+	uint16_t dmadev_space_left = rte_dmadev_burst_capacity(dev_id);
+	const int compl_space_left = DMA_COMPLETION_RING_SIZE - compl->count;
+	if (count > compl_space_left) {
+		count = compl_space_left;
+	}
+
+	while (desc_idx < count) {
+		const struct rte_vhost_iov_iter *src_ptr = descs[desc_idx].src;
+		const struct rte_vhost_iov_iter *dst_ptr = descs[desc_idx].dst;
+		const uint16_t nr_segs = src_ptr->nr_segs;
+		if (dmadev_space_left < nr_segs) {
+			goto ring_doorbell;
+		}
+		slot_addr = compl_slot_get_and_inc(compl);
+		dmadev_enqueue_packet(dev_id, src_ptr, dst_ptr, nr_segs, slot_addr);
+		dmadev_space_left -= nr_segs;
+		desc_idx++;
+	}
+
+ring_doorbell:
+	if (desc_idx != 0) {
+		/* Ring the doorbell. */
+		rte_dmadev_submit(dev_id, 0);
+	}
+
+	/* Do software copy for packets that do not fit in the DMA ring. */
+	while (desc_idx < count) {
+		const struct rte_vhost_iov_iter *src_ptr = descs[desc_idx].src;
+		const struct rte_vhost_iov_iter *dst_ptr = descs[desc_idx].dst;
+		slot_addr = compl_slot_get_and_inc(compl);
+		sw_enqueue_packet(src_ptr, dst_ptr, src_ptr->nr_segs);
+		slot_addr->pkt_rcvd = 1;
+		desc_idx++;
+	}
+
+out:
+	return desc_idx;
+}
+
+/* Query transfer status of DMA. */
+static int32_t
+dmadev_check_completed_copies(int dev_id,
+		struct dma_completions_t *compl,
+		uint16_t max_pkts)
+{
+	bool error;
+	uint16_t last_idx;
+	uint32_t nr_pkts = 0;
+	struct enq_info_t *slots;
+	const uint16_t mask = VHOST_ASYNC_DMADEV_RING_SIZE - 1;
+
+	if (unlikely(is_compl_ring_empty(compl))) {
+		goto out;
+	}
+
+	/* Check the completion status of DMA. */
+	const int ret_segs = rte_dmadev_completed(dev_id,
+			0,
+			MAX_PKT_BURST,
+			&last_idx,
+			&error);
+	if (unlikely(error)) {
+		return -1;
+	}
+	/* Compute the start index. */
+	uint16_t idx = (last_idx - ret_segs + 1);
+	for (int i = 0; i < ret_segs; i++) {
+		slots = (struct enq_info_t *)dmadev_enq_track[dev_id][idx & mask];
+		if (slots) {
+			/* Mark the packet slot as received.
+			 * The slot could belong to another queue but writes are atomic. */
+			slots->pkt_rcvd = 1;
+		}
+		idx++;
+	}
+	/* Calculate packets successfully offloaded from this virtqueue.
*/ + nr_pkts = count_completed_packets(compl, max_pkts); + +out: + return nr_pkts; +} + + static __rte_always_inline uint16_t virtio_dev_rx_async_get_info_idx(uint16_t pkts_idx, uint16_t vq_size, uint16_t n_inflight) @@ -1631,9 +1853,9 @@ store_dma_desc_info_packed(struct vring_used_elem_packed *s_ring, static __rte_noinline uint32_t virtio_dev_rx_async_submit_split(struct virtio_net *dev, - struct vhost_virtqueue *vq, uint16_t queue_id, + struct vhost_virtqueue *vq, uint16_t queue_id __rte_unused, struct rte_mbuf **pkts, uint32_t count, - struct rte_mbuf **comp_pkts, uint32_t *comp_count) + struct rte_mbuf **comp_pkts, uint32_t *comp_count, int dmadev_id) { uint32_t pkt_idx = 0, pkt_burst_idx = 0; uint16_t num_buffers; @@ -1732,8 +1954,8 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev, if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD || ((VHOST_MAX_ASYNC_VEC >> 1) - segs_await < BUF_VECTOR_MAX))) { - n_xfer = vq->async_ops.transfer_data(dev->vid, - queue_id, tdes, 0, pkt_burst_idx); + n_xfer = dmadev_transfer_data(dmadev_id, vq->dma_completions, tdes, + pkt_burst_idx); if (n_xfer >= 0) { n_pkts = n_xfer; } else { @@ -1765,7 +1987,8 @@ virtio_dev_rx_async_submit_split(struct virtio_net *dev, } if (pkt_burst_idx) { - n_xfer = vq->async_ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx); + n_xfer = dmadev_transfer_data(dmadev_id, vq->dma_completions, tdes, + pkt_burst_idx); if (n_xfer >= 0) { n_pkts = n_xfer; } else { @@ -2013,9 +2236,9 @@ dma_error_handler_packed(struct vhost_virtqueue *vq, struct vring_packed_desc *a static __rte_noinline uint32_t virtio_dev_rx_async_submit_packed(struct virtio_net *dev, - struct vhost_virtqueue *vq, uint16_t queue_id, + struct vhost_virtqueue *vq, uint16_t queue_id __rte_unused, struct rte_mbuf **pkts, uint32_t count, - struct rte_mbuf **comp_pkts, uint32_t *comp_count) + struct rte_mbuf **comp_pkts, uint32_t *comp_count, int dmadev_id) { uint32_t pkt_idx = 0, pkt_burst_idx = 0; uint32_t remained = count; @@ -2105,8 +2328,8 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev, */ if (unlikely(pkt_burst_idx >= VHOST_ASYNC_BATCH_THRESHOLD || ((VHOST_MAX_ASYNC_VEC >> 1) - segs_await < BUF_VECTOR_MAX))) { - n_xfer = vq->async_ops.transfer_data(dev->vid, - queue_id, tdes, 0, pkt_burst_idx); + n_xfer = dmadev_transfer_data(dmadev_id, vq->dma_completions, tdes, + pkt_burst_idx); if (n_xfer >= 0) { n_pkts = n_xfer; } else { @@ -2137,7 +2360,9 @@ virtio_dev_rx_async_submit_packed(struct virtio_net *dev, } while (pkt_idx < count); if (pkt_burst_idx) { - n_xfer = vq->async_ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx); + n_xfer = dmadev_transfer_data(dmadev_id, vq->dma_completions, tdes, + pkt_burst_idx); + if (n_xfer >= 0) { n_pkts = n_xfer; } else { @@ -2225,7 +2450,7 @@ write_back_completed_descs_packed(struct vhost_virtqueue *vq, static __rte_always_inline uint16_t vhost_poll_enqueue_completed(struct virtio_net *dev, uint16_t queue_id, - struct rte_mbuf **pkts, uint16_t count) + struct rte_mbuf **pkts, uint16_t count, int dmadev_id) { struct vhost_virtqueue *vq; uint16_t n_pkts_cpl = 0, n_pkts_put = 0, n_descs = 0, n_buffers = 0; @@ -2243,8 +2468,9 @@ vhost_poll_enqueue_completed(struct virtio_net *dev, uint16_t queue_id, vq_size, vq->async_pkts_inflight_n); if (count > vq->async_last_pkts_n) { - n_cpl = vq->async_ops.check_completed_copies(dev->vid, - queue_id, 0, count - vq->async_last_pkts_n); + n_cpl = dmadev_check_completed_copies(dmadev_id, vq->dma_completions, + count - vq->async_last_pkts_n); + if 
(n_cpl >= 0) { n_pkts_cpl = n_cpl; } else { @@ -2306,7 +2532,7 @@ vhost_poll_enqueue_completed(struct virtio_net *dev, uint16_t queue_id, uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id, - struct rte_mbuf **pkts, uint16_t count) + struct rte_mbuf **pkts, uint16_t count, int dmadev_id) { struct virtio_net *dev = get_device(vid); struct vhost_virtqueue *vq; @@ -2332,7 +2558,7 @@ rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id, rte_spinlock_lock(&vq->access_lock); - n_pkts_cpl = vhost_poll_enqueue_completed(dev, queue_id, pkts, count); + n_pkts_cpl = vhost_poll_enqueue_completed(dev, queue_id, pkts, count, dmadev_id); rte_spinlock_unlock(&vq->access_lock); @@ -2341,7 +2567,7 @@ rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id, uint16_t rte_vhost_clear_queue_thread_unsafe(int vid, uint16_t queue_id, - struct rte_mbuf **pkts, uint16_t count) + struct rte_mbuf **pkts, uint16_t count, int dmadev_id) { struct virtio_net *dev = get_device(vid); struct vhost_virtqueue *vq; @@ -2365,7 +2591,7 @@ rte_vhost_clear_queue_thread_unsafe(int vid, uint16_t queue_id, return 0; } - n_pkts_cpl = vhost_poll_enqueue_completed(dev, queue_id, pkts, count); + n_pkts_cpl = vhost_poll_enqueue_completed(dev, queue_id, pkts, count, dmadev_id); return n_pkts_cpl; } @@ -2373,7 +2599,7 @@ rte_vhost_clear_queue_thread_unsafe(int vid, uint16_t queue_id, static __rte_always_inline uint32_t virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count, - struct rte_mbuf **comp_pkts, uint32_t *comp_count) + struct rte_mbuf **comp_pkts, uint32_t *comp_count, int dmadev_id) { struct vhost_virtqueue *vq; uint32_t nb_tx = 0; @@ -2406,11 +2632,11 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id, if (vq_is_packed(dev)) nb_tx = virtio_dev_rx_async_submit_packed(dev, vq, queue_id, pkts, count, comp_pkts, - comp_count); + comp_count, dmadev_id); else nb_tx = virtio_dev_rx_async_submit_split(dev, vq, queue_id, pkts, count, comp_pkts, - comp_count); + comp_count, dmadev_id); out: if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) @@ -2425,7 +2651,7 @@ virtio_dev_rx_async_submit(struct virtio_net *dev, uint16_t queue_id, uint16_t rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id, struct rte_mbuf **pkts, uint16_t count, - struct rte_mbuf **comp_pkts, uint32_t *comp_count) + struct rte_mbuf **comp_pkts, uint32_t *comp_count, int dmadev_id) { struct virtio_net *dev = get_device(vid); @@ -2441,7 +2667,7 @@ rte_vhost_submit_enqueue_burst(int vid, uint16_t queue_id, } return virtio_dev_rx_async_submit(dev, queue_id, pkts, count, comp_pkts, - comp_count); + comp_count, dmadev_id); } static inline bool @@ -3631,9 +3857,10 @@ async_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, } static __rte_always_inline uint16_t -async_poll_dequeue_completed_split(struct virtio_net *dev, - struct vhost_virtqueue *vq, uint16_t queue_id, - struct rte_mbuf **pkts, uint16_t count, bool legacy_ol_flags) +async_poll_dequeue_completed_split(struct virtio_net *dev __rte_unused, + struct vhost_virtqueue *vq, uint16_t queue_id __rte_unused, + struct rte_mbuf **pkts, uint16_t count, bool legacy_ol_flags, + int dmadev_id) { uint16_t n_pkts_cpl = 0, n_pkts_put = 0; uint16_t start_idx, pkt_idx, from; @@ -3646,9 +3873,9 @@ async_poll_dequeue_completed_split(struct virtio_net *dev, if (count > vq->async_last_pkts_n) { int ret; + ret = dmadev_check_completed_copies(dmadev_id, vq->dma_completions, + count - vq->async_last_pkts_n); - ret = 
vq->async_ops.check_completed_copies(dev->vid, queue_id, - 0, count - vq->async_last_pkts_n); if (unlikely(ret < 0)) { VHOST_LOG_DATA(ERR, "(%d) async channel poll error\n", dev->vid); ret = 0; @@ -3688,7 +3915,7 @@ static __rte_always_inline uint16_t virtio_dev_tx_async_split(struct virtio_net *dev, struct vhost_virtqueue *vq, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, - uint16_t count, bool legacy_ol_flags) + uint16_t count, bool legacy_ol_flags, int dmadev_id) { static bool allocerr_warned; uint16_t free_entries; @@ -3802,16 +4029,14 @@ virtio_dev_tx_async_split(struct virtio_net *dev, iovec_idx < BUF_VECTOR_MAX))) { uint16_t nr_pkts; int32_t ret; - - ret = vq->async_ops.transfer_data(dev->vid, queue_id, - tdes, 0, nr_async_burst); + ret = dmadev_transfer_data(dmadev_id, vq->dma_completions, + tdes, nr_async_burst); if (unlikely(ret < 0)) { VHOST_LOG_DATA(ERR, "(%d) async channel submit" " error\n", dev->vid); ret = 0; } nr_pkts = ret; - vq->async_pkts_inflight_n += nr_pkts; it_idx = 0; iovec_idx = 0; @@ -3828,16 +4053,14 @@ virtio_dev_tx_async_split(struct virtio_net *dev, if (nr_async_burst) { uint16_t nr_pkts; int32_t ret; - - ret = vq->async_ops.transfer_data(dev->vid, queue_id, - tdes, 0, nr_async_burst); + ret = dmadev_transfer_data(dmadev_id, vq->dma_completions, tdes, + nr_async_burst); if (unlikely(ret < 0)) { VHOST_LOG_DATA(ERR, "(%d) async channel submit error\n", dev->vid); ret = 0; } nr_pkts = ret; - vq->async_pkts_inflight_n += nr_pkts; if (unlikely(nr_pkts < nr_async_burst)) @@ -3886,7 +4109,7 @@ virtio_dev_tx_async_split(struct virtio_net *dev, if (nr_done_pkts < count && vq->async_pkts_inflight_n > 0) { nr_done_pkts += async_poll_dequeue_completed_split(dev, vq, queue_id, &pkts[nr_done_pkts], - count - nr_done_pkts, legacy_ol_flags); + count - nr_done_pkts, legacy_ol_flags, dmadev_id); } if (likely(nr_done_pkts)) @@ -3900,10 +4123,10 @@ static uint16_t virtio_dev_tx_async_split_legacy(struct virtio_net *dev, struct vhost_virtqueue *vq, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, - uint16_t count) + uint16_t count, int dmadev_id) { return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool, - pkts, count, true); + pkts, count, true, dmadev_id); } __rte_noinline @@ -3911,16 +4134,16 @@ static uint16_t virtio_dev_tx_async_split_compliant(struct virtio_net *dev, struct vhost_virtqueue *vq, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, - uint16_t count) + uint16_t count, int dmadev_id) { return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool, - pkts, count, false); + pkts, count, false, dmadev_id); } uint16_t rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count, - int *nr_inflight) + int *nr_inflight, int dmadev_id) { struct virtio_net *dev; struct rte_mbuf *rarp_mbuf = NULL; @@ -4007,10 +4230,10 @@ rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id, if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS) count = virtio_dev_tx_async_split_legacy(dev, vq, queue_id, - mbuf_pool, pkts, count); + mbuf_pool, pkts, count, dmadev_id); else count = virtio_dev_tx_async_split_compliant(dev, vq, queue_id, - mbuf_pool, pkts, count); + mbuf_pool, pkts, count, dmadev_id); out: *nr_inflight = vq->async_pkts_inflight_n;
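For the dequeue side, a similarly hypothetical sketch. As with the enqueue
example in the commit message above, dmadev_id names a DMA device the
application has already configured and started, and app_dequeue_sketch,
APP_MAX_BURST and mbuf_pool are placeholders rather than anything defined by
this patch.

#include <rte_mbuf.h>
#include <rte_vhost.h>
#include <rte_vhost_async.h>

#define APP_MAX_BURST 32	/* arbitrary example burst size */

/* Dequeue-path sketch: large guest-to-host copies are offloaded to the
 * dmadev selected by the caller, per the prototype shown earlier. */
static void
app_dequeue_sketch(int vid, uint16_t queue_id, int dmadev_id,
		struct rte_mempool *mbuf_pool)
{
	struct rte_mbuf *pkts[APP_MAX_BURST];
	int nr_inflight = 0;
	uint16_t i, n;

	/* nr_inflight reports copies still pending on the DMA engine. */
	n = rte_vhost_async_try_dequeue_burst(vid, queue_id, mbuf_pool,
			pkts, APP_MAX_BURST, &nr_inflight, dmadev_id);

	/* A real application would forward these; free them here. */
	for (i = 0; i < n; i++)
		rte_pktmbuf_free(pkts[i]);
}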