From patchwork Wed Jul 21 14:20:48 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Ma, WenwuX" X-Patchwork-Id: 96119 X-Patchwork-Delegate: maxime.coquelin@redhat.com Return-Path: X-Original-To: patchwork@inbox.dpdk.org Delivered-To: patchwork@inbox.dpdk.org Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id 5AAF0A0C50; Wed, 21 Jul 2021 04:28:30 +0200 (CEST) Received: from [217.70.189.124] (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id 74239410DB; Wed, 21 Jul 2021 04:28:27 +0200 (CEST) Received: from mga01.intel.com (mga01.intel.com [192.55.52.88]) by mails.dpdk.org (Postfix) with ESMTP id EB3274014E for ; Wed, 21 Jul 2021 04:28:23 +0200 (CEST) X-IronPort-AV: E=McAfee;i="6200,9189,10051"; a="233152389" X-IronPort-AV: E=Sophos;i="5.84,256,1620716400"; d="scan'208";a="233152389" Received: from fmsmga002.fm.intel.com ([10.253.24.26]) by fmsmga101.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 20 Jul 2021 19:28:23 -0700 X-IronPort-AV: E=Sophos;i="5.84,256,1620716400"; d="scan'208";a="511210458" Received: from unknown (HELO localhost.localdomain) ([10.240.183.109]) by fmsmga002-auth.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 20 Jul 2021 19:28:21 -0700 From: Wenwu Ma To: dev@dpdk.org Cc: maxime.coquelin@redhat.com, chenbo.xia@intel.com, cheng1.jiang@intel.com, jiayu.hu@intel.com, Wenwu Ma Date: Wed, 21 Jul 2021 14:20:48 +0000 Message-Id: <20210721142051.29327-2-wenwux.ma@intel.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20210721142051.29327-1-wenwux.ma@intel.com> References: <20210602083110.5530-1-yuanx.wang@intel.com> <20210721142051.29327-1-wenwux.ma@intel.com> MIME-Version: 1.0 Subject: [dpdk-dev] [PATCH v7 1/4] examples/vhost: refactor vhost enqueue and dequeue datapaths X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" Previously, by judging the flag, we call different enqueue/dequeue functions in data path. Now, we use an ops that was initialized when Vhost was created, so that we can call ops directly in Vhost data path without any more flag judgment. Signed-off-by: Wenwu Ma Reviewed-by: Maxime Coquelin --- examples/vhost/main.c | 112 ++++++++++++++++++++---------------- examples/vhost/main.h | 33 +++++++++-- examples/vhost/virtio_net.c | 16 +++++- 3 files changed, 105 insertions(+), 56 deletions(-) diff --git a/examples/vhost/main.c b/examples/vhost/main.c index d2179eadb9..aebdc3a566 100644 --- a/examples/vhost/main.c +++ b/examples/vhost/main.c @@ -106,6 +106,8 @@ static uint32_t burst_rx_retry_num = BURST_RX_RETRIES; static char *socket_files; static int nb_sockets; +static struct vhost_queue_ops vdev_queue_ops[MAX_VHOST_DEVICE]; + /* empty vmdq configuration structure. 
Filled in programatically */ static struct rte_eth_conf vmdq_conf_default = { .rxmode = { @@ -885,27 +887,8 @@ drain_vhost(struct vhost_dev *vdev) uint16_t nr_xmit = vhost_txbuff[buff_idx]->len; struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table; - if (builtin_net_driver) { - ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit); - } else if (async_vhost_driver) { - uint32_t cpu_cpl_nr = 0; - uint16_t enqueue_fail = 0; - struct rte_mbuf *m_cpu_cpl[nr_xmit]; - - complete_async_pkts(vdev); - ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, - m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr); - - if (cpu_cpl_nr) - free_pkts(m_cpu_cpl, cpu_cpl_nr); - - enqueue_fail = nr_xmit - ret; - if (enqueue_fail) - free_pkts(&m[ret], nr_xmit - ret); - } else { - ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ, - m, nr_xmit); - } + ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, + VIRTIO_RXQ, m, nr_xmit); if (enable_stats) { __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit, @@ -1184,6 +1167,36 @@ drain_mbuf_table(struct mbuf_table *tx_q) } } +uint16_t +async_enqueue_pkts(struct vhost_dev *vdev, uint16_t queue_id, + struct rte_mbuf **pkts, uint32_t rx_count) +{ + uint16_t enqueue_count; + uint32_t cpu_cpl_nr = 0; + uint16_t enqueue_fail = 0; + struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST]; + + complete_async_pkts(vdev); + enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid, + queue_id, pkts, rx_count, + m_cpu_cpl, &cpu_cpl_nr); + if (cpu_cpl_nr) + free_pkts(m_cpu_cpl, cpu_cpl_nr); + + enqueue_fail = rx_count - enqueue_count; + if (enqueue_fail) + free_pkts(&pkts[enqueue_count], enqueue_fail); + + return enqueue_count; +} + +uint16_t +sync_enqueue_pkts(struct vhost_dev *vdev, uint16_t queue_id, + struct rte_mbuf **pkts, uint32_t rx_count) +{ + return rte_vhost_enqueue_burst(vdev->vid, queue_id, pkts, rx_count); +} + static __rte_always_inline void drain_eth_rx(struct vhost_dev *vdev) { @@ -1214,29 +1227,8 @@ drain_eth_rx(struct vhost_dev *vdev) } } - if (builtin_net_driver) { - enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ, - pkts, rx_count); - } else if (async_vhost_driver) { - uint32_t cpu_cpl_nr = 0; - uint16_t enqueue_fail = 0; - struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST]; - - complete_async_pkts(vdev); - enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid, - VIRTIO_RXQ, pkts, rx_count, - m_cpu_cpl, &cpu_cpl_nr); - if (cpu_cpl_nr) - free_pkts(m_cpu_cpl, cpu_cpl_nr); - - enqueue_fail = rx_count - enqueue_count; - if (enqueue_fail) - free_pkts(&pkts[enqueue_count], enqueue_fail); - - } else { - enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ, - pkts, rx_count); - } + enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, + VIRTIO_RXQ, pkts, rx_count); if (enable_stats) { __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count, @@ -1249,6 +1241,14 @@ drain_eth_rx(struct vhost_dev *vdev) free_pkts(pkts, rx_count); } +uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, uint16_t count) +{ + return rte_vhost_dequeue_burst(dev->vid, queue_id, + mbuf_pool, pkts, count); +} + static __rte_always_inline void drain_virtio_tx(struct vhost_dev *vdev) { @@ -1256,13 +1256,8 @@ drain_virtio_tx(struct vhost_dev *vdev) uint16_t count; uint16_t i; - if (builtin_net_driver) { - count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool, - pkts, MAX_PKT_BURST); - } else { - count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ, - mbuf_pool, pkts, MAX_PKT_BURST); - } + count = 
vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev, + VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST); /* setup VMDq for the first packet */ if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) { @@ -1436,6 +1431,21 @@ new_device(int vid) } } + if (builtin_net_driver) { + vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts; + vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts; + } else { + if (async_vhost_driver) { + vdev_queue_ops[vid].enqueue_pkt_burst = + async_enqueue_pkts; + } else { + vdev_queue_ops[vid].enqueue_pkt_burst = + sync_enqueue_pkts; + } + + vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts; + } + if (builtin_net_driver) vs_vhost_net_setup(vdev); diff --git a/examples/vhost/main.h b/examples/vhost/main.h index 0ccdce4b4a..7cd8a11a45 100644 --- a/examples/vhost/main.h +++ b/examples/vhost/main.h @@ -60,6 +60,19 @@ struct vhost_dev { struct vhost_queue queues[MAX_QUEUE_PAIRS * 2]; } __rte_cache_aligned; +typedef uint16_t (*vhost_enqueue_burst_t)(struct vhost_dev *dev, + uint16_t queue_id, struct rte_mbuf **pkts, + uint32_t count); + +typedef uint16_t (*vhost_dequeue_burst_t)(struct vhost_dev *dev, + uint16_t queue_id, struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, uint16_t count); + +struct vhost_queue_ops { + vhost_enqueue_burst_t enqueue_pkt_burst; + vhost_dequeue_burst_t dequeue_pkt_burst; +}; + TAILQ_HEAD(vhost_dev_tailq_list, vhost_dev); @@ -84,9 +97,21 @@ struct lcore_info { void vs_vhost_net_setup(struct vhost_dev *dev); void vs_vhost_net_remove(struct vhost_dev *dev); uint16_t vs_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id, + struct rte_mbuf **pkts, uint32_t count); + +uint16_t builtin_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id, + struct rte_mbuf **pkts, uint32_t count); +uint16_t builtin_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, uint16_t count); +uint16_t sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id, struct rte_mbuf **pkts, uint32_t count); - -uint16_t vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id, - struct rte_mempool *mbuf_pool, - struct rte_mbuf **pkts, uint16_t count); +uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, uint16_t count); +uint16_t async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id, + struct rte_mbuf **pkts, uint32_t count); +uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, uint16_t count); #endif /* _MAIN_H_ */ diff --git a/examples/vhost/virtio_net.c b/examples/vhost/virtio_net.c index 9064fc3a82..2432a96566 100644 --- a/examples/vhost/virtio_net.c +++ b/examples/vhost/virtio_net.c @@ -238,6 +238,13 @@ vs_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id, return count; } +uint16_t +builtin_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id, + struct rte_mbuf **pkts, uint32_t count) +{ + return vs_enqueue_pkts(dev, queue_id, pkts, count); +} + static __rte_always_inline int dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr, struct rte_mbuf *m, uint16_t desc_idx, @@ -363,7 +370,7 @@ dequeue_pkt(struct vhost_dev *dev, struct rte_vhost_vring *vr, return 0; } -uint16_t +static uint16_t vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) { @@ -440,3 +447,10 @@ vs_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id, return i; } + +uint16_t 
+builtin_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) +{ + return vs_dequeue_pkts(dev, queue_id, mbuf_pool, pkts, count); +} From patchwork Wed Jul 21 14:20:49 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Ma, WenwuX" X-Patchwork-Id: 96120 X-Patchwork-Delegate: maxime.coquelin@redhat.com Return-Path: X-Original-To: patchwork@inbox.dpdk.org Delivered-To: patchwork@inbox.dpdk.org Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id 80683A0C50; Wed, 21 Jul 2021 04:28:36 +0200 (CEST) Received: from [217.70.189.124] (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id D4A36410F8; Wed, 21 Jul 2021 04:28:28 +0200 (CEST) Received: from mga01.intel.com (mga01.intel.com [192.55.52.88]) by mails.dpdk.org (Postfix) with ESMTP id 23F31410DA for ; Wed, 21 Jul 2021 04:28:25 +0200 (CEST) X-IronPort-AV: E=McAfee;i="6200,9189,10051"; a="233152393" X-IronPort-AV: E=Sophos;i="5.84,256,1620716400"; d="scan'208";a="233152393" Received: from fmsmga002.fm.intel.com ([10.253.24.26]) by fmsmga101.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 20 Jul 2021 19:28:25 -0700 X-IronPort-AV: E=Sophos;i="5.84,256,1620716400"; d="scan'208";a="511210470" Received: from unknown (HELO localhost.localdomain) ([10.240.183.109]) by fmsmga002-auth.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 20 Jul 2021 19:28:23 -0700 From: Wenwu Ma To: dev@dpdk.org Cc: maxime.coquelin@redhat.com, chenbo.xia@intel.com, cheng1.jiang@intel.com, jiayu.hu@intel.com, Wenwu Ma Date: Wed, 21 Jul 2021 14:20:49 +0000 Message-Id: <20210721142051.29327-3-wenwux.ma@intel.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20210721142051.29327-1-wenwux.ma@intel.com> References: <20210602083110.5530-1-yuanx.wang@intel.com> <20210721142051.29327-1-wenwux.ma@intel.com> MIME-Version: 1.0 Subject: [dpdk-dev] [PATCH v7 2/4] examples/vhost: use a new API to query remaining ring space X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" A new API for querying the remaining descriptor ring capacity is available, so we use the new one instead of the old one. 
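In practice this means the sample stops maintaining its own free-descriptor counter and instead asks the rawdev driver for the remaining ring space before submitting each copy batch. A minimal sketch of the check the patch switches to is below; the helper name and variables are illustrative only and not part of the patch, which applies the same test inline in ioat_transfer_data_cb() and drops the old "ioat_space" bookkeeping from open_ioat() and the completion callback.

#include <stdbool.h>
#include <stdint.h>
#include <rte_ioat_rawdev.h>

/* Submit a copy batch only if the IOAT ring still has room for all of
 * its segments. rte_ioat_burst_capacity() reports how many descriptors
 * the device ring can still accept, so no locally tracked counter is
 * needed and it cannot drift from the hardware state. */
static inline bool
ioat_has_room(uint16_t dev_id, uint32_t nr_segs)
{
	return rte_ioat_burst_capacity(dev_id) >= nr_segs;
}
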
Signed-off-by: Wenwu Ma Reviewed-by: Maxime Coquelin --- examples/vhost/ioat.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c index 2a2c2d7202..bf4e033bdb 100644 --- a/examples/vhost/ioat.c +++ b/examples/vhost/ioat.c @@ -17,7 +17,6 @@ struct packet_tracker { unsigned short next_read; unsigned short next_write; unsigned short last_remain; - unsigned short ioat_space; }; struct packet_tracker cb_tracker[MAX_VHOST_DEVICE]; @@ -113,7 +112,6 @@ open_ioat(const char *value) goto out; } rte_rawdev_start(dev_id); - cb_tracker[dev_id].ioat_space = IOAT_RING_SIZE - 1; dma_info->nr++; i++; } @@ -140,7 +138,7 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id, src = descs[i_desc].src; dst = descs[i_desc].dst; i_seg = 0; - if (cb_tracker[dev_id].ioat_space < src->nr_segs) + if (rte_ioat_burst_capacity(dev_id) < src->nr_segs) break; while (i_seg < src->nr_segs) { rte_ioat_enqueue_copy(dev_id, @@ -155,7 +153,6 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id, } write &= mask; cb_tracker[dev_id].size_track[write] = src->nr_segs; - cb_tracker[dev_id].ioat_space -= src->nr_segs; write++; } } else { @@ -194,7 +191,6 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id, if (n_seg == 0) return 0; - cb_tracker[dev_id].ioat_space += n_seg; n_seg += cb_tracker[dev_id].last_remain; read = cb_tracker[dev_id].next_read; From patchwork Wed Jul 21 14:20:50 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Ma, WenwuX" X-Patchwork-Id: 96121 X-Patchwork-Delegate: maxime.coquelin@redhat.com Return-Path: X-Original-To: patchwork@inbox.dpdk.org Delivered-To: patchwork@inbox.dpdk.org Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id 38D4BA0C50; Wed, 21 Jul 2021 04:28:42 +0200 (CEST) Received: from [217.70.189.124] (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id 0EFEA41101; Wed, 21 Jul 2021 04:28:30 +0200 (CEST) Received: from mga01.intel.com (mga01.intel.com [192.55.52.88]) by mails.dpdk.org (Postfix) with ESMTP id 59C2F410E7 for ; Wed, 21 Jul 2021 04:28:28 +0200 (CEST) X-IronPort-AV: E=McAfee;i="6200,9189,10051"; a="233152398" X-IronPort-AV: E=Sophos;i="5.84,256,1620716400"; d="scan'208";a="233152398" Received: from fmsmga002.fm.intel.com ([10.253.24.26]) by fmsmga101.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 20 Jul 2021 19:28:28 -0700 X-IronPort-AV: E=Sophos;i="5.84,256,1620716400"; d="scan'208";a="511210478" Received: from unknown (HELO localhost.localdomain) ([10.240.183.109]) by fmsmga002-auth.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 20 Jul 2021 19:28:25 -0700 From: Wenwu Ma To: dev@dpdk.org Cc: maxime.coquelin@redhat.com, chenbo.xia@intel.com, cheng1.jiang@intel.com, jiayu.hu@intel.com, Yuan Wang , Wenwu Ma Date: Wed, 21 Jul 2021 14:20:50 +0000 Message-Id: <20210721142051.29327-4-wenwux.ma@intel.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20210721142051.29327-1-wenwux.ma@intel.com> References: <20210602083110.5530-1-yuanx.wang@intel.com> <20210721142051.29327-1-wenwux.ma@intel.com> MIME-Version: 1.0 Subject: [dpdk-dev] [PATCH v7 3/4] vhost: support async dequeue for split ring X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" From: Yuan Wang This patch implements asynchronous dequeue 
data path for split ring. A new asynchronous dequeue function is introduced. With this function, the application can try to receive packets from the guest with offloading large copies to the async channel, thus saving precious CPU cycles. Signed-off-by: Yuan Wang Signed-off-by: Jiayu Hu Signed-off-by: Wenwu Ma --- doc/guides/prog_guide/vhost_lib.rst | 9 + lib/vhost/rte_vhost_async.h | 39 +- lib/vhost/version.map | 3 + lib/vhost/virtio_net.c | 586 ++++++++++++++++++++++++++++ 4 files changed, 635 insertions(+), 2 deletions(-) diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst index d18fb98910..bf90a2663b 100644 --- a/doc/guides/prog_guide/vhost_lib.rst +++ b/doc/guides/prog_guide/vhost_lib.rst @@ -281,6 +281,15 @@ The following is an overview of some key Vhost API functions: Poll enqueue completion status from async data path. Completed packets are returned to applications through ``pkts``. +* ``rte_vhost_async_try_dequeue_burst(vid, queue_id, mbuf_pool, pkts, count, nr_inflight)`` + + This function tries to receive packets from the guest with offloading + large copies to the async channel. The packets that are transfer completed + are returned in ``pkts``. The other packets that their copies are submitted + to the async channel but not completed are called "in-flight packets". + This function will not return in-flight packets until their copies are + completed by the async channel. + Vhost-user Implementations -------------------------- diff --git a/lib/vhost/rte_vhost_async.h b/lib/vhost/rte_vhost_async.h index 6faa31f5ad..04d7588217 100644 --- a/lib/vhost/rte_vhost_async.h +++ b/lib/vhost/rte_vhost_async.h @@ -83,12 +83,20 @@ struct rte_vhost_async_channel_ops { uint16_t max_packets); }; +struct async_nethdr { + struct virtio_net_hdr hdr; + bool valid; +}; + /** - * inflight async packet information + * in-flight async packet information */ struct async_inflight_info { struct rte_mbuf *mbuf; - uint16_t descs; /* num of descs inflight */ + union { + uint16_t descs; /* num of descs in-flight */ + struct async_nethdr nethdr; + }; uint16_t nr_buffers; /* num of buffers inflight for packed ring */ }; @@ -193,4 +201,31 @@ __rte_experimental uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id, struct rte_mbuf **pkts, uint16_t count); +/** + * This function tries to receive packets from the guest with offloading + * large copies to the async channel. The packets that are transfer completed + * are returned in "pkts". The other packets that their copies are submitted to + * the async channel but not completed are called "in-flight packets". + * This function will not return in-flight packets until their copies are + * completed by the async channel. + * + * @param vid + * id of vhost device to dequeue data + * @param queue_id + * queue id to dequeue data + * @param pkts + * blank array to keep successfully dequeued packets + * @param count + * size of the packet array + * @param nr_inflight + * the amount of in-flight packets. If error occurred, its value is set to -1. 
+ * @return + * num of successfully dequeued packets + */ +__rte_experimental +uint16_t +rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count, + int *nr_inflight); + #endif /* _RTE_VHOST_ASYNC_H_ */ diff --git a/lib/vhost/version.map b/lib/vhost/version.map index 9103a23cd4..a320f889cd 100644 --- a/lib/vhost/version.map +++ b/lib/vhost/version.map @@ -79,4 +79,7 @@ EXPERIMENTAL { # added in 21.05 rte_vhost_get_negotiated_protocol_features; + + # added in 21.08 + rte_vhost_async_try_dequeue_burst; }; diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c index b93482587c..58317d7b75 100644 --- a/lib/vhost/virtio_net.c +++ b/lib/vhost/virtio_net.c @@ -3147,3 +3147,589 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id, return count; } + +static __rte_always_inline int +async_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq, + struct buf_vector *buf_vec, uint16_t nr_vec, + struct rte_mbuf *m, struct rte_mempool *mbuf_pool, + struct iovec *src_iovec, struct iovec *dst_iovec, + struct rte_vhost_iov_iter *src_it, + struct rte_vhost_iov_iter *dst_it, + struct async_nethdr *nethdr, + bool legacy_ol_flags) +{ + uint64_t buf_addr, buf_iova; + uint64_t mapped_len; + uint32_t tlen = 0; + uint32_t buf_avail, buf_offset, buf_len; + uint32_t mbuf_avail, mbuf_offset; + uint32_t cpy_len, cpy_threshold; + /* A counter to avoid desc dead loop chain */ + uint16_t vec_idx = 0; + int tvec_idx = 0; + struct rte_mbuf *cur = m, *prev = m; + struct virtio_net_hdr tmp_hdr; + struct virtio_net_hdr *hdr = NULL; + struct batch_copy_elem *batch_copy = vq->batch_copy_elems; + + buf_addr = buf_vec[vec_idx].buf_addr; + buf_len = buf_vec[vec_idx].buf_len; + buf_iova = buf_vec[vec_idx].buf_iova; + + if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) + return -1; + + cpy_threshold = vq->async_threshold; + + if (virtio_net_with_host_offload(dev)) { + if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) { + /* + * No luck, the virtio-net header doesn't fit + * in a contiguous virtual area. + */ + copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec); + hdr = &tmp_hdr; + } else { + hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr); + } + } + + /* + * A virtio driver normally uses at least 2 desc buffers + * for Tx: the first for storing the header, and others + * for storing the data. 
+ */ + if (unlikely(buf_len < dev->vhost_hlen)) { + buf_offset = dev->vhost_hlen - buf_len; + vec_idx++; + buf_addr = buf_vec[vec_idx].buf_addr; + buf_len = buf_vec[vec_idx].buf_len; + buf_avail = buf_len - buf_offset; + } else if (buf_len == dev->vhost_hlen) { + if (unlikely(++vec_idx >= nr_vec)) + return -1; + buf_addr = buf_vec[vec_idx].buf_addr; + buf_len = buf_vec[vec_idx].buf_len; + + buf_offset = 0; + buf_avail = buf_len; + } else { + buf_offset = dev->vhost_hlen; + buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen; + } + + PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset), + (uint32_t)buf_avail, 0); + + mbuf_offset = 0; + mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM; + while (1) { + cpy_len = RTE_MIN(buf_avail, mbuf_avail); + + while (cpy_len && cpy_len >= cpy_threshold) { + void *hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev, + buf_iova + buf_offset, cpy_len, + &mapped_len); + + if (unlikely(!hpa || mapped_len < cpy_threshold)) + break; + + async_fill_vec(src_iovec + tvec_idx, hpa, + (size_t)mapped_len); + async_fill_vec(dst_iovec + tvec_idx, + (void *)(uintptr_t)rte_pktmbuf_iova_offset(cur, + mbuf_offset), + (size_t)mapped_len); + + tvec_idx++; + tlen += (uint32_t)mapped_len; + cpy_len -= (uint32_t)mapped_len; + mbuf_avail -= (uint32_t)mapped_len; + mbuf_offset += (uint32_t)mapped_len; + buf_avail -= (uint32_t)mapped_len; + buf_offset += (uint32_t)mapped_len; + } + + if (cpy_len) { + if (vq->batch_copy_nb_elems >= vq->size || + (hdr && cur == m)) { + rte_memcpy( + rte_pktmbuf_mtod_offset(cur, void *, + mbuf_offset), + (void *)((uintptr_t)(buf_addr + + buf_offset)), + cpy_len); + } else { + batch_copy[vq->batch_copy_nb_elems].dst = + rte_pktmbuf_mtod_offset(cur, void *, + mbuf_offset); + batch_copy[vq->batch_copy_nb_elems].src = + (void *)((uintptr_t)(buf_addr + + buf_offset)); + batch_copy[vq->batch_copy_nb_elems].len = + cpy_len; + vq->batch_copy_nb_elems++; + } + + mbuf_avail -= cpy_len; + mbuf_offset += cpy_len; + buf_avail -= cpy_len; + buf_offset += cpy_len; + } + + /* This buf reaches to its end, get the next one */ + if (buf_avail == 0) { + if (++vec_idx >= nr_vec) + break; + + buf_addr = buf_vec[vec_idx].buf_addr; + buf_len = buf_vec[vec_idx].buf_len; + + buf_offset = 0; + buf_avail = buf_len; + + PRINT_PACKET(dev, (uintptr_t)buf_addr, + (uint32_t)buf_avail, 0); + } + + /* + * This mbuf reaches to its end, get a new one + * to hold more data. 
+ */ + if (mbuf_avail == 0) { + cur = rte_pktmbuf_alloc(mbuf_pool); + if (unlikely(cur == NULL)) { + VHOST_LOG_DATA(ERR, "Failed to " + "allocate memory for mbuf.\n"); + return -1; + } + + prev->next = cur; + prev->data_len = mbuf_offset; + m->nb_segs += 1; + m->pkt_len += mbuf_offset; + prev = cur; + + mbuf_offset = 0; + mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM; + } + } + + prev->data_len = mbuf_offset; + m->pkt_len += mbuf_offset; + + if (hdr && tlen) { + nethdr->valid = true; + nethdr->hdr = *hdr; + } else if (hdr) + vhost_dequeue_offload(hdr, m, legacy_ol_flags); + + if (tlen) { + async_fill_iter(src_it, tlen, src_iovec, tvec_idx); + async_fill_iter(dst_it, tlen, dst_iovec, tvec_idx); + } else + src_it->count = 0; + + return 0; +} + +static __rte_always_inline uint16_t +async_poll_dequeue_completed_split(struct virtio_net *dev, + struct vhost_virtqueue *vq, uint16_t queue_id, + struct rte_mbuf **pkts, uint16_t count, bool legacy_ol_flags) +{ + uint16_t n_pkts_cpl = 0, n_pkts_put = 0; + uint16_t start_idx, pkt_idx, from; + struct async_inflight_info *pkts_info; + + pkt_idx = vq->async_pkts_idx & (vq->size - 1); + pkts_info = vq->async_pkts_info; + start_idx = virtio_dev_rx_async_get_info_idx(pkt_idx, vq->size, + vq->async_pkts_inflight_n); + + if (count > vq->async_last_pkts_n) { + n_pkts_cpl = vq->async_ops.check_completed_copies(dev->vid, + queue_id, 0, count - vq->async_last_pkts_n); + } + + n_pkts_cpl += vq->async_last_pkts_n; + if (unlikely(n_pkts_cpl == 0)) + return 0; + + n_pkts_put = RTE_MIN(count, n_pkts_cpl); + + for (pkt_idx = 0; pkt_idx < n_pkts_put; pkt_idx++) { + from = (start_idx + pkt_idx) & (vq->size - 1); + pkts[pkt_idx] = pkts_info[from].mbuf; + + if (pkts_info[from].nethdr.valid) { + vhost_dequeue_offload(&pkts_info[from].nethdr.hdr, + pkts[pkt_idx], legacy_ol_flags); + } + } + vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put; + + if (n_pkts_put) { + /* write back completed descs to used ring */ + write_back_completed_descs_split(vq, n_pkts_put); + /* update used ring */ + __atomic_add_fetch(&vq->used->idx, + n_pkts_put, __ATOMIC_RELEASE); + + vq->async_pkts_inflight_n -= n_pkts_put; + } + + return n_pkts_put; +} + +static __rte_always_inline uint16_t +virtio_dev_tx_async_split(struct virtio_net *dev, + struct vhost_virtqueue *vq, uint16_t queue_id, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, + uint16_t count, bool legacy_ol_flags) +{ + static bool allocerr_warned; + bool dropped = false; + uint16_t pkt_idx; + uint16_t free_entries; + uint16_t slot_idx = 0; + uint16_t segs_await = 0; + uint16_t nr_done_pkts = 0, nr_async_pkts = 0, nr_async_cmpl_pkts = 0; + uint16_t nr_async_burst = 0; + uint16_t pkt_err = 0; + uint16_t iovec_idx = 0, it_idx = 0; + struct rte_vhost_iov_iter *it_pool = vq->it_pool; + struct iovec *vec_pool = vq->vec_pool; + struct iovec *src_iovec = vec_pool; + struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1); + struct rte_vhost_async_desc tdes[MAX_PKT_BURST]; + struct async_inflight_info *pkts_info = vq->async_pkts_info; + struct rte_mbuf *pkts_prealloc[MAX_PKT_BURST]; + + struct async_pkt_index { + uint16_t last_avail_idx; + } async_pkts_log[MAX_PKT_BURST]; + + /** + * The ordering between avail index and + * desc reads needs to be enforced. 
+ */ + free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) - + vq->last_avail_idx; + if (free_entries == 0) + goto out; + + rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]); + + count = RTE_MIN(count, MAX_PKT_BURST); + count = RTE_MIN(count, free_entries); + VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n", + dev->vid, count); + + if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts_prealloc, count)) + goto out; + + for (pkt_idx = 0; pkt_idx < count; pkt_idx++) { + uint16_t head_idx = 0; + uint16_t nr_vec = 0; + uint32_t buf_len; + int err; + struct buf_vector buf_vec[BUF_VECTOR_MAX]; + struct rte_mbuf *pkt; + + if (unlikely(fill_vec_buf_split(dev, vq, vq->last_avail_idx, + &nr_vec, buf_vec, + &head_idx, &buf_len, + VHOST_ACCESS_RO) < 0)) + break; + + err = virtio_dev_pktmbuf_prep(dev, pkts_prealloc[pkt_idx], + buf_len); + if (unlikely(err)) { + /** + * mbuf allocation fails for jumbo packets when external + * buffer allocation is not allowed and linear buffer + * is required. Drop this packet. + */ + if (!allocerr_warned) { + VHOST_LOG_DATA(ERR, + "Failed mbuf alloc of size %d from %s on %s.\n", + buf_len, mbuf_pool->name, dev->ifname); + allocerr_warned = true; + } + dropped = true; + break; + } + + pkt = pkts_prealloc[pkt_idx]; + + slot_idx = (vq->async_pkts_idx + nr_async_pkts) & + (vq->size - 1); + err = async_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkt, + mbuf_pool, &src_iovec[iovec_idx], + &dst_iovec[iovec_idx], &it_pool[it_idx], + &it_pool[it_idx + 1], + &pkts_info[slot_idx].nethdr, legacy_ol_flags); + if (unlikely(err)) { + if (!allocerr_warned) { + VHOST_LOG_DATA(ERR, + "Failed to copy desc to mbuf on %s.\n", + dev->ifname); + allocerr_warned = true; + } + dropped = true; + break; + } + + if (it_pool[it_idx].count) { + uint16_t to = vq->async_desc_idx_split & (vq->size - 1); + + async_fill_desc(&tdes[nr_async_burst], &it_pool[it_idx], + &it_pool[it_idx + 1]); + pkts_info[slot_idx].mbuf = pkt; + async_pkts_log[nr_async_pkts++].last_avail_idx = + vq->last_avail_idx; + nr_async_burst++; + iovec_idx += it_pool[it_idx].nr_segs; + it_idx += 2; + segs_await += it_pool[it_idx].nr_segs; + + /* keep used desc */ + vq->async_descs_split[to].id = head_idx; + vq->async_descs_split[to].len = 0; + vq->async_desc_idx_split++; + } else { + update_shadow_used_ring_split(vq, head_idx, 0); + pkts[nr_done_pkts++] = pkt; + } + + vq->last_avail_idx++; + + if (unlikely((nr_async_burst >= VHOST_ASYNC_BATCH_THRESHOLD) || + ((VHOST_MAX_ASYNC_VEC >> 1) - + segs_await < BUF_VECTOR_MAX))) { + uint16_t nr_pkts; + + nr_pkts = vq->async_ops.transfer_data(dev->vid, + queue_id, tdes, 0, nr_async_burst); + src_iovec = vec_pool; + dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1); + it_idx = 0; + segs_await = 0; + vq->async_pkts_inflight_n += nr_pkts; + + if (unlikely(nr_pkts < nr_async_burst)) { + pkt_err = nr_async_burst - nr_pkts; + nr_async_burst = 0; + break; + } + nr_async_burst = 0; + } + } + + if (unlikely(dropped)) + rte_pktmbuf_free_bulk(&pkts_prealloc[pkt_idx], count - pkt_idx); + + if (nr_async_burst) { + uint32_t nr_pkts; + + nr_pkts = vq->async_ops.transfer_data(dev->vid, queue_id, + tdes, 0, nr_async_burst); + vq->async_pkts_inflight_n += nr_pkts; + + if (unlikely(nr_pkts < nr_async_burst)) + pkt_err = nr_async_burst - nr_pkts; + } + + do_data_copy_dequeue(vq); + + if (unlikely(pkt_err)) { + uint16_t nr_err_dma = pkt_err; + uint16_t nr_err_sw; + + nr_async_pkts -= nr_err_dma; + + /** + * revert shadow used ring and free pktmbufs for + * CPU-copied pkts after the first 
DMA-error pkt. + */ + nr_err_sw = vq->last_avail_idx - + async_pkts_log[nr_async_pkts].last_avail_idx - + nr_err_dma; + vq->shadow_used_idx -= nr_err_sw; + while (nr_err_sw-- > 0) + rte_pktmbuf_free(pkts[--nr_done_pkts]); + + /** + * recover DMA-copy related structures and free pktmbufs + * for DMA-error pkts. + */ + vq->async_desc_idx_split -= nr_err_dma; + while (nr_err_dma-- > 0) { + rte_pktmbuf_free( + pkts_info[slot_idx & (vq->size - 1)].mbuf); + slot_idx--; + } + + /* recover available ring */ + vq->last_avail_idx = + async_pkts_log[nr_async_pkts].last_avail_idx; + } + + vq->async_pkts_idx += nr_async_pkts; + + if (likely(vq->shadow_used_idx)) + flush_shadow_used_ring_split(dev, vq); + +out: + if (nr_done_pkts < count && vq->async_pkts_inflight_n > 0) { + nr_async_cmpl_pkts = async_poll_dequeue_completed_split(dev, vq, + queue_id, &pkts[nr_done_pkts], + count - nr_done_pkts, + legacy_ol_flags); + nr_done_pkts += nr_async_cmpl_pkts; + } + if (likely(nr_done_pkts)) + vhost_vring_call_split(dev, vq); + + return nr_done_pkts; +} + +__rte_noinline +static uint16_t +virtio_dev_tx_async_split_legacy(struct virtio_net *dev, + struct vhost_virtqueue *vq, uint16_t queue_id, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, + uint16_t count) +{ + return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool, + pkts, count, true); +} + +__rte_noinline +static uint16_t +virtio_dev_tx_async_split_compliant(struct virtio_net *dev, + struct vhost_virtqueue *vq, uint16_t queue_id, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, + uint16_t count) +{ + return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool, + pkts, count, false); +} + +uint16_t +rte_vhost_async_try_dequeue_burst(int vid, uint16_t queue_id, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count, + int *nr_inflight) +{ + struct virtio_net *dev; + struct rte_mbuf *rarp_mbuf = NULL; + struct vhost_virtqueue *vq; + int16_t success = 1; + + *nr_inflight = -1; + + dev = get_device(vid); + if (!dev) + return 0; + + if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) { + VHOST_LOG_DATA(ERR, + "(%d) %s: built-in vhost net backend is disabled.\n", + dev->vid, __func__); + return 0; + } + + if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) { + VHOST_LOG_DATA(ERR, + "(%d) %s: invalid virtqueue idx %d.\n", + dev->vid, __func__, queue_id); + return 0; + } + + vq = dev->virtqueue[queue_id]; + + if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0)) + return 0; + + if (unlikely(vq->enabled == 0)) { + count = 0; + goto out_access_unlock; + } + + if (unlikely(!vq->async_registered)) { + VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n", + dev->vid, __func__, queue_id); + count = 0; + goto out_access_unlock; + } + + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + vhost_user_iotlb_rd_lock(vq); + + if (unlikely(vq->access_ok == 0)) + if (unlikely(vring_translate(dev, vq) < 0)) { + count = 0; + goto out_access_unlock; + } + + /* + * Construct a RARP broadcast packet, and inject it to the "pkts" + * array, to looks like that guest actually send such packet. + * + * Check user_send_rarp() for more information. + * + * broadcast_rarp shares a cacheline in the virtio_net structure + * with some fields that are accessed during enqueue and + * __atomic_compare_exchange_n causes a write if performed compare + * and exchange. This could result in false sharing between enqueue + * and dequeue. 
+ * + * Prevent unnecessary false sharing by reading broadcast_rarp first + * and only performing compare and exchange if the read indicates it + * is likely to be set. + */ + if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) && + __atomic_compare_exchange_n(&dev->broadcast_rarp, + &success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) { + + rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac); + if (rarp_mbuf == NULL) { + VHOST_LOG_DATA(ERR, "Failed to make RARP packet.\n"); + count = 0; + goto out; + } + count -= 1; + } + + if (unlikely(vq_is_packed(dev))) + return 0; + + if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS) + count = virtio_dev_tx_async_split_legacy(dev, vq, queue_id, + mbuf_pool, pkts, count); + else + count = virtio_dev_tx_async_split_compliant(dev, vq, queue_id, + mbuf_pool, pkts, count); + +out: + *nr_inflight = vq->async_pkts_inflight_n; + + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) + vhost_user_iotlb_rd_unlock(vq); + +out_access_unlock: + rte_spinlock_unlock(&vq->access_lock); + + if (unlikely(rarp_mbuf != NULL)) { + /* + * Inject it to the head of "pkts" array, so that switch's mac + * learning table will get updated first. + */ + memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *)); + pkts[0] = rarp_mbuf; + count += 1; + } + + return count; +} From patchwork Wed Jul 21 14:20:51 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Ma, WenwuX" X-Patchwork-Id: 96122 X-Patchwork-Delegate: maxime.coquelin@redhat.com Return-Path: X-Original-To: patchwork@inbox.dpdk.org Delivered-To: patchwork@inbox.dpdk.org Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id 550EEA0C50; Wed, 21 Jul 2021 04:28:49 +0200 (CEST) Received: from [217.70.189.124] (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id A787541134; Wed, 21 Jul 2021 04:28:32 +0200 (CEST) Received: from mga01.intel.com (mga01.intel.com [192.55.52.88]) by mails.dpdk.org (Postfix) with ESMTP id 578654111A for ; Wed, 21 Jul 2021 04:28:30 +0200 (CEST) X-IronPort-AV: E=McAfee;i="6200,9189,10051"; a="233152401" X-IronPort-AV: E=Sophos;i="5.84,256,1620716400"; d="scan'208";a="233152401" Received: from fmsmga002.fm.intel.com ([10.253.24.26]) by fmsmga101.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 20 Jul 2021 19:28:30 -0700 X-IronPort-AV: E=Sophos;i="5.84,256,1620716400"; d="scan'208";a="511210488" Received: from unknown (HELO localhost.localdomain) ([10.240.183.109]) by fmsmga002-auth.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 20 Jul 2021 19:28:28 -0700 From: Wenwu Ma To: dev@dpdk.org Cc: maxime.coquelin@redhat.com, chenbo.xia@intel.com, cheng1.jiang@intel.com, jiayu.hu@intel.com, Wenwu Ma Date: Wed, 21 Jul 2021 14:20:51 +0000 Message-Id: <20210721142051.29327-5-wenwux.ma@intel.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20210721142051.29327-1-wenwux.ma@intel.com> References: <20210602083110.5530-1-yuanx.wang@intel.com> <20210721142051.29327-1-wenwux.ma@intel.com> MIME-Version: 1.0 Subject: [dpdk-dev] [PATCH v7 4/4] examples/vhost: support vhost async dequeue data path X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" This patch is to add vhost async dequeue data-path in vhost sample. 
vswitch can leverage IOAT to accelerate vhost async dequeue data-path. Signed-off-by: Wenwu Ma Reviewed-by: Maxime Coquelin --- doc/guides/sample_app_ug/vhost.rst | 9 +- examples/vhost/ioat.c | 61 ++++++++++--- examples/vhost/ioat.h | 25 ++++++ examples/vhost/main.c | 140 ++++++++++++++++++++--------- 4 files changed, 177 insertions(+), 58 deletions(-) diff --git a/doc/guides/sample_app_ug/vhost.rst b/doc/guides/sample_app_ug/vhost.rst index 9afde9c7f5..63dcf181e1 100644 --- a/doc/guides/sample_app_ug/vhost.rst +++ b/doc/guides/sample_app_ug/vhost.rst @@ -169,9 +169,12 @@ demonstrates how to use the async vhost APIs. It's used in combination with dmas **--dmas** This parameter is used to specify the assigned DMA device of a vhost device. Async vhost-user net driver will be used if --dmas is set. For example ---dmas [txd0@00:04.0,txd1@00:04.1] means use DMA channel 00:04.0 for vhost -device 0 enqueue operation and use DMA channel 00:04.1 for vhost device 1 -enqueue operation. +--dmas [txd0@00:04.0,txd1@00:04.1,rxd0@00:04.2,rxd1@00:04.3] means use +DMA channel 00:04.0/00:04.2 for vhost device 0 enqueue/dequeue operation +and use DMA channel 00:04.1/00:04.3 for vhost device 1 enqueue/dequeue +operation. The index of the device corresponds to the socket file in order, +that means vhost device 0 is created through the first socket file, vhost +device 1 is created through the second socket file, and so on. Common Issues ------------- diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c index bf4e033bdb..8bd379d084 100644 --- a/examples/vhost/ioat.c +++ b/examples/vhost/ioat.c @@ -21,6 +21,8 @@ struct packet_tracker { struct packet_tracker cb_tracker[MAX_VHOST_DEVICE]; +int vid2socketid[MAX_VHOST_DEVICE]; + int open_ioat(const char *value) { @@ -29,7 +31,7 @@ open_ioat(const char *value) char *addrs = input; char *ptrs[2]; char *start, *end, *substr; - int64_t vid, vring_id; + int64_t socketid, vring_id; struct rte_ioat_rawdev_config config; struct rte_rawdev_info info = { .dev_private = &config }; char name[32]; @@ -60,6 +62,7 @@ open_ioat(const char *value) goto out; } while (i < args_nr) { + bool is_txd; char *arg_temp = dma_arg[i]; uint8_t sub_nr; sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@'); @@ -68,27 +71,39 @@ open_ioat(const char *value) goto out; } - start = strstr(ptrs[0], "txd"); - if (start == NULL) { + int async_flag; + char *txd, *rxd; + txd = strstr(ptrs[0], "txd"); + rxd = strstr(ptrs[0], "rxd"); + if (txd) { + is_txd = true; + start = txd; + async_flag = ASYNC_ENQUEUE_VHOST; + } else if (rxd) { + is_txd = false; + start = rxd; + async_flag = ASYNC_DEQUEUE_VHOST; + } else { ret = -1; goto out; } start += 3; - vid = strtol(start, &end, 0); + socketid = strtol(start, &end, 0); if (end == start) { ret = -1; goto out; } - vring_id = 0 + VIRTIO_RXQ; + vring_id = is_txd ? 
VIRTIO_RXQ : VIRTIO_TXQ; + if (rte_pci_addr_parse(ptrs[1], - &(dma_info + vid)->dmas[vring_id].addr) < 0) { + &(dma_info + socketid)->dmas[vring_id].addr) < 0) { ret = -1; goto out; } - rte_pci_device_name(&(dma_info + vid)->dmas[vring_id].addr, + rte_pci_device_name(&(dma_info + socketid)->dmas[vring_id].addr, name, sizeof(name)); dev_id = rte_rawdev_get_dev_id(name); if (dev_id == (uint16_t)(-ENODEV) || @@ -103,8 +118,9 @@ open_ioat(const char *value) goto out; } - (dma_info + vid)->dmas[vring_id].dev_id = dev_id; - (dma_info + vid)->dmas[vring_id].is_valid = true; + (dma_info + socketid)->dmas[vring_id].dev_id = dev_id; + (dma_info + socketid)->dmas[vring_id].is_valid = true; + (dma_info + socketid)->async_flag |= async_flag; config.ring_size = IOAT_RING_SIZE; config.hdls_disable = true; if (rte_rawdev_configure(dev_id, &info, sizeof(config)) < 0) { @@ -126,13 +142,16 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id, struct rte_vhost_async_status *opaque_data, uint16_t count) { uint32_t i_desc; - uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2 + VIRTIO_RXQ].dev_id; struct rte_vhost_iov_iter *src = NULL; struct rte_vhost_iov_iter *dst = NULL; unsigned long i_seg; unsigned short mask = MAX_ENQUEUED_SIZE - 1; - unsigned short write = cb_tracker[dev_id].next_write; + if (queue_id >= MAX_RING_COUNT) + return -1; + + uint16_t dev_id = dma_bind[vid2socketid[vid]].dmas[queue_id].dev_id; + unsigned short write = cb_tracker[dev_id].next_write; if (!opaque_data) { for (i_desc = 0; i_desc < count; i_desc++) { src = descs[i_desc].src; @@ -170,16 +189,16 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id, struct rte_vhost_async_status *opaque_data, uint16_t max_packets) { - if (!opaque_data) { + if (!opaque_data && (queue_id < MAX_RING_COUNT)) { uintptr_t dump[255]; int n_seg; unsigned short read, write; unsigned short nb_packet = 0; unsigned short mask = MAX_ENQUEUED_SIZE - 1; unsigned short i; + uint16_t dev_id; - uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2 - + VIRTIO_RXQ].dev_id; + dev_id = dma_bind[vid2socketid[vid]].dmas[queue_id].dev_id; n_seg = rte_ioat_completed_ops(dev_id, 255, NULL, NULL, dump, dump); if (n_seg < 0) { RTE_LOG(ERR, @@ -215,4 +234,18 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id, return -1; } +uint32_t get_async_flag_by_vid(int vid) +{ + return dma_bind[vid2socketid[vid]].async_flag; +} + +uint32_t get_async_flag_by_socketid(int socketid) +{ + return dma_bind[socketid].async_flag; +} + +void init_vid2socketid_array(int vid, int socketid) +{ + vid2socketid[vid] = socketid; +} #endif /* RTE_RAW_IOAT */ diff --git a/examples/vhost/ioat.h b/examples/vhost/ioat.h index 1aa28ed6a3..3a85c94c8a 100644 --- a/examples/vhost/ioat.h +++ b/examples/vhost/ioat.h @@ -12,6 +12,9 @@ #define MAX_VHOST_DEVICE 1024 #define IOAT_RING_SIZE 4096 #define MAX_ENQUEUED_SIZE 4096 +#define MAX_RING_COUNT 2 +#define ASYNC_ENQUEUE_VHOST 1 +#define ASYNC_DEQUEUE_VHOST 2 struct dma_info { struct rte_pci_addr addr; @@ -20,6 +23,7 @@ struct dma_info { }; struct dma_for_vhost { + int async_flag; struct dma_info dmas[RTE_MAX_QUEUES_PER_PORT * 2]; uint16_t nr; }; @@ -36,6 +40,10 @@ uint32_t ioat_check_completed_copies_cb(int vid, uint16_t queue_id, struct rte_vhost_async_status *opaque_data, uint16_t max_packets); + +uint32_t get_async_flag_by_vid(int vid); +uint32_t get_async_flag_by_socketid(int socketid); +void init_vid2socketid_array(int vid, int socketid); #else static int open_ioat(const char *value __rte_unused) { @@ -59,5 +67,22 @@ ioat_check_completed_copies_cb(int 
vid __rte_unused, { return -1; } + +static uint32_t +get_async_flag_by_vid(int vid __rte_unused) +{ + return 0; +} + +static uint32_t +get_async_flag_by_socketid(int socketid __rte_unused) +{ + return 0; +} + +static void +init_vid2socketid_array(int vid __rte_unused, int socketid __rte_unused) +{ +} #endif #endif /* _IOAT_H_ */ diff --git a/examples/vhost/main.c b/examples/vhost/main.c index aebdc3a566..314184b447 100644 --- a/examples/vhost/main.c +++ b/examples/vhost/main.c @@ -93,8 +93,6 @@ static int client_mode; static int builtin_net_driver; -static int async_vhost_driver; - static char *dma_type; /* Specify timeout (in useconds) between retries on RX. */ @@ -679,7 +677,6 @@ us_vhost_parse_args(int argc, char **argv) us_vhost_usage(prgname); return -1; } - async_vhost_driver = 1; break; case OPT_CLIENT_NUM: @@ -897,7 +894,7 @@ drain_vhost(struct vhost_dev *vdev) __ATOMIC_SEQ_CST); } - if (!async_vhost_driver) + if ((get_async_flag_by_vid(vdev->vid) & ASYNC_ENQUEUE_VHOST) == 0) free_pkts(m, nr_xmit); } @@ -1237,10 +1234,19 @@ drain_eth_rx(struct vhost_dev *vdev) __ATOMIC_SEQ_CST); } - if (!async_vhost_driver) + if ((get_async_flag_by_vid(vdev->vid) & ASYNC_ENQUEUE_VHOST) == 0) free_pkts(pkts, rx_count); } +uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id, + struct rte_mempool *mbuf_pool, + struct rte_mbuf **pkts, uint16_t count) +{ + int nr_inflight; + return rte_vhost_async_try_dequeue_burst(dev->vid, queue_id, + mbuf_pool, pkts, count, &nr_inflight); +} + uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id, struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) @@ -1392,12 +1398,90 @@ destroy_device(int vid) "(%d) device has been removed from data core\n", vdev->vid); - if (async_vhost_driver) + if (get_async_flag_by_vid(vid) & ASYNC_ENQUEUE_VHOST) rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ); + if (get_async_flag_by_vid(vid) & ASYNC_DEQUEUE_VHOST) + rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ); rte_free(vdev); } +static int +get_socketid_by_vid(int vid) +{ + int i; + char ifname[PATH_MAX]; + rte_vhost_get_ifname(vid, ifname, sizeof(ifname)); + + for (i = 0; i < nb_sockets; i++) { + char *file = socket_files + i * PATH_MAX; + if (strcmp(file, ifname) == 0) + return i; + } + + return -1; +} + +static int +init_vhost_queue_ops(int vid) +{ + int socketid = get_socketid_by_vid(vid); + if (socketid == -1) + return -1; + + init_vid2socketid_array(vid, socketid); + if (builtin_net_driver) { + vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts; + vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts; + } else { + if (get_async_flag_by_vid(vid) & ASYNC_ENQUEUE_VHOST) { + vdev_queue_ops[vid].enqueue_pkt_burst = + async_enqueue_pkts; + } else { + vdev_queue_ops[vid].enqueue_pkt_burst = + sync_enqueue_pkts; + } + + if (get_async_flag_by_vid(vid) & ASYNC_DEQUEUE_VHOST) { + vdev_queue_ops[vid].dequeue_pkt_burst = + async_dequeue_pkts; + } else { + vdev_queue_ops[vid].dequeue_pkt_burst = + sync_dequeue_pkts; + } + } + + return 0; +} + +static int +vhost_async_channel_register(int vid) +{ + int ret = 0; + struct rte_vhost_async_features f; + struct rte_vhost_async_channel_ops channel_ops; + + if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) { + channel_ops.transfer_data = ioat_transfer_data_cb; + channel_ops.check_completed_copies = + ioat_check_completed_copies_cb; + + f.async_inorder = 1; + f.async_threshold = 256; + + if (get_async_flag_by_vid(vid) & ASYNC_ENQUEUE_VHOST) { + ret |= 
rte_vhost_async_channel_register(vid, VIRTIO_RXQ, + f.intval, &channel_ops); + } + if (get_async_flag_by_vid(vid) & ASYNC_DEQUEUE_VHOST) { + ret |= rte_vhost_async_channel_register(vid, VIRTIO_TXQ, + f.intval, &channel_ops); + } + } + + return ret; +} + /* * A new device is added to a data core. First the device is added to the main linked list * and then allocated to a specific data core. @@ -1431,20 +1515,8 @@ new_device(int vid) } } - if (builtin_net_driver) { - vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts; - vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts; - } else { - if (async_vhost_driver) { - vdev_queue_ops[vid].enqueue_pkt_burst = - async_enqueue_pkts; - } else { - vdev_queue_ops[vid].enqueue_pkt_burst = - sync_enqueue_pkts; - } - - vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts; - } + if (init_vhost_queue_ops(vid) != 0) + return -1; if (builtin_net_driver) vs_vhost_net_setup(vdev); @@ -1473,28 +1545,13 @@ new_device(int vid) rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0); rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0); + int ret = vhost_async_channel_register(vid); + RTE_LOG(INFO, VHOST_DATA, "(%d) device has been added to data core %d\n", vid, vdev->coreid); - if (async_vhost_driver) { - struct rte_vhost_async_features f; - struct rte_vhost_async_channel_ops channel_ops; - - if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) { - channel_ops.transfer_data = ioat_transfer_data_cb; - channel_ops.check_completed_copies = - ioat_check_completed_copies_cb; - - f.async_inorder = 1; - f.async_threshold = 256; - - return rte_vhost_async_channel_register(vid, VIRTIO_RXQ, - f.intval, &channel_ops); - } - } - - return 0; + return ret; } /* @@ -1735,10 +1792,11 @@ main(int argc, char *argv[]) for (i = 0; i < nb_sockets; i++) { char *file = socket_files + i * PATH_MAX; - if (async_vhost_driver) - flags = flags | RTE_VHOST_USER_ASYNC_COPY; + uint64_t flag = flags; + if (get_async_flag_by_socketid(i) != 0) + flag |= RTE_VHOST_USER_ASYNC_COPY; - ret = rte_vhost_driver_register(file, flags); + ret = rte_vhost_driver_register(file, flag); if (ret != 0) { unregister_drivers(i); rte_exit(EXIT_FAILURE,