Hi Yuan,
This is a first review, I will certainly have more comments later.
On 6/2/21 10:31 AM, Yuan Wang wrote:
> This patch implements asynchronous dequeue data path for split ring.
> A new asynchronous dequeue function is introduced. With this function,
> the application can try to receive packets from the guest with
> offloading large copies to the DMA engine, thus saving precious CPU
> cycles.
Do you have any number to share?
> Signed-off-by: Wenwu Ma <wenwux.ma@intel.com>
> Signed-off-by: Yuan Wang <yuanx.wang@intel.com>
> Signed-off-by: Jiayu Hu <jiayu.hu@intel.com>
> ---
> doc/guides/prog_guide/vhost_lib.rst | 10 +
> examples/vhost/ioat.c | 30 +-
> examples/vhost/ioat.h | 3 +
> examples/vhost/main.c | 60 +--
> lib/vhost/rte_vhost_async.h | 44 ++-
> lib/vhost/version.map | 3 +
> lib/vhost/virtio_net.c | 549 ++++++++++++++++++++++++++++
> 7 files changed, 664 insertions(+), 35 deletions(-)
Please split the patch in multple parts.
At least don't mix example and lib changes in the same patch.
> diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
> index 6b7206bc1d..785ab0fb34 100644
> --- a/doc/guides/prog_guide/vhost_lib.rst
> +++ b/doc/guides/prog_guide/vhost_lib.rst
> @@ -281,6 +281,16 @@ The following is an overview of some key Vhost API functions:
> Poll enqueue completion status from async data path. Completed packets
> are returned to applications through ``pkts``.
>
> +* ``rte_vhost_try_dequeue_burst(vid, queue_id, mbuf_pool, pkts, count, nr_inflight)``
The function should contain async in its name.
BTW, I think we should also rename below APIs while they are
experimental to highlight it is async related:
rte_vhost_submit_enqueue_burst
rte_vhost_poll_enqueue_completed
> +
> + Try to receive packets from the guest with offloading large packets
> + to the DMA engine. Successfully dequeued packets are transfer
> + completed and returned in ``pkts``. But there may be other packets
> + that are sent from the guest but being transferred by the DMA engine,
> + called in-flight packets. This function will return in-flight packets
> + only after the DMA engine finishes transferring. The amount of
> + in-flight packets by now is returned in ``nr_inflight``.
> +
> Vhost-user Implementations
> --------------------------
>
> diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
> index 2a2c2d7202..236306c9c7 100644
> --- a/examples/vhost/ioat.c
> +++ b/examples/vhost/ioat.c
> @@ -17,7 +17,6 @@ struct packet_tracker {
> unsigned short next_read;
> unsigned short next_write;
> unsigned short last_remain;
> - unsigned short ioat_space;
> };
>
> struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
> @@ -61,18 +60,30 @@ open_ioat(const char *value)
> goto out;
> }
> while (i < args_nr) {
> + char *txd, *rxd;
> + bool is_txd;
> char *arg_temp = dma_arg[i];
> uint8_t sub_nr;
> +
> sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
> if (sub_nr != 2) {
> ret = -1;
> goto out;
> }
>
> - start = strstr(ptrs[0], "txd");
> - if (start == NULL) {
> + txd = strstr(ptrs[0], "txd");
> + rxd = strstr(ptrs[0], "rxd");
> + if (txd == NULL && rxd == NULL) {
> ret = -1;
> goto out;
> + } else if (txd) {
> + is_txd = true;
> + start = txd;
> + ret |= ASYNC_RX_VHOST;
> + } else {
> + is_txd = false;
> + start = rxd;
> + ret |= ASYNC_TX_VHOST;
> }
>
> start += 3;
> @@ -82,7 +93,8 @@ open_ioat(const char *value)
> goto out;
> }
>
> - vring_id = 0 + VIRTIO_RXQ;
> + vring_id = is_txd ? VIRTIO_RXQ : VIRTIO_TXQ;
> +
> if (rte_pci_addr_parse(ptrs[1],
> &(dma_info + vid)->dmas[vring_id].addr) < 0) {
> ret = -1;
> @@ -113,7 +125,6 @@ open_ioat(const char *value)
> goto out;
> }
> rte_rawdev_start(dev_id);
> - cb_tracker[dev_id].ioat_space = IOAT_RING_SIZE - 1;
> dma_info->nr++;
> i++;
> }
> @@ -128,7 +139,7 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
> struct rte_vhost_async_status *opaque_data, uint16_t count)
> {
> uint32_t i_desc;
> - uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2 + VIRTIO_RXQ].dev_id;
> + uint16_t dev_id = dma_bind[vid].dmas[queue_id].dev_id;
It looks broken with regards to multiqueue (it was before this patch).
In open_ioat(), only dma_bind[vid].dmas[VIRTIO_RXQ] and
dma_bind[vid].dmas[VIRTIO_TXQ] are set.
As it seems that the application does not support multiqueue, it may be
a good idea to check queue_id value before using it.
> struct rte_vhost_iov_iter *src = NULL;
> struct rte_vhost_iov_iter *dst = NULL;
> unsigned long i_seg;
> @@ -140,7 +151,7 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
> src = descs[i_desc].src;
> dst = descs[i_desc].dst;
> i_seg = 0;
> - if (cb_tracker[dev_id].ioat_space < src->nr_segs)
> + if (rte_ioat_burst_capacity(dev_id) < src->nr_segs)
This change should be in a dedicated patch, it is not related to dequeue
support.
> break;
> while (i_seg < src->nr_segs) {
> rte_ioat_enqueue_copy(dev_id,
> @@ -155,7 +166,6 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
> }
> write &= mask;
> cb_tracker[dev_id].size_track[write] = src->nr_segs;
> - cb_tracker[dev_id].ioat_space -= src->nr_segs;
> write++;
> }
> } else {
> @@ -181,8 +191,7 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
> unsigned short mask = MAX_ENQUEUED_SIZE - 1;
> unsigned short i;
>
> - uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2
> - + VIRTIO_RXQ].dev_id;
> + uint16_t dev_id = dma_bind[vid].dmas[queue_id].dev_id;
> n_seg = rte_ioat_completed_ops(dev_id, 255, NULL, NULL, dump, dump);
> if (n_seg < 0) {
> RTE_LOG(ERR,
> @@ -194,7 +203,6 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
> if (n_seg == 0)
> return 0;
>
> - cb_tracker[dev_id].ioat_space += n_seg;
> n_seg += cb_tracker[dev_id].last_remain;
>
> read = cb_tracker[dev_id].next_read;
> diff --git a/examples/vhost/ioat.h b/examples/vhost/ioat.h
> index 1aa28ed6a3..db7acefc02 100644
> --- a/examples/vhost/ioat.h
> +++ b/examples/vhost/ioat.h
> @@ -13,6 +13,9 @@
> #define IOAT_RING_SIZE 4096
> #define MAX_ENQUEUED_SIZE 4096
>
> +#define ASYNC_RX_VHOST 1
> +#define ASYNC_TX_VHOST 2
> +
> struct dma_info {
> struct rte_pci_addr addr;
> uint16_t dev_id;
> diff --git a/examples/vhost/main.c b/examples/vhost/main.c
> index d2179eadb9..a5662a1a91 100644
> --- a/examples/vhost/main.c
> +++ b/examples/vhost/main.c
> @@ -93,7 +93,8 @@ static int client_mode;
>
> static int builtin_net_driver;
>
> -static int async_vhost_driver;
> +static int async_rx_vhost_driver;
> +static int async_tx_vhost_driver;
>
> static char *dma_type;
>
> @@ -671,13 +672,17 @@ us_vhost_parse_args(int argc, char **argv)
> break;
>
> case OPT_DMAS_NUM:
> - if (open_dma(optarg) == -1) {
> + ret = open_dma(optarg);
> + if (ret == -1) {
> RTE_LOG(INFO, VHOST_CONFIG,
> "Wrong DMA args\n");
> us_vhost_usage(prgname);
> return -1;
> }
> - async_vhost_driver = 1;
> + if (ret & ASYNC_RX_VHOST)
> + async_rx_vhost_driver = 1;
> + if (ret & ASYNC_TX_VHOST)
> + async_tx_vhost_driver = 1;
> break;
>
> case OPT_CLIENT_NUM:
> @@ -887,7 +892,7 @@ drain_vhost(struct vhost_dev *vdev)
>
> if (builtin_net_driver) {
> ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
> - } else if (async_vhost_driver) {
> + } else if (async_rx_vhost_driver) {
I think we should consider having ops for async and sync instead of all
these if/else. It could be refactored as preliminary patch for this
series.
> uint32_t cpu_cpl_nr = 0;
> uint16_t enqueue_fail = 0;
> struct rte_mbuf *m_cpu_cpl[nr_xmit];
> @@ -914,7 +919,7 @@ drain_vhost(struct vhost_dev *vdev)
> __ATOMIC_SEQ_CST);
> }
>
> - if (!async_vhost_driver)
> + if (!async_rx_vhost_driver)
> free_pkts(m, nr_xmit);
> }
>
@@ -281,6 +281,16 @@ The following is an overview of some key Vhost API functions:
Poll enqueue completion status from async data path. Completed packets
are returned to applications through ``pkts``.
+* ``rte_vhost_try_dequeue_burst(vid, queue_id, mbuf_pool, pkts, count, nr_inflight)``
+
+ Try to receive packets from the guest with offloading large packets
+ to the DMA engine. Successfully dequeued packets are transfer
+ completed and returned in ``pkts``. But there may be other packets
+ that are sent from the guest but being transferred by the DMA engine,
+ called in-flight packets. This function will return in-flight packets
+ only after the DMA engine finishes transferring. The amount of
+ in-flight packets by now is returned in ``nr_inflight``.
+
Vhost-user Implementations
--------------------------
@@ -17,7 +17,6 @@ struct packet_tracker {
unsigned short next_read;
unsigned short next_write;
unsigned short last_remain;
- unsigned short ioat_space;
};
struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
@@ -61,18 +60,30 @@ open_ioat(const char *value)
goto out;
}
while (i < args_nr) {
+ char *txd, *rxd;
+ bool is_txd;
char *arg_temp = dma_arg[i];
uint8_t sub_nr;
+
sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
if (sub_nr != 2) {
ret = -1;
goto out;
}
- start = strstr(ptrs[0], "txd");
- if (start == NULL) {
+ txd = strstr(ptrs[0], "txd");
+ rxd = strstr(ptrs[0], "rxd");
+ if (txd == NULL && rxd == NULL) {
ret = -1;
goto out;
+ } else if (txd) {
+ is_txd = true;
+ start = txd;
+ ret |= ASYNC_RX_VHOST;
+ } else {
+ is_txd = false;
+ start = rxd;
+ ret |= ASYNC_TX_VHOST;
}
start += 3;
@@ -82,7 +93,8 @@ open_ioat(const char *value)
goto out;
}
- vring_id = 0 + VIRTIO_RXQ;
+ vring_id = is_txd ? VIRTIO_RXQ : VIRTIO_TXQ;
+
if (rte_pci_addr_parse(ptrs[1],
&(dma_info + vid)->dmas[vring_id].addr) < 0) {
ret = -1;
@@ -113,7 +125,6 @@ open_ioat(const char *value)
goto out;
}
rte_rawdev_start(dev_id);
- cb_tracker[dev_id].ioat_space = IOAT_RING_SIZE - 1;
dma_info->nr++;
i++;
}
@@ -128,7 +139,7 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
struct rte_vhost_async_status *opaque_data, uint16_t count)
{
uint32_t i_desc;
- uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2 + VIRTIO_RXQ].dev_id;
+ uint16_t dev_id = dma_bind[vid].dmas[queue_id].dev_id;
struct rte_vhost_iov_iter *src = NULL;
struct rte_vhost_iov_iter *dst = NULL;
unsigned long i_seg;
@@ -140,7 +151,7 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
src = descs[i_desc].src;
dst = descs[i_desc].dst;
i_seg = 0;
- if (cb_tracker[dev_id].ioat_space < src->nr_segs)
+ if (rte_ioat_burst_capacity(dev_id) < src->nr_segs)
break;
while (i_seg < src->nr_segs) {
rte_ioat_enqueue_copy(dev_id,
@@ -155,7 +166,6 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
}
write &= mask;
cb_tracker[dev_id].size_track[write] = src->nr_segs;
- cb_tracker[dev_id].ioat_space -= src->nr_segs;
write++;
}
} else {
@@ -181,8 +191,7 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
unsigned short mask = MAX_ENQUEUED_SIZE - 1;
unsigned short i;
- uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2
- + VIRTIO_RXQ].dev_id;
+ uint16_t dev_id = dma_bind[vid].dmas[queue_id].dev_id;
n_seg = rte_ioat_completed_ops(dev_id, 255, NULL, NULL, dump, dump);
if (n_seg < 0) {
RTE_LOG(ERR,
@@ -194,7 +203,6 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
if (n_seg == 0)
return 0;
- cb_tracker[dev_id].ioat_space += n_seg;
n_seg += cb_tracker[dev_id].last_remain;
read = cb_tracker[dev_id].next_read;
@@ -13,6 +13,9 @@
#define IOAT_RING_SIZE 4096
#define MAX_ENQUEUED_SIZE 4096
+#define ASYNC_RX_VHOST 1
+#define ASYNC_TX_VHOST 2
+
struct dma_info {
struct rte_pci_addr addr;
uint16_t dev_id;
@@ -93,7 +93,8 @@ static int client_mode;
static int builtin_net_driver;
-static int async_vhost_driver;
+static int async_rx_vhost_driver;
+static int async_tx_vhost_driver;
static char *dma_type;
@@ -671,13 +672,17 @@ us_vhost_parse_args(int argc, char **argv)
break;
case OPT_DMAS_NUM:
- if (open_dma(optarg) == -1) {
+ ret = open_dma(optarg);
+ if (ret == -1) {
RTE_LOG(INFO, VHOST_CONFIG,
"Wrong DMA args\n");
us_vhost_usage(prgname);
return -1;
}
- async_vhost_driver = 1;
+ if (ret & ASYNC_RX_VHOST)
+ async_rx_vhost_driver = 1;
+ if (ret & ASYNC_TX_VHOST)
+ async_tx_vhost_driver = 1;
break;
case OPT_CLIENT_NUM:
@@ -887,7 +892,7 @@ drain_vhost(struct vhost_dev *vdev)
if (builtin_net_driver) {
ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
- } else if (async_vhost_driver) {
+ } else if (async_rx_vhost_driver) {
uint32_t cpu_cpl_nr = 0;
uint16_t enqueue_fail = 0;
struct rte_mbuf *m_cpu_cpl[nr_xmit];
@@ -914,7 +919,7 @@ drain_vhost(struct vhost_dev *vdev)
__ATOMIC_SEQ_CST);
}
- if (!async_vhost_driver)
+ if (!async_rx_vhost_driver)
free_pkts(m, nr_xmit);
}
@@ -1217,7 +1222,7 @@ drain_eth_rx(struct vhost_dev *vdev)
if (builtin_net_driver) {
enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
pkts, rx_count);
- } else if (async_vhost_driver) {
+ } else if (async_rx_vhost_driver) {
uint32_t cpu_cpl_nr = 0;
uint16_t enqueue_fail = 0;
struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
@@ -1245,7 +1250,7 @@ drain_eth_rx(struct vhost_dev *vdev)
__ATOMIC_SEQ_CST);
}
- if (!async_vhost_driver)
+ if (!async_rx_vhost_driver)
free_pkts(pkts, rx_count);
}
@@ -1259,6 +1264,12 @@ drain_virtio_tx(struct vhost_dev *vdev)
if (builtin_net_driver) {
count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
pkts, MAX_PKT_BURST);
+ } else if (async_tx_vhost_driver) {
+ int nr_inflight;
+
+ count = rte_vhost_try_dequeue_burst(vdev->vid, VIRTIO_TXQ,
+ mbuf_pool, pkts, MAX_PKT_BURST, &nr_inflight);
+
} else {
count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
mbuf_pool, pkts, MAX_PKT_BURST);
@@ -1397,8 +1408,10 @@ destroy_device(int vid)
"(%d) device has been removed from data core\n",
vdev->vid);
- if (async_vhost_driver)
+ if (async_rx_vhost_driver)
rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
+ if (async_tx_vhost_driver)
+ rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
rte_free(vdev);
}
@@ -1467,24 +1480,29 @@ new_device(int vid)
"(%d) device has been added to data core %d\n",
vid, vdev->coreid);
- if (async_vhost_driver) {
- struct rte_vhost_async_features f;
- struct rte_vhost_async_channel_ops channel_ops;
+ int ret = 0;
+ struct rte_vhost_async_features f;
+ struct rte_vhost_async_channel_ops channel_ops;
- if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
- channel_ops.transfer_data = ioat_transfer_data_cb;
- channel_ops.check_completed_copies =
- ioat_check_completed_copies_cb;
+ if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
+ channel_ops.transfer_data = ioat_transfer_data_cb;
+ channel_ops.check_completed_copies =
+ ioat_check_completed_copies_cb;
- f.async_inorder = 1;
- f.async_threshold = 256;
+ f.async_inorder = 1;
+ f.async_threshold = 0;
- return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
- f.intval, &channel_ops);
+ if (async_rx_vhost_driver) {
+ ret = rte_vhost_async_channel_register(
+ vid, VIRTIO_RXQ, f.intval, &channel_ops);
+ }
+ if (async_tx_vhost_driver && (ret == 0)) {
+ ret = rte_vhost_async_channel_register(
+ vid, VIRTIO_TXQ, f.intval, &channel_ops);
}
}
- return 0;
+ return ret;
}
/*
@@ -1725,7 +1743,7 @@ main(int argc, char *argv[])
for (i = 0; i < nb_sockets; i++) {
char *file = socket_files + i * PATH_MAX;
- if (async_vhost_driver)
+ if (async_rx_vhost_driver || async_tx_vhost_driver)
flags = flags | RTE_VHOST_USER_ASYNC_COPY;
ret = rte_vhost_driver_register(file, flags);
@@ -84,13 +84,21 @@ struct rte_vhost_async_channel_ops {
};
/**
- * inflight async packet information
+ * in-flight async packet information
*/
+struct async_nethdr {
+ struct virtio_net_hdr hdr;
+ bool valid;
+};
+
struct async_inflight_info {
struct rte_mbuf *mbuf;
- uint16_t descs; /* num of descs inflight */
+ union {
+ uint16_t descs; /* num of descs in-flight */
+ struct async_nethdr nethdr;
+ };
uint16_t nr_buffers; /* num of buffers inflight for packed ring */
-};
+} __rte_cache_aligned;
/**
* dma channel feature bit definition
@@ -193,4 +201,34 @@ __rte_experimental
uint16_t rte_vhost_poll_enqueue_completed(int vid, uint16_t queue_id,
struct rte_mbuf **pkts, uint16_t count);
+/**
+ * This function tries to receive packets from the guest with offloading
+ * large copies to the DMA engine. Successfully dequeued packets are
+ * transfer completed, either by the CPU or the DMA engine, and they are
+ * returned in "pkts". There may be other packets that are sent from
+ * the guest but being transferred by the DMA engine, called in-flight
+ * packets. The amount of in-flight packets by now is returned in
+ * "nr_inflight". This function will return in-flight packets only after
+ * the DMA engine finishes transferring.
+ *
+ * @param vid
+ * id of vhost device to dequeue data
+ * @param queue_id
+ * queue id to dequeue data
+ * @param pkts
+ * blank array to keep successfully dequeued packets
+ * @param count
+ * size of the packet array
+ * @param nr_inflight
+ * the amount of in-flight packets by now. If error occured, its
+ * value is set to -1.
+ * @return
+ * num of successfully dequeued packets
+ */
+__rte_experimental
+uint16_t
+rte_vhost_try_dequeue_burst(int vid, uint16_t queue_id,
+ struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
+ int *nr_inflight);
+
#endif /* _RTE_VHOST_ASYNC_H_ */
@@ -79,4 +79,7 @@ EXPERIMENTAL {
# added in 21.05
rte_vhost_get_negotiated_protocol_features;
+
+ # added in 21.08
+ rte_vhost_try_dequeue_burst;
};
@@ -3155,3 +3155,552 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
return count;
}
+
+static __rte_always_inline int
+async_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
+ struct buf_vector *buf_vec, uint16_t nr_vec,
+ struct rte_mbuf *m, struct rte_mempool *mbuf_pool,
+ struct iovec *src_iovec, struct iovec *dst_iovec,
+ struct rte_vhost_iov_iter *src_it,
+ struct rte_vhost_iov_iter *dst_it,
+ struct async_nethdr *nethdr,
+ bool legacy_ol_flags)
+{
+ uint64_t buf_addr;
+ uint32_t tlen = 0;
+ uint32_t buf_avail, buf_offset, buf_len;
+ uint32_t mbuf_avail, mbuf_offset;
+ uint32_t cpy_len, cpy_threshold;
+ /* A counter to avoid desc dead loop chain */
+ uint16_t vec_idx = 0;
+ int tvec_idx = 0;
+ struct rte_mbuf *cur = m, *prev = m;
+ struct virtio_net_hdr tmp_hdr;
+ struct virtio_net_hdr *hdr = NULL;
+ struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
+
+ buf_addr = buf_vec[vec_idx].buf_addr;
+ buf_len = buf_vec[vec_idx].buf_len;
+
+ if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1))
+ return -1;
+
+ cpy_threshold = vq->async_threshold;
+
+ if (virtio_net_with_host_offload(dev)) {
+ if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
+ /*
+ * No luck, the virtio-net header doesn't fit
+ * in a contiguous virtual area.
+ */
+ copy_vnet_hdr_from_desc(&tmp_hdr, buf_vec);
+ hdr = &tmp_hdr;
+ } else {
+ hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
+ }
+ }
+
+ /*
+ * A virtio driver normally uses at least 2 desc buffers
+ * for Tx: the first for storing the header, and others
+ * for storing the data.
+ */
+ if (unlikely(buf_len < dev->vhost_hlen)) {
+ buf_offset = dev->vhost_hlen - buf_len;
+ vec_idx++;
+ buf_addr = buf_vec[vec_idx].buf_addr;
+ buf_len = buf_vec[vec_idx].buf_len;
+ buf_avail = buf_len - buf_offset;
+ } else if (buf_len == dev->vhost_hlen) {
+ if (unlikely(++vec_idx >= nr_vec))
+ return -1;
+ buf_addr = buf_vec[vec_idx].buf_addr;
+ buf_len = buf_vec[vec_idx].buf_len;
+
+ buf_offset = 0;
+ buf_avail = buf_len;
+ } else {
+ buf_offset = dev->vhost_hlen;
+ buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
+ }
+
+ PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
+ (uint32_t)buf_avail, 0);
+
+ mbuf_offset = 0;
+ mbuf_avail = m->buf_len - RTE_PKTMBUF_HEADROOM;
+ while (1) {
+ cpy_len = RTE_MIN(buf_avail, mbuf_avail);
+
+ if (cpy_len >= cpy_threshold) {
+ async_fill_vec(src_iovec + tvec_idx,
+ (void *)((uintptr_t)(buf_addr + buf_offset)),
+ (size_t)cpy_len);
+ async_fill_vec(dst_iovec + tvec_idx,
+ rte_pktmbuf_mtod_offset(cur,
+ void *, mbuf_offset),
+ (size_t)cpy_len);
+ tvec_idx++;
+ tlen += cpy_len;
+ } else if (likely(cpy_len > MAX_BATCH_LEN ||
+ vq->batch_copy_nb_elems >= vq->size ||
+ (hdr && cur == m))) {
+ rte_memcpy(rte_pktmbuf_mtod_offset(cur,
+ void *, mbuf_offset),
+ (void *)((uintptr_t)(buf_addr + buf_offset)),
+ cpy_len);
+ } else {
+ batch_copy[vq->batch_copy_nb_elems].dst =
+ rte_pktmbuf_mtod_offset(cur,
+ void *, mbuf_offset);
+ batch_copy[vq->batch_copy_nb_elems].src =
+ (void *)((uintptr_t)(buf_addr + buf_offset));
+ batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
+ vq->batch_copy_nb_elems++;
+ }
+
+ mbuf_avail -= cpy_len;
+ mbuf_offset += cpy_len;
+ buf_avail -= cpy_len;
+ buf_offset += cpy_len;
+
+ /* This buf reaches to its end, get the next one */
+ if (buf_avail == 0) {
+ if (++vec_idx >= nr_vec)
+ break;
+
+ buf_addr = buf_vec[vec_idx].buf_addr;
+ buf_len = buf_vec[vec_idx].buf_len;
+
+ buf_offset = 0;
+ buf_avail = buf_len;
+
+ PRINT_PACKET(dev, (uintptr_t)buf_addr,
+ (uint32_t)buf_avail, 0);
+ }
+
+ /*
+ * This mbuf reaches to its end, get a new one
+ * to hold more data.
+ */
+ if (mbuf_avail == 0) {
+ cur = rte_pktmbuf_alloc(mbuf_pool);
+ if (unlikely(cur == NULL)) {
+ VHOST_LOG_DATA(ERR, "Failed to "
+ "allocate memory for mbuf.\n");
+ return -1;
+ }
+
+ prev->next = cur;
+ prev->data_len = mbuf_offset;
+ m->nb_segs += 1;
+ m->pkt_len += mbuf_offset;
+ prev = cur;
+
+ mbuf_offset = 0;
+ mbuf_avail = cur->buf_len - RTE_PKTMBUF_HEADROOM;
+ }
+ }
+
+ prev->data_len = mbuf_offset;
+ m->pkt_len += mbuf_offset;
+
+ if (hdr && tlen) {
+ nethdr->valid = true;
+ nethdr->hdr = *hdr;
+ } else if (hdr)
+ vhost_dequeue_offload(hdr, m, legacy_ol_flags);
+
+ if (tlen) {
+ async_fill_iter(src_it, tlen, src_iovec, tvec_idx);
+ async_fill_iter(dst_it, tlen, dst_iovec, tvec_idx);
+ } else
+ src_it->count = 0;
+
+ return 0;
+}
+
+static __rte_always_inline uint16_t
+async_poll_dequeue_completed_split(struct virtio_net *dev,
+ struct vhost_virtqueue *vq, uint16_t queue_id,
+ struct rte_mbuf **pkts, uint16_t count, bool legacy_ol_flags)
+{
+ uint16_t n_pkts_cpl = 0, n_pkts_put = 0;
+ uint16_t start_idx, pkt_idx, from;
+ struct async_inflight_info *pkts_info;
+
+ pkt_idx = vq->async_pkts_idx & (vq->size - 1);
+ pkts_info = vq->async_pkts_info;
+ start_idx = virtio_dev_rx_async_get_info_idx(pkt_idx, vq->size,
+ vq->async_pkts_inflight_n);
+
+ if (count > vq->async_last_pkts_n) {
+ n_pkts_cpl = vq->async_ops.check_completed_copies(dev->vid,
+ queue_id, 0, count - vq->async_last_pkts_n);
+ }
+
+ n_pkts_cpl += vq->async_last_pkts_n;
+ if (unlikely(n_pkts_cpl == 0))
+ return 0;
+
+ n_pkts_put = RTE_MIN(count, n_pkts_cpl);
+
+ for (pkt_idx = 0; pkt_idx < n_pkts_put; pkt_idx++) {
+ from = (start_idx + pkt_idx) & (vq->size - 1);
+ pkts[pkt_idx] = pkts_info[from].mbuf;
+
+ if (pkts_info[from].nethdr.valid) {
+ vhost_dequeue_offload(&pkts_info[from].nethdr.hdr,
+ pkts[pkt_idx], legacy_ol_flags);
+ }
+
+ from = vq->last_async_desc_idx_split & (vq->size - 1);
+ update_shadow_used_ring_split(vq,
+ vq->async_descs_split[from].id, 0);
+ vq->last_async_desc_idx_split++;
+ }
+ vq->async_last_pkts_n = n_pkts_cpl - n_pkts_put;
+
+ if (n_pkts_put)
+ vq->async_pkts_inflight_n -= n_pkts_put;
+
+ return n_pkts_put;
+}
+
+static __rte_always_inline uint16_t
+virtio_dev_tx_async_split(struct virtio_net *dev,
+ struct vhost_virtqueue *vq, uint16_t queue_id,
+ struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+ uint16_t count, bool legacy_ol_flags)
+{
+ static bool allocerr_warned;
+ uint16_t pkt_idx;
+ uint16_t free_entries;
+ uint16_t slot_idx = 0;
+ uint16_t segs_await = 0;
+ uint16_t nr_done_pkts = 0, nr_async_pkts = 0;
+ uint16_t nr_async_burst = 0;
+ uint16_t pkt_err = 0;
+ uint16_t iovec_idx = 0, it_idx = 0;
+
+ struct rte_vhost_iov_iter *it_pool = vq->it_pool;
+ struct iovec *vec_pool = vq->vec_pool;
+ struct iovec *src_iovec = vec_pool;
+ struct iovec *dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+ struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
+ struct async_inflight_info *pkts_info = vq->async_pkts_info;
+
+ struct async_pkt_index {
+ uint16_t last_avail_idx;
+ } async_pkts_log[MAX_PKT_BURST];
+
+ nr_done_pkts = async_poll_dequeue_completed_split(dev, vq, queue_id,
+ pkts, count, legacy_ol_flags);
+ if (unlikely(nr_done_pkts == count))
+ goto out;
+
+ /**
+ * The ordering between avail index and
+ * desc reads needs to be enforced.
+ */
+ free_entries = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE) -
+ vq->last_avail_idx;
+ if (free_entries == 0)
+ goto out;
+
+ rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
+
+ count = RTE_MIN(count - nr_done_pkts, MAX_PKT_BURST);
+ count = RTE_MIN(count, free_entries);
+ VHOST_LOG_DATA(DEBUG, "(%d) about to dequeue %u buffers\n",
+ dev->vid, count);
+
+ for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
+ uint16_t head_idx = 0;
+ uint16_t nr_vec = 0;
+ uint32_t buf_len;
+ int err;
+ struct buf_vector buf_vec[BUF_VECTOR_MAX];
+ struct rte_mbuf *pkt;
+
+ if (unlikely(fill_vec_buf_split(dev, vq, vq->last_avail_idx,
+ &nr_vec, buf_vec,
+ &head_idx, &buf_len,
+ VHOST_ACCESS_RO) < 0))
+ break;
+
+ pkt = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len);
+ if (unlikely(pkt == NULL)) {
+ /**
+ * mbuf allocation fails for jumbo packets when external
+ * buffer allocation is not allowed and linear buffer
+ * is required. Drop this packet.
+ */
+ if (!allocerr_warned) {
+ VHOST_LOG_DATA(ERR,
+ "Failed mbuf alloc of size %d from %s on %s.\n",
+ buf_len, mbuf_pool->name, dev->ifname);
+ allocerr_warned = true;
+ }
+ break;
+ }
+
+ slot_idx = (vq->async_pkts_idx + nr_async_pkts) &
+ (vq->size - 1);
+ err = async_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkt,
+ mbuf_pool, &src_iovec[iovec_idx],
+ &dst_iovec[iovec_idx], &it_pool[it_idx],
+ &it_pool[it_idx + 1],
+ &pkts_info[slot_idx].nethdr, legacy_ol_flags);
+ if (unlikely(err)) {
+ rte_pktmbuf_free(pkt);
+ if (!allocerr_warned) {
+ VHOST_LOG_DATA(ERR,
+ "Failed to copy desc to mbuf on %s.\n",
+ dev->ifname);
+ allocerr_warned = true;
+ }
+ break;
+ }
+
+ if (it_pool[it_idx].count) {
+ uint16_t to = vq->async_desc_idx_split & (vq->size - 1);
+
+ async_fill_desc(&tdes[nr_async_burst], &it_pool[it_idx],
+ &it_pool[it_idx + 1]);
+ pkts_info[slot_idx].mbuf = pkt;
+ async_pkts_log[nr_async_pkts++].last_avail_idx =
+ vq->last_avail_idx;
+ nr_async_burst++;
+ iovec_idx += it_pool[it_idx].nr_segs;
+ it_idx += 2;
+ segs_await += it_pool[it_idx].nr_segs;
+
+ /* keep used desc */
+ vq->async_descs_split[to].id = head_idx;
+ vq->async_descs_split[to].len = 0;
+ vq->async_desc_idx_split++;
+ } else {
+ update_shadow_used_ring_split(vq, head_idx, 0);
+ pkts[nr_done_pkts++] = pkt;
+ }
+
+ vq->last_avail_idx++;
+
+ if (unlikely((nr_async_burst >= VHOST_ASYNC_BATCH_THRESHOLD) ||
+ ((VHOST_MAX_ASYNC_VEC >> 1) -
+ segs_await < BUF_VECTOR_MAX))) {
+ uint16_t nr_pkts;
+
+ nr_pkts = vq->async_ops.transfer_data(dev->vid,
+ queue_id, tdes, 0, nr_async_burst);
+ src_iovec = vec_pool;
+ dst_iovec = vec_pool + (VHOST_MAX_ASYNC_VEC >> 1);
+ it_idx = 0;
+ segs_await = 0;
+ vq->async_pkts_inflight_n += nr_pkts;
+
+ if (unlikely(nr_pkts < nr_async_burst)) {
+ pkt_err = nr_async_burst - nr_pkts;
+ nr_async_burst = 0;
+ break;
+ }
+ nr_async_burst = 0;
+ }
+ }
+
+ if (nr_async_burst) {
+ uint32_t nr_pkts;
+
+ nr_pkts = vq->async_ops.transfer_data(dev->vid, queue_id,
+ tdes, 0, nr_async_burst);
+ vq->async_pkts_inflight_n += nr_pkts;
+
+ if (unlikely(nr_pkts < nr_async_burst))
+ pkt_err = nr_async_burst - nr_pkts;
+ }
+
+ do_data_copy_dequeue(vq);
+
+ if (unlikely(pkt_err)) {
+ uint16_t nr_err_dma = pkt_err;
+ uint16_t nr_err_sw;
+
+ nr_async_pkts -= nr_err_dma;
+
+ /**
+ * revert shadow used ring and free pktmbufs for
+ * CPU-copied pkts after the first DMA-error pkt.
+ */
+ nr_err_sw = vq->last_avail_idx -
+ async_pkts_log[nr_async_pkts].last_avail_idx -
+ nr_err_dma;
+ vq->shadow_used_idx -= nr_err_sw;
+ while (nr_err_sw-- > 0)
+ rte_pktmbuf_free(pkts[--nr_done_pkts]);
+
+ /**
+ * recover DMA-copy related structures and free pktmbufs
+ * for DMA-error pkts.
+ */
+ vq->async_desc_idx_split -= nr_err_dma;
+ while (nr_err_dma-- > 0) {
+ rte_pktmbuf_free(
+ pkts_info[slot_idx & (vq->size - 1)].mbuf);
+ slot_idx--;
+ }
+
+ /* recover available ring */
+ vq->last_avail_idx =
+ async_pkts_log[nr_async_pkts].last_avail_idx;
+ }
+
+ vq->async_pkts_idx += nr_async_pkts;
+
+out:
+ if (likely(vq->shadow_used_idx)) {
+ flush_shadow_used_ring_split(dev, vq);
+ vhost_vring_call_split(dev, vq);
+ }
+
+ return nr_done_pkts;
+}
+
+__rte_noinline
+static uint16_t
+virtio_dev_tx_async_split_legacy(struct virtio_net *dev,
+ struct vhost_virtqueue *vq, uint16_t queue_id,
+ struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+ uint16_t count)
+{
+ return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
+ pkts, count, true);
+}
+
+__rte_noinline
+static uint16_t
+virtio_dev_tx_async_split_compliant(struct virtio_net *dev,
+ struct vhost_virtqueue *vq, uint16_t queue_id,
+ struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts,
+ uint16_t count)
+{
+ return virtio_dev_tx_async_split(dev, vq, queue_id, mbuf_pool,
+ pkts, count, false);
+}
+
+uint16_t
+rte_vhost_try_dequeue_burst(int vid, uint16_t queue_id,
+ struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count,
+ int *nr_inflight)
+{
+ struct virtio_net *dev;
+ struct rte_mbuf *rarp_mbuf = NULL;
+ struct vhost_virtqueue *vq;
+ int16_t success = 1;
+
+ *nr_inflight = -1;
+
+ dev = get_device(vid);
+ if (!dev)
+ return 0;
+
+ if (unlikely(!(dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET))) {
+ VHOST_LOG_DATA(ERR,
+ "(%d) %s: built-in vhost net backend is disabled.\n",
+ dev->vid, __func__);
+ return 0;
+ }
+
+ if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
+ VHOST_LOG_DATA(ERR,
+ "(%d) %s: invalid virtqueue idx %d.\n",
+ dev->vid, __func__, queue_id);
+ return 0;
+ }
+
+ vq = dev->virtqueue[queue_id];
+
+ if (unlikely(rte_spinlock_trylock(&vq->access_lock) == 0))
+ return 0;
+
+ if (unlikely(vq->enabled == 0)) {
+ count = 0;
+ goto out_access_unlock;
+ }
+
+ if (unlikely(!vq->async_registered)) {
+ VHOST_LOG_DATA(ERR, "(%d) %s: async not registered for queue id %d.\n",
+ dev->vid, __func__, queue_id);
+ count = 0;
+ goto out_access_unlock;
+ }
+
+ if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+ vhost_user_iotlb_rd_lock(vq);
+
+ if (unlikely(vq->access_ok == 0))
+ if (unlikely(vring_translate(dev, vq) < 0)) {
+ count = 0;
+ goto out_access_unlock;
+ }
+
+ /*
+ * Construct a RARP broadcast packet, and inject it to the "pkts"
+ * array, to looks like that guest actually send such packet.
+ *
+ * Check user_send_rarp() for more information.
+ *
+ * broadcast_rarp shares a cacheline in the virtio_net structure
+ * with some fields that are accessed during enqueue and
+ * __atomic_compare_exchange_n causes a write if performed compare
+ * and exchange. This could result in false sharing between enqueue
+ * and dequeue.
+ *
+ * Prevent unnecessary false sharing by reading broadcast_rarp first
+ * and only performing compare and exchange if the read indicates it
+ * is likely to be set.
+ */
+ if (unlikely(__atomic_load_n(&dev->broadcast_rarp, __ATOMIC_ACQUIRE) &&
+ __atomic_compare_exchange_n(&dev->broadcast_rarp,
+ &success, 0, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED))) {
+
+ rarp_mbuf = rte_net_make_rarp_packet(mbuf_pool, &dev->mac);
+ if (rarp_mbuf == NULL) {
+ VHOST_LOG_DATA(ERR, "Failed to make RARP packet.\n");
+ count = 0;
+ goto out;
+ }
+ count -= 1;
+ }
+
+ if (unlikely(vq_is_packed(dev)))
+ return 0;
+
+ if (dev->flags & VIRTIO_DEV_LEGACY_OL_FLAGS)
+ count = virtio_dev_tx_async_split_legacy(dev, vq, queue_id,
+ mbuf_pool, pkts, count);
+ else
+ count = virtio_dev_tx_async_split_compliant(dev, vq, queue_id,
+ mbuf_pool, pkts, count);
+
+out:
+ *nr_inflight = vq->async_pkts_inflight_n;
+
+ if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
+ vhost_user_iotlb_rd_unlock(vq);
+
+out_access_unlock:
+ rte_spinlock_unlock(&vq->access_lock);
+
+ if (unlikely(rarp_mbuf != NULL)) {
+ /*
+ * Inject it to the head of "pkts" array, so that switch's mac
+ * learning table will get updated first.
+ */
+ memmove(&pkts[1], pkts, count * sizeof(struct rte_mbuf *));
+ pkts[0] = rarp_mbuf;
+ count += 1;
+ }
+
+ return count;
+}