[v7,06/13] vhost: add packed ring batch dequeue

Message ID 20191021154016.16274-7-yong.liu@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Maxime Coquelin
Headers
Series vhost packed ring performance optimization |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Marvin Liu Oct. 21, 2019, 3:40 p.m. UTC
  Add a batch dequeue function for the packed ring, mirroring the batch
enqueue function. The batch dequeue function does not support chained
descriptors; the single-packet dequeue function handles those.

Signed-off-by: Marvin Liu <yong.liu@intel.com>
  

Comments

Maxime Coquelin Oct. 21, 2019, 9:47 a.m. UTC | #1
On 10/21/19 5:40 PM, Marvin Liu wrote:
> Add batch dequeue function like enqueue function for packed ring, batch
> dequeue function will not support chained descriptors, single packet
> dequeue function will handle it.
> 
> Signed-off-by: Marvin Liu <yong.liu@intel.com>
> 
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index a2b9221e0..67724c342 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -39,6 +39,9 @@
>  
>  #define VHOST_LOG_CACHE_NR 32
>  
> +#define PACKED_DESC_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | \
> +					 VRING_DESC_F_INDIRECT)
> +
>  #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
>  			    sizeof(struct vring_packed_desc))
>  #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 317be1aed..f13fcafbb 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -1635,6 +1635,114 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	return i;
>  }
>  
> +static __rte_always_inline int
> +vhost_reserve_avail_batch_packed(struct virtio_net *dev,
> +				 struct vhost_virtqueue *vq,
> +				 struct rte_mempool *mbuf_pool,
> +				 struct rte_mbuf **pkts,
> +				 uint16_t avail_idx,
> +				 uintptr_t *desc_addrs,
> +				 uint16_t *ids)
> +{
> +	bool wrap = vq->avail_wrap_counter;
> +	struct vring_packed_desc *descs = vq->desc_packed;
> +	struct virtio_net_hdr *hdr;
> +	uint64_t lens[PACKED_BATCH_SIZE];
> +	uint64_t buf_lens[PACKED_BATCH_SIZE];
> +	uint32_t buf_offset = dev->vhost_hlen;
> +	uint16_t flags, i;
> +
> +	if (unlikely(avail_idx & PACKED_BATCH_MASK))
> +		return -1;
> +	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
> +		return -1;
> +
> +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +		flags = descs[avail_idx + i].flags;
> +		if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) ||
> +			     (wrap == !!(flags & VRING_DESC_F_USED))  ||
> +			     (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG)))
> +			return -1;
> +	}
> +
> +	rte_smp_rmb();
> +
> +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
> +		lens[i] = descs[avail_idx + i].len;
> +
> +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +		desc_addrs[i] = vhost_iova_to_vva(dev, vq,
> +						  descs[avail_idx + i].addr,
> +						  &lens[i], VHOST_ACCESS_RW);
> +	}
> +
> +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +		if (unlikely((lens[i] != descs[avail_idx + i].len)))
> +			return -1;
> +	}
> +
> +	if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, PACKED_BATCH_SIZE))

Same here, you may want to create a variant of Flavio's
virtio_dev_pktmbuf_alloc for bulk allocations.

> +		return -1;
> +
> +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
> +		buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off;
> +
> +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +		if (unlikely(buf_lens[i] < (lens[i] - buf_offset)))
> +			goto free_buf;
> +	}
> +
> +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +		pkts[i]->pkt_len = descs[avail_idx + i].len - buf_offset;
> +		pkts[i]->data_len = pkts[i]->pkt_len;
> +		ids[i] = descs[avail_idx + i].id;
> +	}
> +
> +	if (virtio_net_with_host_offload(dev)) {
> +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> +			hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
> +			vhost_dequeue_offload(hdr, pkts[i]);
> +		}
> +	}
> +
> +	return 0;
> +
> +free_buf:
> +	for (i = 0; i < PACKED_BATCH_SIZE; i++)
> +		rte_pktmbuf_free(pkts[i]);
> +
> +	return -1;
> +}
> +
> +static __rte_unused int
> +virtio_dev_tx_batch_packed(struct virtio_net *dev,
> +			   struct vhost_virtqueue *vq,
> +			   struct rte_mempool *mbuf_pool,
> +			   struct rte_mbuf **pkts)
> +{
> +	uint16_t avail_idx = vq->last_avail_idx;
> +	uint32_t buf_offset = dev->vhost_hlen;
> +	uintptr_t desc_addrs[PACKED_BATCH_SIZE];
> +	uint16_t ids[PACKED_BATCH_SIZE];
> +	uint16_t i;
> +
> +	if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
> +					     avail_idx, desc_addrs, ids))
> +		return -1;
> +
> +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
> +		rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
> +
> +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
> +		rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
> +			   (void *)(uintptr_t)(desc_addrs[i] + buf_offset),
> +			   pkts[i]->pkt_len);
> +
> +	vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
> +
> +	return 0;
> +}
> +
>  static __rte_always_inline int
>  vhost_dequeue_single_packed(struct virtio_net *dev,
>  			    struct vhost_virtqueue *vq,
>
  
Marvin Liu Oct. 21, 2019, 2:29 p.m. UTC | #2
Thanks Maxime, this has been modified in v8.

> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Monday, October 21, 2019 5:47 PM
> To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>; Wang,
> Zhihong <zhihong.wang@intel.com>; stephen@networkplumber.org;
> gavin.hu@arm.com
> Cc: dev@dpdk.org
> Subject: Re: [PATCH v7 06/13] vhost: add packed ring batch dequeue
> 
> 
> 
> On 10/21/19 5:40 PM, Marvin Liu wrote:
> > Add batch dequeue function like enqueue function for packed ring, batch
> > dequeue function will not support chained descriptors, single packet
> > dequeue function will handle it.
> >
> > Signed-off-by: Marvin Liu <yong.liu@intel.com>
> >
> > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> > index a2b9221e0..67724c342 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -39,6 +39,9 @@
> >
> >  #define VHOST_LOG_CACHE_NR 32
> >
> > +#define PACKED_DESC_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | \
> > +					 VRING_DESC_F_INDIRECT)
> > +
> >  #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
> >  			    sizeof(struct vring_packed_desc))
> >  #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
> > diff --git a/lib/librte_vhost/virtio_net.c
> b/lib/librte_vhost/virtio_net.c
> > index 317be1aed..f13fcafbb 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -1635,6 +1635,114 @@ virtio_dev_tx_split(struct virtio_net *dev,
> struct vhost_virtqueue *vq,
> >  	return i;
> >  }
> >
> > +static __rte_always_inline int
> > +vhost_reserve_avail_batch_packed(struct virtio_net *dev,
> > +				 struct vhost_virtqueue *vq,
> > +				 struct rte_mempool *mbuf_pool,
> > +				 struct rte_mbuf **pkts,
> > +				 uint16_t avail_idx,
> > +				 uintptr_t *desc_addrs,
> > +				 uint16_t *ids)
> > +{
> > +	bool wrap = vq->avail_wrap_counter;
> > +	struct vring_packed_desc *descs = vq->desc_packed;
> > +	struct virtio_net_hdr *hdr;
> > +	uint64_t lens[PACKED_BATCH_SIZE];
> > +	uint64_t buf_lens[PACKED_BATCH_SIZE];
> > +	uint32_t buf_offset = dev->vhost_hlen;
> > +	uint16_t flags, i;
> > +
> > +	if (unlikely(avail_idx & PACKED_BATCH_MASK))
> > +		return -1;
> > +	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
> > +		return -1;
> > +
> > +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> > +		flags = descs[avail_idx + i].flags;
> > +		if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) ||
> > +			     (wrap == !!(flags & VRING_DESC_F_USED))  ||
> > +			     (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG)))
> > +			return -1;
> > +	}
> > +
> > +	rte_smp_rmb();
> > +
> > +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
> > +		lens[i] = descs[avail_idx + i].len;
> > +
> > +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> > +		desc_addrs[i] = vhost_iova_to_vva(dev, vq,
> > +						  descs[avail_idx + i].addr,
> > +						  &lens[i], VHOST_ACCESS_RW);
> > +	}
> > +
> > +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> > +		if (unlikely((lens[i] != descs[avail_idx + i].len)))
> > +			return -1;
> > +	}
> > +
> > +	if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, PACKED_BATCH_SIZE))
> 
> Same here, you may want to create a variant of Flavio's
> virtio_dev_pktmbuf_alloc for bulk allocations.

> 
> > +		return -1;
> > +
> > +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
> > +		buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off;
> > +
> > +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> > +		if (unlikely(buf_lens[i] < (lens[i] - buf_offset)))
> > +			goto free_buf;
> > +	}
> > +
> > +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> > +		pkts[i]->pkt_len = descs[avail_idx + i].len - buf_offset;
> > +		pkts[i]->data_len = pkts[i]->pkt_len;
> > +		ids[i] = descs[avail_idx + i].id;
> > +	}
> > +
> > +	if (virtio_net_with_host_offload(dev)) {
> > +		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> > +			hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
> > +			vhost_dequeue_offload(hdr, pkts[i]);
> > +		}
> > +	}
> > +
> > +	return 0;
> > +
> > +free_buf:
> > +	for (i = 0; i < PACKED_BATCH_SIZE; i++)
> > +		rte_pktmbuf_free(pkts[i]);
> > +
> > +	return -1;
> > +}
> > +
> > +static __rte_unused int
> > +virtio_dev_tx_batch_packed(struct virtio_net *dev,
> > +			   struct vhost_virtqueue *vq,
> > +			   struct rte_mempool *mbuf_pool,
> > +			   struct rte_mbuf **pkts)
> > +{
> > +	uint16_t avail_idx = vq->last_avail_idx;
> > +	uint32_t buf_offset = dev->vhost_hlen;
> > +	uintptr_t desc_addrs[PACKED_BATCH_SIZE];
> > +	uint16_t ids[PACKED_BATCH_SIZE];
> > +	uint16_t i;
> > +
> > +	if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
> > +					     avail_idx, desc_addrs, ids))
> > +		return -1;
> > +
> > +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
> > +		rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
> > +
> > +	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
> > +		rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
> > +			   (void *)(uintptr_t)(desc_addrs[i] + buf_offset),
> > +			   pkts[i]->pkt_len);
> > +
> > +	vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
> > +
> > +	return 0;
> > +}
> > +
> >  static __rte_always_inline int
> >  vhost_dequeue_single_packed(struct virtio_net *dev,
> >  			    struct vhost_virtqueue *vq,
> >
  

Patch

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index a2b9221e0..67724c342 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -39,6 +39,9 @@ 
 
 #define VHOST_LOG_CACHE_NR 32
 
+#define PACKED_DESC_SINGLE_DEQUEUE_FLAG (VRING_DESC_F_NEXT | \
+					 VRING_DESC_F_INDIRECT)
+
 #define PACKED_BATCH_SIZE (RTE_CACHE_LINE_SIZE / \
 			    sizeof(struct vring_packed_desc))
 #define PACKED_BATCH_MASK (PACKED_BATCH_SIZE - 1)
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 317be1aed..f13fcafbb 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1635,6 +1635,114 @@  virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	return i;
 }
 
+static __rte_always_inline int
+vhost_reserve_avail_batch_packed(struct virtio_net *dev,
+				 struct vhost_virtqueue *vq,
+				 struct rte_mempool *mbuf_pool,
+				 struct rte_mbuf **pkts,
+				 uint16_t avail_idx,
+				 uintptr_t *desc_addrs,
+				 uint16_t *ids)
+{
+	bool wrap = vq->avail_wrap_counter;
+	struct vring_packed_desc *descs = vq->desc_packed;
+	struct virtio_net_hdr *hdr;
+	uint64_t lens[PACKED_BATCH_SIZE];
+	uint64_t buf_lens[PACKED_BATCH_SIZE];
+	uint32_t buf_offset = dev->vhost_hlen;
+	uint16_t flags, i;
+
+	if (unlikely(avail_idx & PACKED_BATCH_MASK))
+		return -1;
+	if (unlikely((avail_idx + PACKED_BATCH_SIZE) > vq->size))
+		return -1;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		flags = descs[avail_idx + i].flags;
+		if (unlikely((wrap != !!(flags & VRING_DESC_F_AVAIL)) ||
+			     (wrap == !!(flags & VRING_DESC_F_USED))  ||
+			     (flags & PACKED_DESC_SINGLE_DEQUEUE_FLAG)))
+			return -1;
+	}
+
+	rte_smp_rmb();
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		lens[i] = descs[avail_idx + i].len;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		desc_addrs[i] = vhost_iova_to_vva(dev, vq,
+						  descs[avail_idx + i].addr,
+						  &lens[i], VHOST_ACCESS_RW);
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely((lens[i] != descs[avail_idx + i].len)))
+			return -1;
+	}
+
+	if (rte_pktmbuf_alloc_bulk(mbuf_pool, pkts, PACKED_BATCH_SIZE))
+		return -1;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		buf_lens[i] = pkts[i]->buf_len - pkts[i]->data_off;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely(buf_lens[i] < (lens[i] - buf_offset)))
+			goto free_buf;
+	}
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		pkts[i]->pkt_len = descs[avail_idx + i].len - buf_offset;
+		pkts[i]->data_len = pkts[i]->pkt_len;
+		ids[i] = descs[avail_idx + i].id;
+	}
+
+	if (virtio_net_with_host_offload(dev)) {
+		vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+			hdr = (struct virtio_net_hdr *)(desc_addrs[i]);
+			vhost_dequeue_offload(hdr, pkts[i]);
+		}
+	}
+
+	return 0;
+
+free_buf:
+	for (i = 0; i < PACKED_BATCH_SIZE; i++)
+		rte_pktmbuf_free(pkts[i]);
+
+	return -1;
+}
+
+static __rte_unused int
+virtio_dev_tx_batch_packed(struct virtio_net *dev,
+			   struct vhost_virtqueue *vq,
+			   struct rte_mempool *mbuf_pool,
+			   struct rte_mbuf **pkts)
+{
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint32_t buf_offset = dev->vhost_hlen;
+	uintptr_t desc_addrs[PACKED_BATCH_SIZE];
+	uint16_t ids[PACKED_BATCH_SIZE];
+	uint16_t i;
+
+	if (vhost_reserve_avail_batch_packed(dev, vq, mbuf_pool, pkts,
+					     avail_idx, desc_addrs, ids))
+		return -1;
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
+
+	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE)
+		rte_memcpy(rte_pktmbuf_mtod_offset(pkts[i], void *, 0),
+			   (void *)(uintptr_t)(desc_addrs[i] + buf_offset),
+			   pkts[i]->pkt_len);
+
+	vq_inc_last_avail_packed(vq, PACKED_BATCH_SIZE);
+
+	return 0;
+}
+
 static __rte_always_inline int
 vhost_dequeue_single_packed(struct virtio_net *dev,
 			    struct vhost_virtqueue *vq,