[dpdk-dev,v3] vhost: Add indirect descriptors support to the TX path

Message ID 1474619303-16709-1-git-send-email-maxime.coquelin@redhat.com (mailing list archive)
State Superseded, archived

Commit Message

Maxime Coquelin Sept. 23, 2016, 8:28 a.m. UTC
  Indirect descriptors are usually supported by virtio-net devices,
allowing a larger number of requests to be dispatched.

When the virtio device sends a packet using indirect descriptors,
only one slot is used in the ring, even for large packets.
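
For reference, this is the standard virtio ring descriptor layout at
play here (a minimal sketch matching the virtio_ring.h definitions):

struct vring_desc {
	uint64_t addr;  /* guest-physical buffer address */
	uint32_t len;   /* buffer length, in bytes */
	uint16_t flags; /* VRING_DESC_F_* below */
	uint16_t next;  /* next descriptor index, if F_NEXT is set */
};

#define VRING_DESC_F_NEXT     1 /* chained to the 'next' descriptor */
#define VRING_DESC_F_WRITE    2 /* device-writable buffer (RX) */
#define VRING_DESC_F_INDIRECT 4 /* 'addr' points to a descriptor table */

With F_INDIRECT set, 'addr' points to a separate table of vring_desc
entries and 'len' gives the size of that table in bytes, so a whole
chain collapses into a single ring slot.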

The main effect is to improve the 0% packet loss benchmark.
A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
(fwd io for host, macswap for VM) on the DUT, shows a +50% gain
for zero loss.

On the downside, a micro-benchmark using testpmd txonly in the VM
and rxonly on the host shows a loss between 1 and 4%. But depending
on the needs, the feature can be disabled at VM boot time by passing
the indirect_desc=off argument to the vhost-user device in QEMU.
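
For example, the disable switch could look like this on the QEMU
command line (a sketch; the socket path and ids are placeholders):

qemu-system-x86_64 ... \
	-chardev socket,id=char0,path=/tmp/vhost-user.sock \
	-netdev type=vhost-user,id=net0,chardev=char0 \
	-device virtio-net-pci,netdev=net0,indirect_desc=off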

Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
Changes since v2:
=================
 - Revert back to not checking feature flag to be aligned with
kernel implementation
 - Ensure we don't have nested indirect descriptors
 - Ensure the indirect desc address is valid, to protect against
malicious guests

Changes since RFC:
=================
 - Enrich commit message with figures
 - Rebased on top of dpdk-next-virtio's master
 - Add feature check to ensure we don't receive an indirect desc
if not supported by the virtio driver

Comments

Michael S. Tsirkin Sept. 23, 2016, 3:49 p.m. UTC | #1
On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:
> Indirect descriptors are usually supported by virtio-net devices,
> allowing to dispatch a larger number of requests.
> 
> When the virtio device sends a packet using indirect descriptors,
> only one slot is used in the ring, even for large packets.
> 
> The main effect is to improve the 0% packet loss benchmark.
> A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
> (fwd io for host, macswap for VM) on DUT shows a +50% gain for
> zero loss.
> 
> On the downside, micro-benchmark using testpmd txonly in VM and
> rxonly on host shows a loss between 1 and 4%.i But depending on
> the needs, feature can be disabled at VM boot time by passing
> indirect_desc=off argument to vhost-user device in Qemu.

Even better, change guest pmd to only use indirect
descriptors when this makes sense (e.g. sufficiently
large packets).

I would be very interested to know when it makes
sense.

The feature is there; it's up to the guest whether
to use it.


> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> ---
> Changes since v2:
> =================
>  - Revert back to not checking feature flag to be aligned with
> kernel implementation
>  - Ensure we don't have nested indirect descriptors
>  - Ensure the indirect desc address is valid, to protect against
> malicious guests
> 
> Changes since RFC:
> =================
>  - Enrich commit message with figures
>  - Rebased on top of dpdk-next-virtio's master
>  - Add feature check to ensure we don't receive an indirect desc
> if not supported by the virtio driver
> 
>  lib/librte_vhost/vhost.c      |  3 ++-
>  lib/librte_vhost/virtio_net.c | 41 +++++++++++++++++++++++++++++++----------
>  2 files changed, 33 insertions(+), 11 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index 46095c3..30bb0ce 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -65,7 +65,8 @@
>  				(1ULL << VIRTIO_NET_F_CSUM)    | \
>  				(1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
>  				(1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
> -				(1ULL << VIRTIO_NET_F_GUEST_TSO6))
> +				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
> +				(1ULL << VIRTIO_RING_F_INDIRECT_DESC))
>  
>  uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
>  
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 8a151af..2e0a587 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -679,8 +679,8 @@ make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
>  }
>  
>  static inline int __attribute__((always_inline))
> -copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> -		  struct rte_mbuf *m, uint16_t desc_idx,
> +copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
> +		  uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx,
>  		  struct rte_mempool *mbuf_pool)
>  {
>  	struct vring_desc *desc;
> @@ -693,8 +693,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	/* A counter to avoid desc dead loop chain */
>  	uint32_t nr_desc = 1;
>  
> -	desc = &vq->desc[desc_idx];
> -	if (unlikely(desc->len < dev->vhost_hlen))
> +	desc = &descs[desc_idx];
> +	if (unlikely((desc->len < dev->vhost_hlen)) ||
> +			(desc->flags & VRING_DESC_F_INDIRECT))
>  		return -1;
>  
>  	desc_addr = gpa_to_vva(dev, desc->addr);
> @@ -711,7 +712,9 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	 */
>  	if (likely((desc->len == dev->vhost_hlen) &&
>  		   (desc->flags & VRING_DESC_F_NEXT) != 0)) {
> -		desc = &vq->desc[desc->next];
> +		desc = &descs[desc->next];
> +		if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
> +			return -1;
>  
>  		desc_addr = gpa_to_vva(dev, desc->addr);
>  		if (unlikely(!desc_addr))


Just to make sure, does this still allow a chain of
direct descriptors ending with an indirect one?
This is legal as per spec.
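
To illustrate the case being asked about, such a chain would look like
this (a sketch; addresses and lengths are hypothetical, flag names as
in the virtio ring ABI):

/* ring entry 0: a direct descriptor, chained to entry 1 */
struct vring_desc d0 = {
	.addr  = hdr_gpa,	/* hypothetical guest-physical address */
	.len   = 12,
	.flags = VRING_DESC_F_NEXT,
	.next  = 1,
};

/* ring entry 1: an indirect descriptor terminating the chain;
 * its 'addr' points to a table of three further descriptors */
struct vring_desc d1 = {
	.addr  = table_gpa,	/* hypothetical */
	.len   = 3 * sizeof(struct vring_desc),
	.flags = VRING_DESC_F_INDIRECT,
};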

> @@ -747,10 +750,12 @@ copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  			if ((desc->flags & VRING_DESC_F_NEXT) == 0)
>  				break;
>  
> -			if (unlikely(desc->next >= vq->size ||
> -				     ++nr_desc > vq->size))
> +			if (unlikely(desc->next >= max_desc ||
> +				     ++nr_desc > max_desc))
> +				return -1;
> +			desc = &descs[desc->next];
> +			if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
>  				return -1;
> -			desc = &vq->desc[desc->next];
>  
>  			desc_addr = gpa_to_vva(dev, desc->addr);
>  			if (unlikely(!desc_addr))
> @@ -878,19 +883,35 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>  	/* Prefetch descriptor index. */
>  	rte_prefetch0(&vq->desc[desc_indexes[0]]);
>  	for (i = 0; i < count; i++) {
> +		struct vring_desc *desc;
> +		uint16_t sz, idx;
>  		int err;
>  
>  		if (likely(i + 1 < count))
>  			rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
>  
> +		if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
> +			desc = (struct vring_desc *)gpa_to_vva(dev,
> +					vq->desc[desc_indexes[i]].addr);
> +			if (unlikely(!desc))
> +				break;
> +
> +			rte_prefetch0(desc);
> +			sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
> +			idx = 0;
> +		} else {
> +			desc = vq->desc;
> +			sz = vq->size;
> +			idx = desc_indexes[i];
> +		}
> +
>  		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
>  		if (unlikely(pkts[i] == NULL)) {
>  			RTE_LOG(ERR, VHOST_DATA,
>  				"Failed to allocate memory for mbuf.\n");
>  			break;
>  		}
> -		err = copy_desc_to_mbuf(dev, vq, pkts[i], desc_indexes[i],
> -					mbuf_pool);
> +		err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
>  		if (unlikely(err)) {
>  			rte_pktmbuf_free(pkts[i]);
>  			break;
> -- 
> 2.7.4

Something that I'm missing here: it's legal for the guest
to add indirect descriptors for RX.
I don't see the handling of RX here though.
I think it's required for spec compliance.
  
Maxime Coquelin Sept. 23, 2016, 6:02 p.m. UTC | #2
On 09/23/2016 05:49 PM, Michael S. Tsirkin wrote:
> On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:
>> Indirect descriptors are usually supported by virtio-net devices,
>> allowing to dispatch a larger number of requests.
>>
>> When the virtio device sends a packet using indirect descriptors,
>> only one slot is used in the ring, even for large packets.
>>
>> The main effect is to improve the 0% packet loss benchmark.
>> A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
>> (fwd io for host, macswap for VM) on DUT shows a +50% gain for
>> zero loss.
>>
>> On the downside, micro-benchmark using testpmd txonly in VM and
>> rxonly on host shows a loss between 1 and 4%.i But depending on
>> the needs, feature can be disabled at VM boot time by passing
>> indirect_desc=off argument to vhost-user device in Qemu.
>
> Even better, change guest pmd to only use indirect
> descriptors when this makes sense (e.g. sufficiently
> large packets).
With the micro-benchmark, the degradation is quite constant regardless
of the packet size.

For PVP, I could not test with packets larger than 64 bytes, as I don't
have a 40G interface, and line rate with 10G is reached rapidly.

> I would be very interested to know when does it make
> sense.
>
> The feature is there, it's up to guest whether to
> use it.
Do you mean the PMD should dynamically detect whether to use indirect
descriptors, or that there should be an option at device init time to
enable or disable the feature?

  
Michael S. Tsirkin Sept. 23, 2016, 6:06 p.m. UTC | #3
On Fri, Sep 23, 2016 at 08:02:27PM +0200, Maxime Coquelin wrote:
> 
> 
> On 09/23/2016 05:49 PM, Michael S. Tsirkin wrote:
> > On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:
> > > Indirect descriptors are usually supported by virtio-net devices,
> > > allowing to dispatch a larger number of requests.
> > > 
> > > When the virtio device sends a packet using indirect descriptors,
> > > only one slot is used in the ring, even for large packets.
> > > 
> > > The main effect is to improve the 0% packet loss benchmark.
> > > A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
> > > (fwd io for host, macswap for VM) on DUT shows a +50% gain for
> > > zero loss.
> > > 
> > > On the downside, micro-benchmark using testpmd txonly in VM and
> > > rxonly on host shows a loss between 1 and 4%.i But depending on
> > > the needs, feature can be disabled at VM boot time by passing
> > > indirect_desc=off argument to vhost-user device in Qemu.
> > 
> > Even better, change guest pmd to only use indirect
> > descriptors when this makes sense (e.g. sufficiently
> > large packets).
> With the micro-benchmark, the degradation is quite constant whatever
> the packet size.
> 
> For PVP, I could not test with larger packets than 64 bytes, as I don't
> have a 40G interface,

Don't 64-byte packets fit in a single slot anyway?
Why would there be an effect with that?

> and line rate with 10G is reached rapidly.

Right, but focus on packet loss; you can have that at any rate.

> 
> > I would be very interested to know when does it make
> > sense.
> > 
> > The feature is there, it's up to guest whether to
> > use it.
> Do you mean the PMD should detect dynamically whether using indirect,
> or having an option at device init time to enable or not the feature?

The guest PMD should not use indirect descriptors where they slow
things down.

  
Maxime Coquelin Sept. 23, 2016, 6:16 p.m. UTC | #4
On 09/23/2016 08:06 PM, Michael S. Tsirkin wrote:
> On Fri, Sep 23, 2016 at 08:02:27PM +0200, Maxime Coquelin wrote:
>>
>>
>> On 09/23/2016 05:49 PM, Michael S. Tsirkin wrote:
>>> On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:
>>>> Indirect descriptors are usually supported by virtio-net devices,
>>>> allowing to dispatch a larger number of requests.
>>>>
>>>> When the virtio device sends a packet using indirect descriptors,
>>>> only one slot is used in the ring, even for large packets.
>>>>
>>>> The main effect is to improve the 0% packet loss benchmark.
>>>> A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
>>>> (fwd io for host, macswap for VM) on DUT shows a +50% gain for
>>>> zero loss.
>>>>
>>>> On the downside, micro-benchmark using testpmd txonly in VM and
>>>> rxonly on host shows a loss between 1 and 4%.i But depending on
>>>> the needs, feature can be disabled at VM boot time by passing
>>>> indirect_desc=off argument to vhost-user device in Qemu.
>>>
>>> Even better, change guest pmd to only use indirect
>>> descriptors when this makes sense (e.g. sufficiently
>>> large packets).
>> With the micro-benchmark, the degradation is quite constant whatever
>> the packet size.
>>
>> For PVP, I could not test with larger packets than 64 bytes, as I don't
>> have a 40G interface,
>
> Don't 64 byte packets fit in a single slot anyway?
No, indirect is used. I didn't check in detail, but I think this is
because there is no headroom reserved in the mbuf.

This is the condition to meet to fit in a single slot:
/* optimize ring usage */
if (vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) &&
	rte_mbuf_refcnt_read(txm) == 1 &&
	RTE_MBUF_DIRECT(txm) &&
	txm->nb_segs == 1 &&
	rte_pktmbuf_headroom(txm) >= hdr_size &&
	rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
		__alignof__(struct virtio_net_hdr_mrg_rxbuf)))
     can_push = 1;
else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
	txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
     use_indirect = 1;

I will check in more detail next week.

  
Michael S. Tsirkin Sept. 23, 2016, 6:22 p.m. UTC | #5
On Fri, Sep 23, 2016 at 08:16:09PM +0200, Maxime Coquelin wrote:
> 
> 
> On 09/23/2016 08:06 PM, Michael S. Tsirkin wrote:
> > On Fri, Sep 23, 2016 at 08:02:27PM +0200, Maxime Coquelin wrote:
> > > 
> > > 
> > > On 09/23/2016 05:49 PM, Michael S. Tsirkin wrote:
> > > > On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:
> > > > > Indirect descriptors are usually supported by virtio-net devices,
> > > > > allowing to dispatch a larger number of requests.
> > > > > 
> > > > > When the virtio device sends a packet using indirect descriptors,
> > > > > only one slot is used in the ring, even for large packets.
> > > > > 
> > > > > The main effect is to improve the 0% packet loss benchmark.
> > > > > A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
> > > > > (fwd io for host, macswap for VM) on DUT shows a +50% gain for
> > > > > zero loss.
> > > > > 
> > > > > On the downside, micro-benchmark using testpmd txonly in VM and
> > > > > rxonly on host shows a loss between 1 and 4%.i But depending on
> > > > > the needs, feature can be disabled at VM boot time by passing
> > > > > indirect_desc=off argument to vhost-user device in Qemu.
> > > > 
> > > > Even better, change guest pmd to only use indirect
> > > > descriptors when this makes sense (e.g. sufficiently
> > > > large packets).
> > > With the micro-benchmark, the degradation is quite constant whatever
> > > the packet size.
> > > 
> > > For PVP, I could not test with larger packets than 64 bytes, as I don't
> > > have a 40G interface,
> > 
> > Don't 64 byte packets fit in a single slot anyway?
> No, indirect is used. I didn't checked in details, but I think this is
> because there is no headroom reserved in the mbuf.
> 
> This is the condition to meet to fit in a single slot:
> /* optimize ring usage */
> if (vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) &&
> 	rte_mbuf_refcnt_read(txm) == 1 &&
> 	RTE_MBUF_DIRECT(txm) &&
> 	txm->nb_segs == 1 &&
> 	rte_pktmbuf_headroom(txm) >= hdr_size &&
> 	rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
> 		__alignof__(struct virtio_net_hdr_mrg_rxbuf)))
>     can_push = 1;
> else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
> 	txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
>     use_indirect = 1;
> 
> I will check more in details next week.

Two thoughts, then:
1. So can some headroom be reserved?
2. How about using indirect with 3 s/g entries,
   but direct with 2 and down?
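
A sketch of what that heuristic could look like on top of the PMD
snippet quoted above (the helper name and the threshold of 3 are
assumptions, not part of the patch):

#include <rte_mbuf.h>

/* Hypothetical policy helper: use an indirect table only when the
 * packet spans enough segments that collapsing them into a single
 * ring slot actually saves entries. */
static inline int
should_use_indirect(const struct rte_mbuf *txm)
{
	return txm->nb_segs >= 3 && txm->nb_segs < VIRTIO_MAX_TX_INDIRECT;
}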


  
Stephen Hemminger Sept. 23, 2016, 8:24 p.m. UTC | #6
On Fri, 23 Sep 2016 21:22:23 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Fri, Sep 23, 2016 at 08:16:09PM +0200, Maxime Coquelin wrote:
> > 
> > 
> > On 09/23/2016 08:06 PM, Michael S. Tsirkin wrote:  
> > > On Fri, Sep 23, 2016 at 08:02:27PM +0200, Maxime Coquelin wrote:  
> > > > 
> > > > 
> > > > On 09/23/2016 05:49 PM, Michael S. Tsirkin wrote:  
> > > > > On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:  
> > > > > > Indirect descriptors are usually supported by virtio-net devices,
> > > > > > allowing to dispatch a larger number of requests.
> > > > > > 
> > > > > > When the virtio device sends a packet using indirect descriptors,
> > > > > > only one slot is used in the ring, even for large packets.
> > > > > > 
> > > > > > The main effect is to improve the 0% packet loss benchmark.
> > > > > > A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
> > > > > > (fwd io for host, macswap for VM) on DUT shows a +50% gain for
> > > > > > zero loss.
> > > > > > 
> > > > > > On the downside, micro-benchmark using testpmd txonly in VM and
> > > > > > rxonly on host shows a loss between 1 and 4%.i But depending on
> > > > > > the needs, feature can be disabled at VM boot time by passing
> > > > > > indirect_desc=off argument to vhost-user device in Qemu.  
> > > > > 
> > > > > Even better, change guest pmd to only use indirect
> > > > > descriptors when this makes sense (e.g. sufficiently
> > > > > large packets).  
> > > > With the micro-benchmark, the degradation is quite constant whatever
> > > > the packet size.
> > > > 
> > > > For PVP, I could not test with larger packets than 64 bytes, as I don't
> > > > have a 40G interface,  
> > > 
> > > Don't 64 byte packets fit in a single slot anyway?  
> > No, indirect is used. I didn't checked in details, but I think this is
> > because there is no headroom reserved in the mbuf.
> > 
> > This is the condition to meet to fit in a single slot:
> > /* optimize ring usage */
> > if (vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) &&
> > 	rte_mbuf_refcnt_read(txm) == 1 &&
> > 	RTE_MBUF_DIRECT(txm) &&
> > 	txm->nb_segs == 1 &&
> > 	rte_pktmbuf_headroom(txm) >= hdr_size &&
> > 	rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
> > 		__alignof__(struct virtio_net_hdr_mrg_rxbuf)))
> >     can_push = 1;
> > else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
> > 	txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
> >     use_indirect = 1;
> > 
> > I will check more in details next week.  
> 
> Two thoughts then
> 1. so can some headroom be reserved?
> 2. how about using indirect with 3 s/g entries,
>    but direct with 2 and down?

The default mbuf allocator does keep headroom available. Sounds like a
test bug.
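
For reference, rte_pktmbuf_alloc() resets the mbuf so that
RTE_PKTMBUF_HEADROOM bytes (128 by default) are reserved ahead of the
data. A quick check along these lines (a sketch) would show whether
the test setup really lacks headroom:

struct rte_mbuf *m = rte_pktmbuf_alloc(mbuf_pool);
if (m != NULL)
	/* expected to print 128 with the default configuration */
	printf("headroom: %u\n", rte_pktmbuf_headroom(m));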
  
Yuanhan Liu Sept. 26, 2016, 3:03 a.m. UTC | #7
On Fri, Sep 23, 2016 at 01:24:14PM -0700, Stephen Hemminger wrote:
> On Fri, 23 Sep 2016 21:22:23 +0300
> "Michael S. Tsirkin" <mst@redhat.com> wrote:
> 
> > On Fri, Sep 23, 2016 at 08:16:09PM +0200, Maxime Coquelin wrote:
> > > 
> > > 
> > > On 09/23/2016 08:06 PM, Michael S. Tsirkin wrote:  
> > > > On Fri, Sep 23, 2016 at 08:02:27PM +0200, Maxime Coquelin wrote:  
> > > > > 
> > > > > 
> > > > > On 09/23/2016 05:49 PM, Michael S. Tsirkin wrote:  
> > > > > > On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:  
> > > > > > > Indirect descriptors are usually supported by virtio-net devices,
> > > > > > > allowing to dispatch a larger number of requests.
> > > > > > > 
> > > > > > > When the virtio device sends a packet using indirect descriptors,
> > > > > > > only one slot is used in the ring, even for large packets.
> > > > > > > 
> > > > > > > The main effect is to improve the 0% packet loss benchmark.
> > > > > > > A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
> > > > > > > (fwd io for host, macswap for VM) on DUT shows a +50% gain for
> > > > > > > zero loss.
> > > > > > > 
> > > > > > > On the downside, micro-benchmark using testpmd txonly in VM and
> > > > > > > rxonly on host shows a loss between 1 and 4%.i But depending on
> > > > > > > the needs, feature can be disabled at VM boot time by passing
> > > > > > > indirect_desc=off argument to vhost-user device in Qemu.  
> > > > > > 
> > > > > > Even better, change guest pmd to only use indirect
> > > > > > descriptors when this makes sense (e.g. sufficiently
> > > > > > large packets).  
> > > > > With the micro-benchmark, the degradation is quite constant whatever
> > > > > the packet size.
> > > > > 
> > > > > For PVP, I could not test with larger packets than 64 bytes, as I don't
> > > > > have a 40G interface,  
> > > > 
> > > > Don't 64 byte packets fit in a single slot anyway?  
> > > No, indirect is used. I didn't checked in details, but I think this is
> > > because there is no headroom reserved in the mbuf.
> > > 
> > > This is the condition to meet to fit in a single slot:
> > > /* optimize ring usage */
> > > if (vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) &&
> > > 	rte_mbuf_refcnt_read(txm) == 1 &&
> > > 	RTE_MBUF_DIRECT(txm) &&
> > > 	txm->nb_segs == 1 &&
> > > 	rte_pktmbuf_headroom(txm) >= hdr_size &&
> > > 	rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
> > > 		__alignof__(struct virtio_net_hdr_mrg_rxbuf)))
> > >     can_push = 1;
> > > else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
> > > 	txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
> > >     use_indirect = 1;
> > > 
> > > I will check more in details next week.  
> > 
> > Two thoughts then
> > 1. so can some headroom be reserved?
> > 2. how about using indirect with 3 s/g entries,
> >    but direct with 2 and down?
> 
> The default mbuf allocator does keep headroom available. Sounds like a
> test bug.

That's because we don't have VIRTIO_F_ANY_LAYOUT set, as Stephen noted
in the v2 comments.

Since DPDK vhost has actually supported VIRTIO_F_ANY_LAYOUT for a
while, I'd like to add it to the features list
(VHOST_SUPPORTED_FEATURES).

I'll send a patch shortly.
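
That change would amount to one more bit in the feature mask touched
by the patch above (a sketch, not the actual follow-up patch):

#define VHOST_SUPPORTED_FEATURES ( /* ...existing bits... */ \
				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
				(1ULL << VIRTIO_F_ANY_LAYOUT)     | \
				(1ULL << VIRTIO_RING_F_INDIRECT_DESC))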

	--yliu
  
Michael S. Tsirkin Sept. 26, 2016, 12:25 p.m. UTC | #8
On Mon, Sep 26, 2016 at 11:03:54AM +0800, Yuanhan Liu wrote:
> On Fri, Sep 23, 2016 at 01:24:14PM -0700, Stephen Hemminger wrote:
> > On Fri, 23 Sep 2016 21:22:23 +0300
> > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > 
> > > On Fri, Sep 23, 2016 at 08:16:09PM +0200, Maxime Coquelin wrote:
> > > > 
> > > > 
> > > > On 09/23/2016 08:06 PM, Michael S. Tsirkin wrote:  
> > > > > On Fri, Sep 23, 2016 at 08:02:27PM +0200, Maxime Coquelin wrote:  
> > > > > > 
> > > > > > 
> > > > > > On 09/23/2016 05:49 PM, Michael S. Tsirkin wrote:  
> > > > > > > On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:  
> > > > > > > > Indirect descriptors are usually supported by virtio-net devices,
> > > > > > > > allowing to dispatch a larger number of requests.
> > > > > > > > 
> > > > > > > > When the virtio device sends a packet using indirect descriptors,
> > > > > > > > only one slot is used in the ring, even for large packets.
> > > > > > > > 
> > > > > > > > The main effect is to improve the 0% packet loss benchmark.
> > > > > > > > A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
> > > > > > > > (fwd io for host, macswap for VM) on DUT shows a +50% gain for
> > > > > > > > zero loss.
> > > > > > > > 
> > > > > > > > On the downside, micro-benchmark using testpmd txonly in VM and
> > > > > > > > rxonly on host shows a loss between 1 and 4%.i But depending on
> > > > > > > > the needs, feature can be disabled at VM boot time by passing
> > > > > > > > indirect_desc=off argument to vhost-user device in Qemu.  
> > > > > > > 
> > > > > > > Even better, change guest pmd to only use indirect
> > > > > > > descriptors when this makes sense (e.g. sufficiently
> > > > > > > large packets).  
> > > > > > With the micro-benchmark, the degradation is quite constant whatever
> > > > > > the packet size.
> > > > > > 
> > > > > > For PVP, I could not test with larger packets than 64 bytes, as I don't
> > > > > > have a 40G interface,  
> > > > > 
> > > > > Don't 64 byte packets fit in a single slot anyway?  
> > > > No, indirect is used. I didn't checked in details, but I think this is
> > > > because there is no headroom reserved in the mbuf.
> > > > 
> > > > This is the condition to meet to fit in a single slot:
> > > > /* optimize ring usage */
> > > > if (vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) &&
> > > > 	rte_mbuf_refcnt_read(txm) == 1 &&
> > > > 	RTE_MBUF_DIRECT(txm) &&
> > > > 	txm->nb_segs == 1 &&
> > > > 	rte_pktmbuf_headroom(txm) >= hdr_size &&
> > > > 	rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
> > > > 		__alignof__(struct virtio_net_hdr_mrg_rxbuf)))
> > > >     can_push = 1;
> > > > else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
> > > > 	txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
> > > >     use_indirect = 1;
> > > > 
> > > > I will check more in details next week.  
> > > 
> > > Two thoughts then
> > > 1. so can some headroom be reserved?
> > > 2. how about using indirect with 3 s/g entries,
> > >    but direct with 2 and down?
> > 
> > The default mbuf allocator does keep headroom available. Sounds like a
> > test bug.
> 
> That's because we don't have VIRTIO_F_ANY_LAYOUT set, as Stephen claimed
> in v2's comment.
> 
> Since DPDK vhost actually supports VIRTIO_F_ANY_LAYOUT for a while, I'd
> like to add it in the features list (VHOST_SUPPORTED_FEATURES).
> 
> Will drop a patch shortly.
> 
> 	--yliu

If VERSION_1 is set, then ANY_LAYOUT is implied without being set
explicitly.
  
Yuanhan Liu Sept. 26, 2016, 1:04 p.m. UTC | #9
On Mon, Sep 26, 2016 at 03:25:35PM +0300, Michael S. Tsirkin wrote:
> On Mon, Sep 26, 2016 at 11:03:54AM +0800, Yuanhan Liu wrote:
> > On Fri, Sep 23, 2016 at 01:24:14PM -0700, Stephen Hemminger wrote:
> > > On Fri, 23 Sep 2016 21:22:23 +0300
> > > "Michael S. Tsirkin" <mst@redhat.com> wrote:
> > > 
> > > > On Fri, Sep 23, 2016 at 08:16:09PM +0200, Maxime Coquelin wrote:
> > > > > 
> > > > > 
> > > > > On 09/23/2016 08:06 PM, Michael S. Tsirkin wrote:  
> > > > > > On Fri, Sep 23, 2016 at 08:02:27PM +0200, Maxime Coquelin wrote:  
> > > > > > > 
> > > > > > > 
> > > > > > > On 09/23/2016 05:49 PM, Michael S. Tsirkin wrote:  
> > > > > > > > On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:  
> > > > > > > > > Indirect descriptors are usually supported by virtio-net devices,
> > > > > > > > > allowing to dispatch a larger number of requests.
> > > > > > > > > 
> > > > > > > > > When the virtio device sends a packet using indirect descriptors,
> > > > > > > > > only one slot is used in the ring, even for large packets.
> > > > > > > > > 
> > > > > > > > > The main effect is to improve the 0% packet loss benchmark.
> > > > > > > > > A PVP benchmark using Moongen (64 bytes) on the TE, and testpmd
> > > > > > > > > (fwd io for host, macswap for VM) on DUT shows a +50% gain for
> > > > > > > > > zero loss.
> > > > > > > > > 
> > > > > > > > > On the downside, micro-benchmark using testpmd txonly in VM and
> > > > > > > > > rxonly on host shows a loss between 1 and 4%.i But depending on
> > > > > > > > > the needs, feature can be disabled at VM boot time by passing
> > > > > > > > > indirect_desc=off argument to vhost-user device in Qemu.  
> > > > > > > > 
> > > > > > > > Even better, change guest pmd to only use indirect
> > > > > > > > descriptors when this makes sense (e.g. sufficiently
> > > > > > > > large packets).  
> > > > > > > With the micro-benchmark, the degradation is quite constant whatever
> > > > > > > the packet size.
> > > > > > > 
> > > > > > > For PVP, I could not test with larger packets than 64 bytes, as I don't
> > > > > > > have a 40G interface,  
> > > > > > 
> > > > > > Don't 64 byte packets fit in a single slot anyway?  
> > > > > No, indirect is used. I didn't checked in details, but I think this is
> > > > > because there is no headroom reserved in the mbuf.
> > > > > 
> > > > > This is the condition to meet to fit in a single slot:
> > > > > /* optimize ring usage */
> > > > > if (vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) &&
> > > > > 	rte_mbuf_refcnt_read(txm) == 1 &&
> > > > > 	RTE_MBUF_DIRECT(txm) &&
> > > > > 	txm->nb_segs == 1 &&
> > > > > 	rte_pktmbuf_headroom(txm) >= hdr_size &&
> > > > > 	rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
> > > > > 		__alignof__(struct virtio_net_hdr_mrg_rxbuf)))
> > > > >     can_push = 1;
> > > > > else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
> > > > > 	txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
> > > > >     use_indirect = 1;
> > > > > 
> > > > > I will check more in details next week.  
> > > > 
> > > > Two thoughts then
> > > > 1. so can some headroom be reserved?
> > > > 2. how about using indirect with 3 s/g entries,
> > > >    but direct with 2 and down?
> > > 
> > > The default mbuf allocator does keep headroom available. Sounds like a
> > > test bug.
> > 
> > That's because we don't have VIRTIO_F_ANY_LAYOUT set, as Stephen claimed
> > in v2's comment.
> > 
> > Since DPDK vhost actually supports VIRTIO_F_ANY_LAYOUT for a while, I'd
> > like to add it in the features list (VHOST_SUPPORTED_FEATURES).
> > 
> > Will drop a patch shortly.
> > 
> > 	--yliu
> 
> If VERSION_1 is set then this implies ANY_LAYOUT without it being set.

Yes, I saw this note from you in another email. I kept it as it is,
for two reasons (maybe I should have stated them earlier):

- We have to return all the features we support to the guest.

  We don't know whether the guest is a modern or a legacy device. That
  means we should claim we support both: VERSION_1 and ANY_LAYOUT.

  If the guest is a legacy device and we only set VERSION_1 (the
  current case), ANY_LAYOUT will never be negotiated.

- I'm following the way the Linux kernel does it: it also sets both
  features.

Maybe we could unset ANY_LAYOUT when VERSION_1 is _negotiated_?
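
In code, that idea could look roughly like this in the negotiation
path (a sketch; the variable name and hook point are assumptions):

/* A modern (VERSION_1) guest gets any-layout semantics implicitly,
 * so the legacy bit can be dropped once features are negotiated. */
if (negotiated_features & (1ULL << VIRTIO_F_VERSION_1))
	negotiated_features &= ~(1ULL << VIRTIO_F_ANY_LAYOUT);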

	--yliu
  
Yuanhan Liu Sept. 27, 2016, 4:15 a.m. UTC | #10
On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:
> +		if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
> +			desc = (struct vring_desc *)gpa_to_vva(dev,

As mentioned before, this would break 32-bit OS builds. It should be

	(struct vring_desc *)(uintptr_t)gpa_to_vva(...);

I meant to fix this while applying, but then I realized you haven't
updated the release notes (sorry for the late notice).

So would you mind sending a new version, with the fix and the release
note update? FYI, the release notes are at "doc/guides/rel_notes/"
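
For clarity, the corrected hunk with the suggested cast would read
(a sketch of the fix described above):

if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
	/* gpa_to_vva() returns a 64-bit value; casting through
	 * uintptr_t keeps the conversion to a pointer valid on
	 * 32-bit builds as well */
	desc = (struct vring_desc *)(uintptr_t)gpa_to_vva(dev,
			vq->desc[desc_indexes[i]].addr);
	if (unlikely(!desc))
		break;
	/* rest of the hunk unchanged */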

	--yliu
  
Maxime Coquelin Sept. 27, 2016, 7:25 a.m. UTC | #11
On 09/27/2016 06:15 AM, Yuanhan Liu wrote:
> On Fri, Sep 23, 2016 at 10:28:23AM +0200, Maxime Coquelin wrote:
>> +		if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
>> +			desc = (struct vring_desc *)gpa_to_vva(dev,
>
> As mentioned before, this would break 32 bit OS build. It should be
>
> 	(struct vring_desc *)(uintptr_t)gpa_to_vva(...);
>
> I meant to fix this while apply, later I just realized you haven't
> updated the release note (sorry for the late notice).
>
> So would you mind send a new version, with the fix and release note
> update? FYI, the release note is at "doc/guides/rel_notes/"
Not a problem, doing it now.

Thanks,
Maxime
  

Patch

Changes since v2:
=================
 - Revert back to not checking feature flag to be aligned with
kernel implementation
 - Ensure we don't have nested indirect descriptors
 - Ensure the indirect desc address is valid, to protect against
malicious guests

Changes since RFC:
=================
 - Enrich commit message with figures
 - Rebased on top of dpdk-next-virtio's master
 - Add feature check to ensure we don't receive an indirect desc
if not supported by the virtio driver

 lib/librte_vhost/vhost.c      |  3 ++-
 lib/librte_vhost/virtio_net.c | 41 +++++++++++++++++++++++++++++++----------
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 46095c3..30bb0ce 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -65,7 +65,8 @@ 
 				(1ULL << VIRTIO_NET_F_CSUM)    | \
 				(1ULL << VIRTIO_NET_F_GUEST_CSUM) | \
 				(1ULL << VIRTIO_NET_F_GUEST_TSO4) | \
-				(1ULL << VIRTIO_NET_F_GUEST_TSO6))
+				(1ULL << VIRTIO_NET_F_GUEST_TSO6) | \
+				(1ULL << VIRTIO_RING_F_INDIRECT_DESC))
 
 uint64_t VHOST_FEATURES = VHOST_SUPPORTED_FEATURES;
 
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 8a151af..2e0a587 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -679,8 +679,8 @@  make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
 }
 
 static inline int __attribute__((always_inline))
-copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
-		  struct rte_mbuf *m, uint16_t desc_idx,
+copy_desc_to_mbuf(struct virtio_net *dev, struct vring_desc *descs,
+		  uint16_t max_desc, struct rte_mbuf *m, uint16_t desc_idx,
 		  struct rte_mempool *mbuf_pool)
 {
 	struct vring_desc *desc;
@@ -693,8 +693,9 @@  copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	/* A counter to avoid desc dead loop chain */
 	uint32_t nr_desc = 1;
 
-	desc = &vq->desc[desc_idx];
-	if (unlikely(desc->len < dev->vhost_hlen))
+	desc = &descs[desc_idx];
+	if (unlikely((desc->len < dev->vhost_hlen)) ||
+			(desc->flags & VRING_DESC_F_INDIRECT))
 		return -1;
 
 	desc_addr = gpa_to_vva(dev, desc->addr);
@@ -711,7 +712,9 @@  copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	 */
 	if (likely((desc->len == dev->vhost_hlen) &&
 		   (desc->flags & VRING_DESC_F_NEXT) != 0)) {
-		desc = &vq->desc[desc->next];
+		desc = &descs[desc->next];
+		if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
+			return -1;
 
 		desc_addr = gpa_to_vva(dev, desc->addr);
 		if (unlikely(!desc_addr))
@@ -747,10 +750,12 @@  copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			if ((desc->flags & VRING_DESC_F_NEXT) == 0)
 				break;
 
-			if (unlikely(desc->next >= vq->size ||
-				     ++nr_desc > vq->size))
+			if (unlikely(desc->next >= max_desc ||
+				     ++nr_desc > max_desc))
+				return -1;
+			desc = &descs[desc->next];
+			if (unlikely(desc->flags & VRING_DESC_F_INDIRECT))
 				return -1;
-			desc = &vq->desc[desc->next];
 
 			desc_addr = gpa_to_vva(dev, desc->addr);
 			if (unlikely(!desc_addr))
@@ -878,19 +883,35 @@  rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 	/* Prefetch descriptor index. */
 	rte_prefetch0(&vq->desc[desc_indexes[0]]);
 	for (i = 0; i < count; i++) {
+		struct vring_desc *desc;
+		uint16_t sz, idx;
 		int err;
 
 		if (likely(i + 1 < count))
 			rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
 
+		if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
+			desc = (struct vring_desc *)gpa_to_vva(dev,
+					vq->desc[desc_indexes[i]].addr);
+			if (unlikely(!desc))
+				break;
+
+			rte_prefetch0(desc);
+			sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
+			idx = 0;
+		} else {
+			desc = vq->desc;
+			sz = vq->size;
+			idx = desc_indexes[i];
+		}
+
 		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
 		if (unlikely(pkts[i] == NULL)) {
 			RTE_LOG(ERR, VHOST_DATA,
 				"Failed to allocate memory for mbuf.\n");
 			break;
 		}
-		err = copy_desc_to_mbuf(dev, vq, pkts[i], desc_indexes[i],
-					mbuf_pool);
+		err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
 		if (unlikely(err)) {
 			rte_pktmbuf_free(pkts[i]);
 			break;