[dpdk-dev,1/3] vhost: pre update used ring for Tx and Rx

Message ID 1462236378-7604-2-git-send-email-yuanhan.liu@linux.intel.com (mailing list archive)
State Accepted, archived
Delegated to: Yuanhan Liu

Commit Message

Yuanhan Liu May 3, 2016, 12:46 a.m. UTC
  Pre-update the used ring in batch for Tx and Rx at the stage where
all avail desc indexes are fetched. This reduces some cache misses
and hence increases performance a bit.

Pre-updating is feasible because the guest driver will not start
processing those entries as long as we don't update "used->idx".
(I'm not 100% certain I haven't missed anything, though.)

Cc: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
---
 lib/librte_vhost/vhost_rxtx.c | 58 +++++++++++++++++++++----------------------
 1 file changed, 28 insertions(+), 30 deletions(-)
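
As a minimal sketch of the idea (not part of the patch; simplified from
the diff, with error handling and the mbuf copy elided), the enqueue-side
pattern looks like this:

for (i = 0; i < count; i++) {
	uint16_t idx = (res_start_idx + i) & (vq->size - 1);

	/* Read the avail entry and fill the matching used entry in the
	 * same pass, while both ring slots are being touched anyway. */
	desc_indexes[i] = vq->avail->ring[idx];
	vq->used->ring[idx].id  = desc_indexes[i];
	vq->used->ring[idx].len = pkts[i]->pkt_len + dev->vhost_hlen;
}

/* ... copy each mbuf into its descriptor chain ... */

/* The guest only consumes used entries below used->idx, so the
 * pre-filled entries stay invisible until the index is published
 * after the write barrier. */
rte_smp_wmb();
vq->used->idx += count;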
  

Comments

Huawei Xie June 1, 2016, 6:40 a.m. UTC | #1
On 5/3/2016 8:42 AM, Yuanhan Liu wrote:
> Pre-update the used ring in batch for Tx and Rx at the stage where
> all avail desc indexes are fetched. This reduces some cache misses
> and hence increases performance a bit.
>
> Pre-updating is feasible because the guest driver will not start
> processing those entries as long as we don't update "used->idx".
> (I'm not 100% certain I haven't missed anything, though.)
>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> ---
>  lib/librte_vhost/vhost_rxtx.c | 58 +++++++++++++++++++++----------------------
>  1 file changed, 28 insertions(+), 30 deletions(-)
>
> diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> index c9cd1c5..2c3b810 100644
> --- a/lib/librte_vhost/vhost_rxtx.c
> +++ b/lib/librte_vhost/vhost_rxtx.c
> @@ -137,7 +137,7 @@ copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
>  
>  static inline int __attribute__((always_inline))
>  copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
> -		  struct rte_mbuf *m, uint16_t desc_idx, uint32_t *copied)
> +		  struct rte_mbuf *m, uint16_t desc_idx)
>  {
>  	uint32_t desc_avail, desc_offset;
>  	uint32_t mbuf_avail, mbuf_offset;
> @@ -161,7 +161,6 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	desc_offset = dev->vhost_hlen;
>  	desc_avail  = desc->len - dev->vhost_hlen;
>  
> -	*copied = rte_pktmbuf_pkt_len(m);
>  	mbuf_avail  = rte_pktmbuf_data_len(m);
>  	mbuf_offset = 0;
>  	while (mbuf_avail != 0 || m->next != NULL) {
> @@ -262,6 +261,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
>  	struct vhost_virtqueue *vq;
>  	uint16_t res_start_idx, res_end_idx;
>  	uint16_t desc_indexes[MAX_PKT_BURST];
> +	uint16_t used_idx;
>  	uint32_t i;
>  
>  	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
> @@ -285,27 +285,29 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
>  	/* Retrieve all of the desc indexes first to avoid caching issues. */
>  	rte_prefetch0(&vq->avail->ring[res_start_idx & (vq->size - 1)]);
>  	for (i = 0; i < count; i++) {
> -		desc_indexes[i] = vq->avail->ring[(res_start_idx + i) &
> -						  (vq->size - 1)];
> +		used_idx = (res_start_idx + i) & (vq->size - 1);
> +		desc_indexes[i] = vq->avail->ring[used_idx];
> +		vq->used->ring[used_idx].id = desc_indexes[i];
> +		vq->used->ring[used_idx].len = pkts[i]->pkt_len +
> +					       dev->vhost_hlen;
> +		vhost_log_used_vring(dev, vq,
> +			offsetof(struct vring_used, ring[used_idx]),
> +			sizeof(vq->used->ring[used_idx]));
>  	}
>  
>  	rte_prefetch0(&vq->desc[desc_indexes[0]]);
>  	for (i = 0; i < count; i++) {
>  		uint16_t desc_idx = desc_indexes[i];
> -		uint16_t used_idx = (res_start_idx + i) & (vq->size - 1);
> -		uint32_t copied;
>  		int err;
>  
> -		err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx, &copied);
> -
> -		vq->used->ring[used_idx].id = desc_idx;
> -		if (unlikely(err))
> +		err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
> +		if (unlikely(err)) {
> +			used_idx = (res_start_idx + i) & (vq->size - 1);
>  			vq->used->ring[used_idx].len = dev->vhost_hlen;
> -		else
> -			vq->used->ring[used_idx].len = copied + dev->vhost_hlen;
> -		vhost_log_used_vring(dev, vq,
> -			offsetof(struct vring_used, ring[used_idx]),
> -			sizeof(vq->used->ring[used_idx]));
> +			vhost_log_used_vring(dev, vq,
> +				offsetof(struct vring_used, ring[used_idx]),
> +				sizeof(vq->used->ring[used_idx]));
> +		}
>  
>  		if (i + 1 < count)
>  			rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
> @@ -879,6 +881,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>  	/* Prefetch available ring to retrieve head indexes. */
>  	used_idx = vq->last_used_idx & (vq->size - 1);
>  	rte_prefetch0(&vq->avail->ring[used_idx]);
> +	rte_prefetch0(&vq->used->ring[used_idx]);
>  
>  	count = RTE_MIN(count, MAX_PKT_BURST);
>  	count = RTE_MIN(count, free_entries);
> @@ -887,22 +890,23 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>  
>  	/* Retrieve all of the head indexes first to avoid caching issues. */
>  	for (i = 0; i < count; i++) {
> -		desc_indexes[i] = vq->avail->ring[(vq->last_used_idx + i) &
> -					(vq->size - 1)];
> +		used_idx = (vq->last_used_idx + i) & (vq->size - 1);
> +		desc_indexes[i] = vq->avail->ring[used_idx];
> +
> +		vq->used->ring[used_idx].id  = desc_indexes[i];
> +		vq->used->ring[used_idx].len = 0;
> +		vhost_log_used_vring(dev, vq,
> +				offsetof(struct vring_used, ring[used_idx]),
> +				sizeof(vq->used->ring[used_idx]));
>  	}
>  
>  	/* Prefetch descriptor index. */
>  	rte_prefetch0(&vq->desc[desc_indexes[0]]);
> -	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
> -
>  	for (i = 0; i < count; i++) {
>  		int err;
>  
> -		if (likely(i + 1 < count)) {
> +		if (likely(i + 1 < count))
>  			rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
> -			rte_prefetch0(&vq->used->ring[(used_idx + 1) &
> -						      (vq->size - 1)]);
> -		}
>  
>  		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
>  		if (unlikely(pkts[i] == NULL)) {
> @@ -916,18 +920,12 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>  			rte_pktmbuf_free(pkts[i]);
>  			break;
>  		}
> -
> -		used_idx = vq->last_used_idx++ & (vq->size - 1);
> -		vq->used->ring[used_idx].id  = desc_indexes[i];
> -		vq->used->ring[used_idx].len = 0;
> -		vhost_log_used_vring(dev, vq,
> -				offsetof(struct vring_used, ring[used_idx]),
> -				sizeof(vq->used->ring[used_idx]));
>  	}

Had tried post-updating the used ring in batch, but I forget the perf change.

One optimization would be on vhost_log_used_vring.
I have two ideas:
a) On the QEMU side, always assume the used ring will be changed, so
that we don't need to log the used ring in vhost.

Michael: is that feasible in QEMU? Comments on this?

b) We could always mark the whole used ring as modified, rather than
logging each entry individually.
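
As a sketch, idea (b) could be as simple as marking the whole ring dirty
once per burst, reusing the vhost_log_used_vring() call from the patch
(the helper name below is made up for illustration):

static inline void
vhost_log_used_ring_all(struct virtio_net *dev, struct vhost_virtqueue *vq)
{
	/* One log call covering every used element, instead of one
	 * call per updated entry. */
	vhost_log_used_vring(dev, vq,
			offsetof(struct vring_used, ring[0]),
			vq->size * sizeof(vq->used->ring[0]));
}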


>  
>  	rte_smp_wmb();
>  	rte_smp_rmb();
>  	vq->used->idx += i;
> +	vq->last_used_idx += i;
>  	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
>  			sizeof(vq->used->idx));
>
  
Yuanhan Liu June 1, 2016, 6:55 a.m. UTC | #2
On Wed, Jun 01, 2016 at 06:40:41AM +0000, Xie, Huawei wrote:
> >  	/* Retrieve all of the head indexes first to avoid caching issues. */
> >  	for (i = 0; i < count; i++) {
> > -		desc_indexes[i] = vq->avail->ring[(vq->last_used_idx + i) &
> > -					(vq->size - 1)];
> > +		used_idx = (vq->last_used_idx + i) & (vq->size - 1);
> > +		desc_indexes[i] = vq->avail->ring[used_idx];
> > +
> > +		vq->used->ring[used_idx].id  = desc_indexes[i];
> > +		vq->used->ring[used_idx].len = 0;
> > +		vhost_log_used_vring(dev, vq,
> > +				offsetof(struct vring_used, ring[used_idx]),
> > +				sizeof(vq->used->ring[used_idx]));
> >  	}
> >  
> >  	/* Prefetch descriptor index. */
> >  	rte_prefetch0(&vq->desc[desc_indexes[0]]);
> > -	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
> > -
> >  	for (i = 0; i < count; i++) {
> >  		int err;
> >  
> > -		if (likely(i + 1 < count)) {
> > +		if (likely(i + 1 < count))
> >  			rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
> > -			rte_prefetch0(&vq->used->ring[(used_idx + 1) &
> > -						      (vq->size - 1)]);
> > -		}
> >  
> >  		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
> >  		if (unlikely(pkts[i] == NULL)) {
> > @@ -916,18 +920,12 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
> >  			rte_pktmbuf_free(pkts[i]);
> >  			break;
> >  		}
> > -
> > -		used_idx = vq->last_used_idx++ & (vq->size - 1);
> > -		vq->used->ring[used_idx].id  = desc_indexes[i];
> > -		vq->used->ring[used_idx].len = 0;
> > -		vhost_log_used_vring(dev, vq,
> > -				offsetof(struct vring_used, ring[used_idx]),
> > -				sizeof(vq->used->ring[used_idx]));
> >  	}
> 
> Had tried post-updating the used ring in batch, but I forget the perf change.

I would assume pre-updating gives a better performance gain, as we are
fiddling with the avail and used rings together, which would be more
cache-friendly.

> One optimization would be on vhost_log_used_vring.
> I have two ideas:
> a) On the QEMU side, always assume the used ring will be changed, so
> that we don't need to log the used ring in vhost.
>
> Michael: is that feasible in QEMU? Comments on this?
>
> b) We could always mark the whole used ring as modified, rather than
> logging each entry individually.

I doubt it's worthwhile. One fact is that vhost_log_used_vring is
a no-op most of the time: it only takes action during the short
window of live migration.

And FYI, I even tried removing all the vhost_log_xxx calls; it showed
no performance boost at all. Therefore, it's not a factor that will
impact performance.

	--yliu
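
For reference, a sketch of the kind of early-return guard that makes the
logging calls effectively free outside of live migration; the feature bit
and field names are assumptions for illustration, not taken from the patch:

static inline void
vhost_log_write_sketch(struct virtio_net *dev, uint64_t addr, uint64_t len)
{
	/* Fast path: dirty logging is only armed while the LOG_ALL
	 * feature is negotiated and a log area is mapped, i.e. during
	 * live migration. Everything else is a predictable branch. */
	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
		   dev->log_base == 0 || len == 0))
		return;

	/* Slow path: set the dirty bitmap bits covering [addr, addr + len). */
}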
  
Michael S. Tsirkin June 1, 2016, 1:05 p.m. UTC | #3
On Wed, Jun 01, 2016 at 06:40:41AM +0000, Xie, Huawei wrote:
> On 5/3/2016 8:42 AM, Yuanhan Liu wrote:
> > Pre-update the used ring in batch for Tx and Rx at the stage where
> > all avail desc indexes are fetched. This reduces some cache misses
> > and hence increases performance a bit.
> >
> > Pre-updating is feasible because the guest driver will not start
> > processing those entries as long as we don't update "used->idx".
> > (I'm not 100% certain I haven't missed anything, though.)
> >
> > Cc: Michael S. Tsirkin <mst@redhat.com>
> > Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
> > ---
> >  lib/librte_vhost/vhost_rxtx.c | 58 +++++++++++++++++++++----------------------
> >  1 file changed, 28 insertions(+), 30 deletions(-)
> >
> > diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> > index c9cd1c5..2c3b810 100644
> > --- a/lib/librte_vhost/vhost_rxtx.c
> > +++ b/lib/librte_vhost/vhost_rxtx.c
> > @@ -137,7 +137,7 @@ copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
> >  
> >  static inline int __attribute__((always_inline))
> >  copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > -		  struct rte_mbuf *m, uint16_t desc_idx, uint32_t *copied)
> > +		  struct rte_mbuf *m, uint16_t desc_idx)
> >  {
> >  	uint32_t desc_avail, desc_offset;
> >  	uint32_t mbuf_avail, mbuf_offset;
> > @@ -161,7 +161,6 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
> >  	desc_offset = dev->vhost_hlen;
> >  	desc_avail  = desc->len - dev->vhost_hlen;
> >  
> > -	*copied = rte_pktmbuf_pkt_len(m);
> >  	mbuf_avail  = rte_pktmbuf_data_len(m);
> >  	mbuf_offset = 0;
> >  	while (mbuf_avail != 0 || m->next != NULL) {
> > @@ -262,6 +261,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
> >  	struct vhost_virtqueue *vq;
> >  	uint16_t res_start_idx, res_end_idx;
> >  	uint16_t desc_indexes[MAX_PKT_BURST];
> > +	uint16_t used_idx;
> >  	uint32_t i;
> >  
> >  	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
> > @@ -285,27 +285,29 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
> >  	/* Retrieve all of the desc indexes first to avoid caching issues. */
> >  	rte_prefetch0(&vq->avail->ring[res_start_idx & (vq->size - 1)]);
> >  	for (i = 0; i < count; i++) {
> > -		desc_indexes[i] = vq->avail->ring[(res_start_idx + i) &
> > -						  (vq->size - 1)];
> > +		used_idx = (res_start_idx + i) & (vq->size - 1);
> > +		desc_indexes[i] = vq->avail->ring[used_idx];
> > +		vq->used->ring[used_idx].id = desc_indexes[i];
> > +		vq->used->ring[used_idx].len = pkts[i]->pkt_len +
> > +					       dev->vhost_hlen;
> > +		vhost_log_used_vring(dev, vq,
> > +			offsetof(struct vring_used, ring[used_idx]),
> > +			sizeof(vq->used->ring[used_idx]));
> >  	}
> >  
> >  	rte_prefetch0(&vq->desc[desc_indexes[0]]);
> >  	for (i = 0; i < count; i++) {
> >  		uint16_t desc_idx = desc_indexes[i];
> > -		uint16_t used_idx = (res_start_idx + i) & (vq->size - 1);
> > -		uint32_t copied;
> >  		int err;
> >  
> > -		err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx, &copied);
> > -
> > -		vq->used->ring[used_idx].id = desc_idx;
> > -		if (unlikely(err))
> > +		err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
> > +		if (unlikely(err)) {
> > +			used_idx = (res_start_idx + i) & (vq->size - 1);
> >  			vq->used->ring[used_idx].len = dev->vhost_hlen;
> > -		else
> > -			vq->used->ring[used_idx].len = copied + dev->vhost_hlen;
> > -		vhost_log_used_vring(dev, vq,
> > -			offsetof(struct vring_used, ring[used_idx]),
> > -			sizeof(vq->used->ring[used_idx]));
> > +			vhost_log_used_vring(dev, vq,
> > +				offsetof(struct vring_used, ring[used_idx]),
> > +				sizeof(vq->used->ring[used_idx]));
> > +		}
> >  
> >  		if (i + 1 < count)
> >  			rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
> > @@ -879,6 +881,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
> >  	/* Prefetch available ring to retrieve head indexes. */
> >  	used_idx = vq->last_used_idx & (vq->size - 1);
> >  	rte_prefetch0(&vq->avail->ring[used_idx]);
> > +	rte_prefetch0(&vq->used->ring[used_idx]);
> >  
> >  	count = RTE_MIN(count, MAX_PKT_BURST);
> >  	count = RTE_MIN(count, free_entries);
> > @@ -887,22 +890,23 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
> >  
> >  	/* Retrieve all of the head indexes first to avoid caching issues. */
> >  	for (i = 0; i < count; i++) {
> > -		desc_indexes[i] = vq->avail->ring[(vq->last_used_idx + i) &
> > -					(vq->size - 1)];
> > +		used_idx = (vq->last_used_idx + i) & (vq->size - 1);
> > +		desc_indexes[i] = vq->avail->ring[used_idx];
> > +
> > +		vq->used->ring[used_idx].id  = desc_indexes[i];
> > +		vq->used->ring[used_idx].len = 0;
> > +		vhost_log_used_vring(dev, vq,
> > +				offsetof(struct vring_used, ring[used_idx]),
> > +				sizeof(vq->used->ring[used_idx]));
> >  	}
> >  
> >  	/* Prefetch descriptor index. */
> >  	rte_prefetch0(&vq->desc[desc_indexes[0]]);
> > -	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
> > -
> >  	for (i = 0; i < count; i++) {
> >  		int err;
> >  
> > -		if (likely(i + 1 < count)) {
> > +		if (likely(i + 1 < count))
> >  			rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
> > -			rte_prefetch0(&vq->used->ring[(used_idx + 1) &
> > -						      (vq->size - 1)]);
> > -		}
> >  
> >  		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
> >  		if (unlikely(pkts[i] == NULL)) {
> > @@ -916,18 +920,12 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
> >  			rte_pktmbuf_free(pkts[i]);
> >  			break;
> >  		}
> > -
> > -		used_idx = vq->last_used_idx++ & (vq->size - 1);
> > -		vq->used->ring[used_idx].id  = desc_indexes[i];
> > -		vq->used->ring[used_idx].len = 0;
> > -		vhost_log_used_vring(dev, vq,
> > -				offsetof(struct vring_used, ring[used_idx]),
> > -				sizeof(vq->used->ring[used_idx]));
> >  	}
> 
> Had tried post-updating the used ring in batch, but I forget the perf change.
> 
> One optimization would be on vhost_log_used_vring.
> I have two ideas:
> a) On the QEMU side, always assume the used ring will be changed, so
> that we don't need to log the used ring in vhost.
>
> Michael: is that feasible in QEMU? Comments on this?

To avoid breaking old QEMU, we'll need a new protocol feature bit,
but generally this sounds reasonable.
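
A hypothetical shape of that negotiation (the bit name, its value, and the
field it is checked against are all made up for illustration):

#define VHOST_USER_PROTOCOL_F_USED_RING_ASSUMED_DIRTY	15	/* hypothetical */

static inline int
used_ring_log_needed(struct virtio_net *dev)
{
	/* Old QEMU: the bit was never negotiated, so keep logging the
	 * used ring entry by entry as before. */
	return !(dev->protocol_features &
		 (1ULL << VHOST_USER_PROTOCOL_F_USED_RING_ASSUMED_DIRTY));
}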

> b) We could always mark the whole used ring as modified, rather than
> logging each entry individually.
> 
> >  
> >  	rte_smp_wmb();
> >  	rte_smp_rmb();
> >  	vq->used->idx += i;
> > +	vq->last_used_idx += i;
> >  	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
> >  			sizeof(vq->used->idx));
> >  
>
  
Huawei Xie June 3, 2016, 8:18 a.m. UTC | #4
On 6/1/2016 2:53 PM, Yuanhan Liu wrote:
> On Wed, Jun 01, 2016 at 06:40:41AM +0000, Xie, Huawei wrote:
>>>  	/* Retrieve all of the head indexes first to avoid caching issues. */
>>>  	for (i = 0; i < count; i++) {
>>> -		desc_indexes[i] = vq->avail->ring[(vq->last_used_idx + i) &
>>> -					(vq->size - 1)];
>>> +		used_idx = (vq->last_used_idx + i) & (vq->size - 1);
>>> +		desc_indexes[i] = vq->avail->ring[used_idx];
>>> +
>>> +		vq->used->ring[used_idx].id  = desc_indexes[i];
>>> +		vq->used->ring[used_idx].len = 0;
>>> +		vhost_log_used_vring(dev, vq,
>>> +				offsetof(struct vring_used, ring[used_idx]),
>>> +				sizeof(vq->used->ring[used_idx]));
>>>  	}
>>>  
>>>  	/* Prefetch descriptor index. */
>>>  	rte_prefetch0(&vq->desc[desc_indexes[0]]);
>>> -	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
>>> -
>>>  	for (i = 0; i < count; i++) {
>>>  		int err;
>>>  
>>> -		if (likely(i + 1 < count)) {
>>> +		if (likely(i + 1 < count))
>>>  			rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
>>> -			rte_prefetch0(&vq->used->ring[(used_idx + 1) &
>>> -						      (vq->size - 1)]);
>>> -		}
>>>  
>>>  		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
>>>  		if (unlikely(pkts[i] == NULL)) {
>>> @@ -916,18 +920,12 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>>>  			rte_pktmbuf_free(pkts[i]);
>>>  			break;
>>>  		}
>>> -
>>> -		used_idx = vq->last_used_idx++ & (vq->size - 1);
>>> -		vq->used->ring[used_idx].id  = desc_indexes[i];
>>> -		vq->used->ring[used_idx].len = 0;
>>> -		vhost_log_used_vring(dev, vq,
>>> -				offsetof(struct vring_used, ring[used_idx]),
>>> -				sizeof(vq->used->ring[used_idx]));
>>>  	}
>> Had tried post-updating the used ring in batch, but I forget the perf change.
> I would assume pre-updating gives a better performance gain, as we are
> fiddling with the avail and used rings together, which would be more
> cache-friendly.
 
The distance between an avail ring entry and the corresponding used
ring entry is at least 8 cache lines, so they never share a line.
The benefit comes from the batched updates, if applicable.
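
A back-of-the-envelope check of that distance, assuming 64-byte cache
lines and the standard split-ring element sizes (purely illustrative):

#include <stdint.h>
#include <stdio.h>

/* Split-ring used element: 8 bytes per entry. */
struct vring_used_elem {
	uint32_t id;
	uint32_t len;
};

int main(void)
{
	unsigned int size = 256;	/* a typical vq->size */
	unsigned int cl   = 64;		/* cache line size */
	unsigned int avail_bytes = size * sizeof(uint16_t);
	unsigned int used_bytes  = size * sizeof(struct vring_used_elem);

	/* avail->ring[i] and used->ring[i] sit in different rings, so even
	 * for i == 0 they are separated by at least the avail ring itself:
	 * 256 * 2 B = 512 B = 8 cache lines, before any alignment padding. */
	printf("avail ring: %u cache lines, used ring: %u cache lines\n",
	       avail_bytes / cl, used_bytes / cl);
	return 0;
}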

>
>> One optimization would be on vhost_log_used_vring.
>> I have two ideas:
>> a) On the QEMU side, always assume the used ring will be changed, so
>> that we don't need to log the used ring in vhost.
>>
>> Michael: is that feasible in QEMU? Comments on this?
>>
>> b) We could always mark the whole used ring as modified, rather than
>> logging each entry individually.
> I doubt it's worthwhile. One fact is that vhost_log_used_vring is
> a no-op most of the time: it only takes action during the short
> window of live migration.
>
> And FYI, I even tried removing all the vhost_log_xxx calls; it showed
> no performance boost at all. Therefore, it's not a factor that will
> impact performance.

I knew this.

> 	--yliu
>
  

Patch

diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
index c9cd1c5..2c3b810 100644
--- a/lib/librte_vhost/vhost_rxtx.c
+++ b/lib/librte_vhost/vhost_rxtx.c
@@ -137,7 +137,7 @@  copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
 
 static inline int __attribute__((always_inline))
 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
-		  struct rte_mbuf *m, uint16_t desc_idx, uint32_t *copied)
+		  struct rte_mbuf *m, uint16_t desc_idx)
 {
 	uint32_t desc_avail, desc_offset;
 	uint32_t mbuf_avail, mbuf_offset;
@@ -161,7 +161,6 @@  copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	desc_offset = dev->vhost_hlen;
 	desc_avail  = desc->len - dev->vhost_hlen;
 
-	*copied = rte_pktmbuf_pkt_len(m);
 	mbuf_avail  = rte_pktmbuf_data_len(m);
 	mbuf_offset = 0;
 	while (mbuf_avail != 0 || m->next != NULL) {
@@ -262,6 +261,7 @@  virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 	struct vhost_virtqueue *vq;
 	uint16_t res_start_idx, res_end_idx;
 	uint16_t desc_indexes[MAX_PKT_BURST];
+	uint16_t used_idx;
 	uint32_t i;
 
 	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
@@ -285,27 +285,29 @@  virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 	/* Retrieve all of the desc indexes first to avoid caching issues. */
 	rte_prefetch0(&vq->avail->ring[res_start_idx & (vq->size - 1)]);
 	for (i = 0; i < count; i++) {
-		desc_indexes[i] = vq->avail->ring[(res_start_idx + i) &
-						  (vq->size - 1)];
+		used_idx = (res_start_idx + i) & (vq->size - 1);
+		desc_indexes[i] = vq->avail->ring[used_idx];
+		vq->used->ring[used_idx].id = desc_indexes[i];
+		vq->used->ring[used_idx].len = pkts[i]->pkt_len +
+					       dev->vhost_hlen;
+		vhost_log_used_vring(dev, vq,
+			offsetof(struct vring_used, ring[used_idx]),
+			sizeof(vq->used->ring[used_idx]));
 	}
 
 	rte_prefetch0(&vq->desc[desc_indexes[0]]);
 	for (i = 0; i < count; i++) {
 		uint16_t desc_idx = desc_indexes[i];
-		uint16_t used_idx = (res_start_idx + i) & (vq->size - 1);
-		uint32_t copied;
 		int err;
 
-		err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx, &copied);
-
-		vq->used->ring[used_idx].id = desc_idx;
-		if (unlikely(err))
+		err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
+		if (unlikely(err)) {
+			used_idx = (res_start_idx + i) & (vq->size - 1);
 			vq->used->ring[used_idx].len = dev->vhost_hlen;
-		else
-			vq->used->ring[used_idx].len = copied + dev->vhost_hlen;
-		vhost_log_used_vring(dev, vq,
-			offsetof(struct vring_used, ring[used_idx]),
-			sizeof(vq->used->ring[used_idx]));
+			vhost_log_used_vring(dev, vq,
+				offsetof(struct vring_used, ring[used_idx]),
+				sizeof(vq->used->ring[used_idx]));
+		}
 
 		if (i + 1 < count)
 			rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
@@ -879,6 +881,7 @@  rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 	/* Prefetch available ring to retrieve head indexes. */
 	used_idx = vq->last_used_idx & (vq->size - 1);
 	rte_prefetch0(&vq->avail->ring[used_idx]);
+	rte_prefetch0(&vq->used->ring[used_idx]);
 
 	count = RTE_MIN(count, MAX_PKT_BURST);
 	count = RTE_MIN(count, free_entries);
@@ -887,22 +890,23 @@  rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 
 	/* Retrieve all of the head indexes first to avoid caching issues. */
 	for (i = 0; i < count; i++) {
-		desc_indexes[i] = vq->avail->ring[(vq->last_used_idx + i) &
-					(vq->size - 1)];
+		used_idx = (vq->last_used_idx + i) & (vq->size - 1);
+		desc_indexes[i] = vq->avail->ring[used_idx];
+
+		vq->used->ring[used_idx].id  = desc_indexes[i];
+		vq->used->ring[used_idx].len = 0;
+		vhost_log_used_vring(dev, vq,
+				offsetof(struct vring_used, ring[used_idx]),
+				sizeof(vq->used->ring[used_idx]));
 	}
 
 	/* Prefetch descriptor index. */
 	rte_prefetch0(&vq->desc[desc_indexes[0]]);
-	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
-
 	for (i = 0; i < count; i++) {
 		int err;
 
-		if (likely(i + 1 < count)) {
+		if (likely(i + 1 < count))
 			rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
-			rte_prefetch0(&vq->used->ring[(used_idx + 1) &
-						      (vq->size - 1)]);
-		}
 
 		pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
 		if (unlikely(pkts[i] == NULL)) {
@@ -916,18 +920,12 @@  rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 			rte_pktmbuf_free(pkts[i]);
 			break;
 		}
-
-		used_idx = vq->last_used_idx++ & (vq->size - 1);
-		vq->used->ring[used_idx].id  = desc_indexes[i];
-		vq->used->ring[used_idx].len = 0;
-		vhost_log_used_vring(dev, vq,
-				offsetof(struct vring_used, ring[used_idx]),
-				sizeof(vq->used->ring[used_idx]));
 	}
 
 	rte_smp_wmb();
 	rte_smp_rmb();
 	vq->used->idx += i;
+	vq->last_used_idx += i;
 	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
 			sizeof(vq->used->idx));