[dpdk-dev,v5,2/6] vhost: rewrite enqueue

Message ID 1473392368-84903-3-git-send-email-zhihong.wang@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Yuanhan Liu
Headers

Commit Message

Zhihong Wang Sept. 9, 2016, 3:39 a.m. UTC
  This patch implements the vhost logic from scratch into a single function
designed for high performance and better maintainability.

This is the baseline version of the new code, more optimization will be
added in the following patches in this patch set.

Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
---
Changes in v5:

 1. Rebase to the latest branch.

 2. Rename variables to keep consistent in naming style.

 3. Small changes like return value adjustment and vertical alignment.

---
Changes in v4:

 1. Refactor the code for clearer logic.

 2. Add PRINT_PACKET for debugging.

---
Changes in v3:

 1. Rewrite enqueue and delete the obsolete in the same patch.

 lib/librte_vhost/virtio_net.c | 514 ++++++++++++------------------------------
 1 file changed, 138 insertions(+), 376 deletions(-)
  

Comments

Maxime Coquelin Sept. 12, 2016, 3:42 p.m. UTC | #1
Hi,

On 09/09/2016 05:39 AM, Zhihong Wang wrote:
> This patch implements the vhost logic from scratch into a single function
> designed for high performance and better maintainability.
>
> This is the baseline version of the new code, more optimization will be
> added in the following patches in this patch set.
>
> Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> ---
> Changes in v5:
>
>  1. Rebase to the latest branch.
>
>  2. Rename variables to keep consistent in naming style.
>
>  3. Small changes like return value adjustment and vertical alignment.
>
> ---
> Changes in v4:
>
>  1. Refactor the code for clearer logic.
>
>  2. Add PRINT_PACKET for debugging.
>
> ---
> Changes in v3:
>
>  1. Rewrite enqueue and delete the obsolete in the same patch.
>
>  lib/librte_vhost/virtio_net.c | 514 ++++++++++++------------------------------
>  1 file changed, 138 insertions(+), 376 deletions(-)
>
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 0d6e7d9..6f63968 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -91,7 +91,7 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
>  	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
>  }
>
> -static void
> +static inline void __attribute__((always_inline))
>  virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
>  {
>  	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
> @@ -112,6 +112,10 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
>  						cksum));
>  			break;
>  		}
> +	} else {
> +		net_hdr->flags       = 0;
> +		net_hdr->csum_start  = 0;
> +		net_hdr->csum_offset = 0;
>  	}
>
>  	if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
> @@ -122,439 +126,197 @@ virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
>  		net_hdr->gso_size = m_buf->tso_segsz;
>  		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
>  					+ m_buf->l4_len;
> +	} else {
> +		net_hdr->gso_type = 0;
> +		net_hdr->hdr_len  = 0;
> +		net_hdr->gso_size = 0;
>  	}
>  }
>
> -static inline void
> -copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
> -		    struct virtio_net_hdr_mrg_rxbuf hdr)
> +static inline void __attribute__((always_inline))
> +update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +		uint32_t desc_chain_head, uint32_t desc_chain_len)
>  {
> -	if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
> -		*(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
> -	else
> -		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
> +	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
> +
> +	vq->used->ring[used_idx].id = desc_chain_head;
> +	vq->used->ring[used_idx].len = desc_chain_len;
> +	vq->last_used_idx++;
> +	vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
> +				ring[used_idx]),
> +			sizeof(vq->used->ring[used_idx]));
>  }
>
>  static inline int __attribute__((always_inline))
> -copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
> -		  struct rte_mbuf *m, uint16_t desc_idx)
> +enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +		uint16_t avail_idx, struct rte_mbuf *mbuf,
> +		uint32_t is_mrg_rxbuf)
>  {
> -	uint32_t desc_avail, desc_offset;
> -	uint32_t mbuf_avail, mbuf_offset;
> -	uint32_t cpy_len;
> +	struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
>  	struct vring_desc *desc;
>  	uint64_t desc_addr;
> -	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
> +	uint32_t desc_chain_head;
> +	uint32_t desc_chain_len;
> +	uint32_t desc_current;
> +	uint32_t desc_offset;
> +	uint32_t mbuf_len;
> +	uint32_t mbuf_avail;
> +	uint32_t cpy_len;
> +	uint32_t num_buffers = 0;
>
> -	desc = &vq->desc[desc_idx];
> +	/* start with the first mbuf of the packet */
> +	mbuf_len = rte_pktmbuf_data_len(mbuf);
> +	mbuf_avail = mbuf_len;
> +
> +	/* get the current desc */
> +	desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
> +	desc_chain_head = desc_current;
> +	desc = &vq->desc[desc_current];
>  	desc_addr = gpa_to_vva(dev, desc->addr);
> -	/*
> -	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
> -	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
> -	 * otherwise stores offset on the stack instead of in a register.
> -	 */
> -	if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
> -		return -1;
> +	if (unlikely(!desc_addr))
> +		goto error;
>
> -	rte_prefetch0((void *)(uintptr_t)desc_addr);
> +	/* handle virtio header */
> +	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
> +	virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
> +	if (is_mrg_rxbuf)
> +		virtio_hdr->num_buffers = 1;
>
> -	virtio_enqueue_offload(m, &virtio_hdr.hdr);
> -	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
>  	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
>  	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
> -
>  	desc_offset = dev->vhost_hlen;
> -	desc_avail  = desc->len - dev->vhost_hlen;
> -
> -	mbuf_avail  = rte_pktmbuf_data_len(m);
> -	mbuf_offset = 0;
> -	while (mbuf_avail != 0 || m->next != NULL) {
> -		/* done with current mbuf, fetch next */
> -		if (mbuf_avail == 0) {
> -			m = m->next;
> -
> -			mbuf_offset = 0;
> -			mbuf_avail  = rte_pktmbuf_data_len(m);
> +	desc_chain_len = desc_offset;
> +	desc_addr += desc_offset;
> +
> +	/* start copy from mbuf to desc */
> +	while (mbuf_avail || mbuf->next) {
> +		/* get the next mbuf if the current done */
> +		if (!mbuf_avail) {
> +			mbuf = mbuf->next;
> +			mbuf_len = rte_pktmbuf_data_len(mbuf);
> +			mbuf_avail = mbuf_len;
>  		}
>
> -		/* done with current desc buf, fetch next */
> -		if (desc_avail == 0) {
> -			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
> -				/* Room in vring buffer is not enough */
> -				return -1;
> -			}
> -			if (unlikely(desc->next >= vq->size))
> -				return -1;
> +		/* get the next desc if the current done */
> +		if (desc->len <= desc_offset) {
> +			if (desc->flags & VRING_DESC_F_NEXT) {
> +				/* go on with the current desc chain */
> +				desc_offset = 0;
> +				desc_current = desc->next;
> +				desc = &vq->desc[desc_current];
> +				desc_addr = gpa_to_vva(dev, desc->addr);
> +				if (unlikely(!desc_addr))
> +					goto error;
> +			} else if (is_mrg_rxbuf) {
> +				/* start with the next desc chain */
> +				update_used_ring(dev, vq, desc_chain_head,
> +						desc_chain_len);
> +				num_buffers++;
> +				virtio_hdr->num_buffers++;
> +				if (avail_idx == vq->last_used_idx)
> +					goto error;
> +
> +				desc_current =
> +					vq->avail->ring[(vq->last_used_idx) &
> +					(vq->size - 1)];
> +				desc_chain_head = desc_current;
> +				desc = &vq->desc[desc_current];
> +				desc_addr = gpa_to_vva(dev, desc->addr);
> +				if (unlikely(!desc_addr))
> +					goto error;
>
> -			desc = &vq->desc[desc->next];
> -			desc_addr = gpa_to_vva(dev, desc->addr);
> -			if (unlikely(!desc_addr))
> -				return -1;
> -
> -			desc_offset = 0;
> -			desc_avail  = desc->len;
> +				desc_chain_len = 0;
> +				desc_offset = 0;
As I commented on v3, there is code duplication between next flag, and 
mrg buf cases:
desc_offset = 0;

and:

desc = &vq->desc[desc_current];
desc_addr = gpa_to_vva(dev, desc->addr);
if (unlikely(!desc_addr))
     goto error;

Regards,
Maxime
  
Maxime Coquelin Sept. 12, 2016, 4:26 p.m. UTC | #2
On 09/09/2016 05:39 AM, Zhihong Wang wrote:
>
> +static inline void __attribute__((always_inline))
> +notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
> +{
>  	rte_smp_wmb();
> -
> -	*(volatile uint16_t *)&vq->used->idx += count;
> -	vq->last_used_idx += count;
> -	vhost_log_used_vring(dev, vq,
> -		offsetof(struct vring_used, idx),
> -		sizeof(vq->used->idx));
> -
> -	/* flush used->idx update before we read avail->flags. */
Please don't remove comments if not justified.
Here the comment is important, as it explains why the barrier is needed.
> +	*(volatile uint16_t *)&vq->used->idx = vq->last_used_idx;
> +	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
> +			sizeof(vq->used->idx));
>  	rte_mb();
> -
> -	/* Kick the guest if necessary. */
>  	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
>  			&& (vq->callfd >= 0))
>  		eventfd_write(vq->callfd, (eventfd_t)1);
> -	return count;
>  }
  
Zhihong Wang Sept. 14, 2016, 8:20 a.m. UTC | #3
> > +				desc_current =
> > +					vq->avail->ring[(vq->last_used_idx)
> &
> > +					(vq->size - 1)];
> > +				desc_chain_head = desc_current;
> > +				desc = &vq->desc[desc_current];
> > +				desc_addr = gpa_to_vva(dev, desc->addr);
> > +				if (unlikely(!desc_addr))
> > +					goto error;
> >
> > -			desc = &vq->desc[desc->next];
> > -			desc_addr = gpa_to_vva(dev, desc->addr);
> > -			if (unlikely(!desc_addr))
> > -				return -1;
> > -
> > -			desc_offset = 0;
> > -			desc_avail  = desc->len;
> > +				desc_chain_len = 0;
> > +				desc_offset = 0;
> As I commented on v3, there is code duplication between next flag, and
> mrg buf cases:
> desc_offset = 0;
> 
> and:
> 
> desc = &vq->desc[desc_current];
> desc_addr = gpa_to_vva(dev, desc->addr);
> if (unlikely(!desc_addr))
>      goto error;
> 

Do you mean to add something like:

static inline int __attribute__((always_inline))
get_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
                uint32_t desc_idx, struct vring_desc **desc,
                uint64_t *desc_addr)
{
        *desc = &vq->desc[desc_idx];
        *desc_addr = gpa_to_vva(dev, (*desc)->addr);
        if (unlikely(!(*desc_addr)))
                return -1;

        return 0;
}


> Regards,
> Maxime
  
Zhihong Wang Sept. 14, 2016, 8:22 a.m. UTC | #4
> -----Original Message-----
> From: Maxime Coquelin [mailto:maxime.coquelin@redhat.com]
> Sent: Tuesday, September 13, 2016 12:27 AM
> To: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org
> Cc: yuanhan.liu@linux.intel.com; thomas.monjalon@6wind.com
> Subject: Re: [PATCH v5 2/6] vhost: rewrite enqueue
> 
> 
> 
> On 09/09/2016 05:39 AM, Zhihong Wang wrote:
> >
> > +static inline void __attribute__((always_inline))
> > +notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
> > +{
> >  	rte_smp_wmb();
> > -
> > -	*(volatile uint16_t *)&vq->used->idx += count;
> > -	vq->last_used_idx += count;
> > -	vhost_log_used_vring(dev, vq,
> > -		offsetof(struct vring_used, idx),
> > -		sizeof(vq->used->idx));
> > -
> > -	/* flush used->idx update before we read avail->flags. */
> Please don't remove comments if not justified.
> Here the comment is important, as it explains why the barrier is needed.

Okay.

> > +	*(volatile uint16_t *)&vq->used->idx = vq->last_used_idx;
> > +	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
> > +			sizeof(vq->used->idx));
> >  	rte_mb();
> > -
> > -	/* Kick the guest if necessary. */
> >  	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
> >  			&& (vq->callfd >= 0))
> >  		eventfd_write(vq->callfd, (eventfd_t)1);
> > -	return count;
> >  }
  
Maxime Coquelin Sept. 15, 2016, 4:35 p.m. UTC | #5
Hi,
On 09/14/2016 10:20 AM, Wang, Zhihong wrote:
>>> +				desc_current =
>>> +					vq->avail->ring[(vq->last_used_idx)
>> &
>>> +					(vq->size - 1)];
>>> +				desc_chain_head = desc_current;
>>> +				desc = &vq->desc[desc_current];
>>> +				desc_addr = gpa_to_vva(dev, desc->addr);
>>> +				if (unlikely(!desc_addr))
>>> +					goto error;
>>>
>>> -			desc = &vq->desc[desc->next];
>>> -			desc_addr = gpa_to_vva(dev, desc->addr);
>>> -			if (unlikely(!desc_addr))
>>> -				return -1;
>>> -
>>> -			desc_offset = 0;
>>> -			desc_avail  = desc->len;
>>> +				desc_chain_len = 0;
>>> +				desc_offset = 0;
>> As I commented on v3, there is code duplication between next flag, and
>> mrg buf cases:
>> desc_offset = 0;
>>
>> and:
>>
>> desc = &vq->desc[desc_current];
>> desc_addr = gpa_to_vva(dev, desc->addr);
>> if (unlikely(!desc_addr))
>>      goto error;
>>
>
> Do you mean to add something like:
>
> static inline int __attribute__((always_inline))
> get_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
>                 uint32_t desc_idx, struct vring_desc **desc,
>                 uint64_t *desc_addr)
> {
>         *desc = &vq->desc[desc_idx];
>         *desc_addr = gpa_to_vva(dev, (*desc)->addr);
>         if (unlikely(!(*desc_addr)))
>                 return -1;
>
>         return 0;
> }

I meant, move this code after the if/else.
You can do it in a function if it is done elsewhere in the file.
  
Yuanhan Liu Sept. 18, 2016, 2:19 p.m. UTC | #6
On Thu, Sep 08, 2016 at 11:39:24PM -0400, Zhihong Wang wrote:
> This patch implements the vhost logic from scratch into a single function
> designed for high performance and better maintainability.

As always, your commit log just states what have been done, but doesn't
tell why such changes have been made. For example, you said "it's designed
for high performance", then you'd better explain why your version would
introduce high performance. You need a reason, as well as some numbers
(percent change) to prove it: it's not that right to keep the numbers
inside: I'm sure people outside intel are also willing and happy to know
those numbers.

For this patch, I think it's more about the maintainability improvement
but not performance: the performance tunning patches are done later
after all.

Another example is, in patch 6, you said "It reduces CPU pipeline stall
cycles significantly", but you didn't say why there is pipeline stall
before and why your patch reduces it.

All those are important things that deserves some explanation. So, I'd
ask you to re-visit all your patches in this set, to think what you
could add to make the commit better and more informative.

Besides that, I think this patchset looks fine to me. I may just need
another time to look it more carefully, then I think I can merge (v6).

BTW, thanks for the great work!

	--yliu

> This is the baseline version of the new code, more optimization will be
> added in the following patches in this patch set.
> 
> Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> ---
  
Zhihong Wang Sept. 19, 2016, 3:29 a.m. UTC | #7
> -----Original Message-----
> From: Yuanhan Liu [mailto:yuanhan.liu@linux.intel.com]
> Sent: Sunday, September 18, 2016 10:19 PM
> To: Wang, Zhihong <zhihong.wang@intel.com>
> Cc: dev@dpdk.org; maxime.coquelin@redhat.com;
> thomas.monjalon@6wind.com
> Subject: Re: [PATCH v5 2/6] vhost: rewrite enqueue
> 
> On Thu, Sep 08, 2016 at 11:39:24PM -0400, Zhihong Wang wrote:
> > This patch implements the vhost logic from scratch into a single function
> > designed for high performance and better maintainability.
> 
> As always, your commit log just states what have been done, but doesn't
> tell why such changes have been made. For example, you said "it's designed
> for high performance", then you'd better explain why your version would
> introduce high performance. You need a reason, as well as some numbers
> (percent change) to prove it: it's not that right to keep the numbers
> inside: I'm sure people outside intel are also willing and happy to know
> those numbers.
> 
> For this patch, I think it's more about the maintainability improvement
> but not performance: the performance tunning patches are done later
> after all.
> 
> Another example is, in patch 6, you said "It reduces CPU pipeline stall
> cycles significantly", but you didn't say why there is pipeline stall
> before and why your patch reduces it.
> 
> All those are important things that deserves some explanation. So, I'd
> ask you to re-visit all your patches in this set, to think what you
> could add to make the commit better and more informative.

Okay. I'll add more detailed commit log.

> 
> Besides that, I think this patchset looks fine to me. I may just need
> another time to look it more carefully, then I think I can merge (v6).
> 
> BTW, thanks for the great work!
> 
> 	--yliu
> 
> > This is the baseline version of the new code, more optimization will be
> > added in the following patches in this patch set.
> >
> > Signed-off-by: Zhihong Wang <zhihong.wang@intel.com>
> > ---
  

Patch

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 0d6e7d9..6f63968 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -91,7 +91,7 @@  is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t qp_nb)
 	return (is_tx ^ (idx & 1)) == 0 && idx < qp_nb * VIRTIO_QNUM;
 }
 
-static void
+static inline void __attribute__((always_inline))
 virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 {
 	if (m_buf->ol_flags & PKT_TX_L4_MASK) {
@@ -112,6 +112,10 @@  virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 						cksum));
 			break;
 		}
+	} else {
+		net_hdr->flags       = 0;
+		net_hdr->csum_start  = 0;
+		net_hdr->csum_offset = 0;
 	}
 
 	if (m_buf->ol_flags & PKT_TX_TCP_SEG) {
@@ -122,439 +126,197 @@  virtio_enqueue_offload(struct rte_mbuf *m_buf, struct virtio_net_hdr *net_hdr)
 		net_hdr->gso_size = m_buf->tso_segsz;
 		net_hdr->hdr_len = m_buf->l2_len + m_buf->l3_len
 					+ m_buf->l4_len;
+	} else {
+		net_hdr->gso_type = 0;
+		net_hdr->hdr_len  = 0;
+		net_hdr->gso_size = 0;
 	}
 }
 
-static inline void
-copy_virtio_net_hdr(struct virtio_net *dev, uint64_t desc_addr,
-		    struct virtio_net_hdr_mrg_rxbuf hdr)
+static inline void __attribute__((always_inline))
+update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint32_t desc_chain_head, uint32_t desc_chain_len)
 {
-	if (dev->vhost_hlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
-		*(struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr = hdr;
-	else
-		*(struct virtio_net_hdr *)(uintptr_t)desc_addr = hdr.hdr;
+	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
+
+	vq->used->ring[used_idx].id = desc_chain_head;
+	vq->used->ring[used_idx].len = desc_chain_len;
+	vq->last_used_idx++;
+	vhost_log_used_vring(dev, vq, offsetof(struct vring_used,
+				ring[used_idx]),
+			sizeof(vq->used->ring[used_idx]));
 }
 
 static inline int __attribute__((always_inline))
-copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
-		  struct rte_mbuf *m, uint16_t desc_idx)
+enqueue_packet(struct virtio_net *dev, struct vhost_virtqueue *vq,
+		uint16_t avail_idx, struct rte_mbuf *mbuf,
+		uint32_t is_mrg_rxbuf)
 {
-	uint32_t desc_avail, desc_offset;
-	uint32_t mbuf_avail, mbuf_offset;
-	uint32_t cpy_len;
+	struct virtio_net_hdr_mrg_rxbuf *virtio_hdr;
 	struct vring_desc *desc;
 	uint64_t desc_addr;
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
+	uint32_t desc_chain_head;
+	uint32_t desc_chain_len;
+	uint32_t desc_current;
+	uint32_t desc_offset;
+	uint32_t mbuf_len;
+	uint32_t mbuf_avail;
+	uint32_t cpy_len;
+	uint32_t num_buffers = 0;
 
-	desc = &vq->desc[desc_idx];
+	/* start with the first mbuf of the packet */
+	mbuf_len = rte_pktmbuf_data_len(mbuf);
+	mbuf_avail = mbuf_len;
+
+	/* get the current desc */
+	desc_current = vq->avail->ring[(vq->last_used_idx) & (vq->size - 1)];
+	desc_chain_head = desc_current;
+	desc = &vq->desc[desc_current];
 	desc_addr = gpa_to_vva(dev, desc->addr);
-	/*
-	 * Checking of 'desc_addr' placed outside of 'unlikely' macro to avoid
-	 * performance issue with some versions of gcc (4.8.4 and 5.3.0) which
-	 * otherwise stores offset on the stack instead of in a register.
-	 */
-	if (unlikely(desc->len < dev->vhost_hlen) || !desc_addr)
-		return -1;
+	if (unlikely(!desc_addr))
+		goto error;
 
-	rte_prefetch0((void *)(uintptr_t)desc_addr);
+	/* handle virtio header */
+	virtio_hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
+	virtio_enqueue_offload(mbuf, &(virtio_hdr->hdr));
+	if (is_mrg_rxbuf)
+		virtio_hdr->num_buffers = 1;
 
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
 	vhost_log_write(dev, desc->addr, dev->vhost_hlen);
 	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
 	desc_offset = dev->vhost_hlen;
-	desc_avail  = desc->len - dev->vhost_hlen;
-
-	mbuf_avail  = rte_pktmbuf_data_len(m);
-	mbuf_offset = 0;
-	while (mbuf_avail != 0 || m->next != NULL) {
-		/* done with current mbuf, fetch next */
-		if (mbuf_avail == 0) {
-			m = m->next;
-
-			mbuf_offset = 0;
-			mbuf_avail  = rte_pktmbuf_data_len(m);
+	desc_chain_len = desc_offset;
+	desc_addr += desc_offset;
+
+	/* start copy from mbuf to desc */
+	while (mbuf_avail || mbuf->next) {
+		/* get the next mbuf if the current done */
+		if (!mbuf_avail) {
+			mbuf = mbuf->next;
+			mbuf_len = rte_pktmbuf_data_len(mbuf);
+			mbuf_avail = mbuf_len;
 		}
 
-		/* done with current desc buf, fetch next */
-		if (desc_avail == 0) {
-			if ((desc->flags & VRING_DESC_F_NEXT) == 0) {
-				/* Room in vring buffer is not enough */
-				return -1;
-			}
-			if (unlikely(desc->next >= vq->size))
-				return -1;
+		/* get the next desc if the current done */
+		if (desc->len <= desc_offset) {
+			if (desc->flags & VRING_DESC_F_NEXT) {
+				/* go on with the current desc chain */
+				desc_offset = 0;
+				desc_current = desc->next;
+				desc = &vq->desc[desc_current];
+				desc_addr = gpa_to_vva(dev, desc->addr);
+				if (unlikely(!desc_addr))
+					goto error;
+			} else if (is_mrg_rxbuf) {
+				/* start with the next desc chain */
+				update_used_ring(dev, vq, desc_chain_head,
+						desc_chain_len);
+				num_buffers++;
+				virtio_hdr->num_buffers++;
+				if (avail_idx == vq->last_used_idx)
+					goto error;
+
+				desc_current =
+					vq->avail->ring[(vq->last_used_idx) &
+					(vq->size - 1)];
+				desc_chain_head = desc_current;
+				desc = &vq->desc[desc_current];
+				desc_addr = gpa_to_vva(dev, desc->addr);
+				if (unlikely(!desc_addr))
+					goto error;
 
-			desc = &vq->desc[desc->next];
-			desc_addr = gpa_to_vva(dev, desc->addr);
-			if (unlikely(!desc_addr))
-				return -1;
-
-			desc_offset = 0;
-			desc_avail  = desc->len;
+				desc_chain_len = 0;
+				desc_offset = 0;
+			} else
+				goto error;
 		}
 
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-			cpy_len);
+		/* copy mbuf data */
+		cpy_len = RTE_MIN(desc->len - desc_offset, mbuf_avail);
+		rte_memcpy((void *)(uintptr_t)desc_addr,
+				rte_pktmbuf_mtod_offset(mbuf, void *,
+					mbuf_len - mbuf_avail),
+				cpy_len);
 		vhost_log_write(dev, desc->addr + desc_offset, cpy_len);
-		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-			     cpy_len, 0);
-
-		mbuf_avail  -= cpy_len;
-		mbuf_offset += cpy_len;
-		desc_avail  -= cpy_len;
-		desc_offset += cpy_len;
-	}
-
-	return 0;
-}
-
-/**
- * This function adds buffers to the virtio devices RX virtqueue. Buffers can
- * be received from the physical port or from another virtio device. A packet
- * count is returned to indicate the number of packets that are succesfully
- * added to the RX queue. This function works when the mbuf is scattered, but
- * it doesn't support the mergeable feature.
- */
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
-	      struct rte_mbuf **pkts, uint32_t count)
-{
-	struct vhost_virtqueue *vq;
-	uint16_t avail_idx, free_entries, start_idx;
-	uint16_t desc_indexes[MAX_PKT_BURST];
-	uint16_t used_idx;
-	uint32_t i;
-
-	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-			dev->vid, __func__, queue_id);
-		return 0;
+		PRINT_PACKET(dev, (uintptr_t)desc_addr, cpy_len, 0);
+		mbuf_avail     -= cpy_len;
+		desc_addr      += cpy_len;
+		desc_offset    += cpy_len;
+		desc_chain_len += cpy_len;
 	}
 
-	vq = dev->virtqueue[queue_id];
-	if (unlikely(vq->enabled == 0))
-		return 0;
+	update_used_ring(dev, vq, desc_chain_head, desc_chain_len);
 
-	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-	start_idx = vq->last_used_idx;
-	free_entries = avail_idx - start_idx;
-	count = RTE_MIN(count, free_entries);
-	count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
-	if (count == 0)
-		return 0;
-
-	LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
-		dev->vid, start_idx, start_idx + count);
-
-	/* Retrieve all of the desc indexes first to avoid caching issues. */
-	rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
-	for (i = 0; i < count; i++) {
-		used_idx = (start_idx + i) & (vq->size - 1);
-		desc_indexes[i] = vq->avail->ring[used_idx];
-		vq->used->ring[used_idx].id = desc_indexes[i];
-		vq->used->ring[used_idx].len = pkts[i]->pkt_len +
-					       dev->vhost_hlen;
-		vhost_log_used_vring(dev, vq,
-			offsetof(struct vring_used, ring[used_idx]),
-			sizeof(vq->used->ring[used_idx]));
-	}
-
-	rte_prefetch0(&vq->desc[desc_indexes[0]]);
-	for (i = 0; i < count; i++) {
-		uint16_t desc_idx = desc_indexes[i];
-		int err;
+	return 0;
 
-		err = copy_mbuf_to_desc(dev, vq, pkts[i], desc_idx);
-		if (unlikely(err)) {
-			used_idx = (start_idx + i) & (vq->size - 1);
-			vq->used->ring[used_idx].len = dev->vhost_hlen;
-			vhost_log_used_vring(dev, vq,
-				offsetof(struct vring_used, ring[used_idx]),
-				sizeof(vq->used->ring[used_idx]));
-		}
+error:
+	/* rollback on any error if last_used_idx update on-the-fly */
+	vq->last_used_idx -= num_buffers;
 
-		if (i + 1 < count)
-			rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
-	}
+	return -1;
+}
 
+static inline void __attribute__((always_inline))
+notify_guest(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
 	rte_smp_wmb();
-
-	*(volatile uint16_t *)&vq->used->idx += count;
-	vq->last_used_idx += count;
-	vhost_log_used_vring(dev, vq,
-		offsetof(struct vring_used, idx),
-		sizeof(vq->used->idx));
-
-	/* flush used->idx update before we read avail->flags. */
+	*(volatile uint16_t *)&vq->used->idx = vq->last_used_idx;
+	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
+			sizeof(vq->used->idx));
 	rte_mb();
-
-	/* Kick the guest if necessary. */
 	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
 			&& (vq->callfd >= 0))
 		eventfd_write(vq->callfd, (eventfd_t)1);
-	return count;
 }
 
-static inline int
-fill_vec_buf(struct vhost_virtqueue *vq, uint32_t avail_idx,
-	     uint32_t *allocated, uint32_t *vec_idx,
-	     struct buf_vector *buf_vec)
-{
-	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
-	uint32_t vec_id = *vec_idx;
-	uint32_t len    = *allocated;
-
-	while (1) {
-		if (unlikely(vec_id >= BUF_VECTOR_MAX || idx >= vq->size))
-			return -1;
-
-		len += vq->desc[idx].len;
-		buf_vec[vec_id].buf_addr = vq->desc[idx].addr;
-		buf_vec[vec_id].buf_len  = vq->desc[idx].len;
-		buf_vec[vec_id].desc_idx = idx;
-		vec_id++;
-
-		if ((vq->desc[idx].flags & VRING_DESC_F_NEXT) == 0)
-			break;
-
-		idx = vq->desc[idx].next;
-	}
-
-	*allocated = len;
-	*vec_idx   = vec_id;
-
-	return 0;
-}
-
-/*
- * Returns -1 on fail, 0 on success
- */
-static inline int
-reserve_avail_buf_mergeable(struct vhost_virtqueue *vq, uint32_t size,
-			    uint16_t *end, struct buf_vector *buf_vec)
+uint16_t
+rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
+	struct rte_mbuf **pkts, uint16_t count)
 {
-	uint16_t cur_idx;
+	struct vhost_virtqueue *vq;
+	struct virtio_net *dev;
+	uint32_t is_mrg_rxbuf = 0;
+	uint32_t pkt_idx      = 0;
+	uint32_t pkt_left     = count;
 	uint16_t avail_idx;
-	uint32_t allocated = 0;
-	uint32_t vec_idx = 0;
-	uint16_t tries = 0;
-
-	cur_idx  = vq->last_used_idx;
-
-	while (1) {
-		avail_idx = *((volatile uint16_t *)&vq->avail->idx);
-		if (unlikely(cur_idx == avail_idx))
-			return -1;
-
-		if (unlikely(fill_vec_buf(vq, cur_idx, &allocated,
-					  &vec_idx, buf_vec) < 0))
-			return -1;
-
-		cur_idx++;
-		tries++;
-
-		if (allocated >= size)
-			break;
-
-		/*
-		 * if we tried all available ring items, and still
-		 * can't get enough buf, it means something abnormal
-		 * happened.
-		 */
-		if (unlikely(tries >= vq->size))
-			return -1;
-	}
 
-	*end = cur_idx;
-	return 0;
-}
-
-static inline uint32_t __attribute__((always_inline))
-copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
-			    uint16_t end_idx, struct rte_mbuf *m,
-			    struct buf_vector *buf_vec)
-{
-	struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
-	uint32_t vec_idx = 0;
-	uint16_t start_idx = vq->last_used_idx;
-	uint16_t cur_idx = start_idx;
-	uint64_t desc_addr;
-	uint32_t desc_chain_head;
-	uint32_t desc_chain_len;
-	uint32_t mbuf_offset, mbuf_avail;
-	uint32_t desc_offset, desc_avail;
-	uint32_t cpy_len;
-	uint16_t desc_idx, used_idx;
-
-	if (unlikely(m == NULL))
+	if (unlikely(!pkt_left))
 		return 0;
 
-	LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
-		dev->vid, cur_idx, end_idx);
+	pkt_left = RTE_MIN((uint32_t)MAX_PKT_BURST, pkt_left);
 
-	desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr)
+	dev = get_device(vid);
+	if (unlikely(!dev))
 		return 0;
 
-	rte_prefetch0((void *)(uintptr_t)desc_addr);
-
-	virtio_hdr.num_buffers = end_idx - start_idx;
-	LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
-		dev->vid, virtio_hdr.num_buffers);
-
-	virtio_enqueue_offload(m, &virtio_hdr.hdr);
-	copy_virtio_net_hdr(dev, desc_addr, virtio_hdr);
-	vhost_log_write(dev, buf_vec[vec_idx].buf_addr, dev->vhost_hlen);
-	PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
-
-	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
-	desc_offset = dev->vhost_hlen;
-	desc_chain_head = buf_vec[vec_idx].desc_idx;
-	desc_chain_len = desc_offset;
-
-	mbuf_avail  = rte_pktmbuf_data_len(m);
-	mbuf_offset = 0;
-	while (mbuf_avail != 0 || m->next != NULL) {
-		/* done with current desc buf, get the next one */
-		if (desc_avail == 0) {
-			desc_idx = buf_vec[vec_idx].desc_idx;
-			vec_idx++;
-
-			if (!(vq->desc[desc_idx].flags & VRING_DESC_F_NEXT)) {
-				/* Update used ring with desc information */
-				used_idx = cur_idx++ & (vq->size - 1);
-				vq->used->ring[used_idx].id = desc_chain_head;
-				vq->used->ring[used_idx].len = desc_chain_len;
-				vhost_log_used_vring(dev, vq,
-					offsetof(struct vring_used,
-						 ring[used_idx]),
-					sizeof(vq->used->ring[used_idx]));
-				desc_chain_head = buf_vec[vec_idx].desc_idx;
-				desc_chain_len = 0;
-			}
-
-			desc_addr = gpa_to_vva(dev, buf_vec[vec_idx].buf_addr);
-			if (unlikely(!desc_addr))
-				return 0;
-
-			/* Prefetch buffer address. */
-			rte_prefetch0((void *)(uintptr_t)desc_addr);
-			desc_offset = 0;
-			desc_avail  = buf_vec[vec_idx].buf_len;
-		}
-
-		/* done with current mbuf, get the next one */
-		if (mbuf_avail == 0) {
-			m = m->next;
-
-			mbuf_offset = 0;
-			mbuf_avail  = rte_pktmbuf_data_len(m);
-		}
-
-		cpy_len = RTE_MIN(desc_avail, mbuf_avail);
-		rte_memcpy((void *)((uintptr_t)(desc_addr + desc_offset)),
-			rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
-			cpy_len);
-		vhost_log_write(dev, buf_vec[vec_idx].buf_addr + desc_offset,
-			cpy_len);
-		PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-			cpy_len, 0);
-
-		mbuf_avail  -= cpy_len;
-		mbuf_offset += cpy_len;
-		desc_avail  -= cpy_len;
-		desc_offset += cpy_len;
-		desc_chain_len += cpy_len;
-	}
-
-	used_idx = cur_idx & (vq->size - 1);
-	vq->used->ring[used_idx].id = desc_chain_head;
-	vq->used->ring[used_idx].len = desc_chain_len;
-	vhost_log_used_vring(dev, vq,
-		offsetof(struct vring_used, ring[used_idx]),
-		sizeof(vq->used->ring[used_idx]));
-
-	return end_idx - start_idx;
-}
-
-static inline uint32_t __attribute__((always_inline))
-virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
-	struct rte_mbuf **pkts, uint32_t count)
-{
-	struct vhost_virtqueue *vq;
-	uint32_t pkt_idx = 0, nr_used = 0;
-	uint16_t end;
-	struct buf_vector buf_vec[BUF_VECTOR_MAX];
-
-	LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
-	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb))) {
-		RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
-			dev->vid, __func__, queue_id);
+	if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->virt_qp_nb)))
 		return 0;
-	}
 
 	vq = dev->virtqueue[queue_id];
-	if (unlikely(vq->enabled == 0))
+	if (unlikely(!vq->enabled))
 		return 0;
 
-	count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
-	if (count == 0)
-		return 0;
-
-	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
-		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
+	if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+		is_mrg_rxbuf = 1;
 
-		if (unlikely(reserve_avail_buf_mergeable(vq, pkt_len,
-							 &end, buf_vec) < 0)) {
-			LOG_DEBUG(VHOST_DATA,
-				"(%d) failed to get enough desc from vring\n",
-				dev->vid);
+	/* start enqueuing packets 1 by 1 */
+	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
+	while (pkt_left && avail_idx != vq->last_used_idx) {
+		if (enqueue_packet(dev, vq, avail_idx, pkts[pkt_idx],
+					is_mrg_rxbuf))
 			break;
-		}
-
-		nr_used = copy_mbuf_to_desc_mergeable(dev, vq, end,
-						      pkts[pkt_idx], buf_vec);
-		rte_smp_wmb();
 
-		*(volatile uint16_t *)&vq->used->idx += nr_used;
-		vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
-			sizeof(vq->used->idx));
-		vq->last_used_idx += nr_used;
+		pkt_idx++;
+		pkt_left--;
 	}
 
-	if (likely(pkt_idx)) {
-		/* flush used->idx update before we read avail->flags. */
-		rte_mb();
-
-		/* Kick the guest if necessary. */
-		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
-				&& (vq->callfd >= 0))
-			eventfd_write(vq->callfd, (eventfd_t)1);
-	}
+	/* update used idx and kick the guest if necessary */
+	if (pkt_idx)
+		notify_guest(dev, vq);
 
 	return pkt_idx;
 }
 
-uint16_t
-rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
-	struct rte_mbuf **pkts, uint16_t count)
-{
-	struct virtio_net *dev = get_device(vid);
-
-	if (!dev)
-		return 0;
-
-	if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
-		return virtio_dev_merge_rx(dev, queue_id, pkts, count);
-	else
-		return virtio_dev_rx(dev, queue_id, pkts, count);
-}
-
 static void
 parse_ethernet(struct rte_mbuf *m, uint16_t *l4_proto, void **l4_hdr)
 {