[v2,4/7] vhost: translate iovas at vectors fill time

Message ID 20180623071127.22999-5-maxime.coquelin@redhat.com (mailing list archive)
State Superseded, archived
Delegated to: Maxime Coquelin
Series vhost: generalize buffer vectors

Checks

Context               Check     Description
ci/Intel-compilation  success   Compilation OK

Commit Message

Maxime Coquelin June 23, 2018, 7:11 a.m. UTC
This patch aims at simplifying the desc to mbuf and mbuf to desc
copy functions. It performs the iova to hva translations at
vector fill time.

Doing this, in case the desc buffer isn't contiguous in hva space,
it gets split into multiple vectors.

Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
 lib/librte_vhost/vhost.h      |   1 +
 lib/librte_vhost/virtio_net.c | 340 ++++++++++++++++++------------------------
 2 files changed, 144 insertions(+), 197 deletions(-)
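
To illustrate the behaviour described above (this is not part of the patch), here is a minimal standalone sketch of how a single descriptor whose iova range maps to non-contiguous hva chunks ends up as several buf_vector entries; the struct layout follows vhost.h, while translate() is only a stand-in for vhost_iova_to_vva():

#include <stdint.h>
#include <stdio.h>

struct buf_vector {
	uint64_t buf_iova;
	uint64_t buf_addr;
	uint32_t buf_len;
	uint32_t desc_idx;
};

/* Stand-in for vhost_iova_to_vva(): pretend host mappings are 2 KiB
 * aligned chunks, so a translation never crosses a 2 KiB boundary. */
static uint64_t translate(uint64_t iova, uint64_t *len)
{
	uint64_t chunk = 2048 - (iova % 2048);

	if (*len > chunk)
		*len = chunk;
	return 0x7f0000000000ULL + iova;	/* fake hva */
}

int main(void)
{
	struct buf_vector buf_vec[8];
	uint32_t vec_id = 0;
	uint64_t desc_iova = 0x1800;	/* descriptor guest address */
	uint64_t desc_avail = 3000;	/* descriptor length */

	/* Same idea as the loop added to fill_vec_buf(): one buf_vector
	 * entry per contiguous hva chunk of the descriptor. */
	while (desc_avail && vec_id < 8) {
		uint64_t chunk_len = desc_avail;

		buf_vec[vec_id].buf_addr = translate(desc_iova, &chunk_len);
		buf_vec[vec_id].buf_iova = desc_iova;
		buf_vec[vec_id].buf_len = (uint32_t)chunk_len;
		buf_vec[vec_id].desc_idx = 0;

		desc_avail -= chunk_len;
		desc_iova += chunk_len;
		vec_id++;
	}

	printf("descriptor split into %u vector(s)\n", vec_id);
	return 0;
}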
  

Comments

Tiwei Bie June 25, 2018, 2:21 a.m. UTC | #1
On Sat, Jun 23, 2018 at 09:11:24AM +0200, Maxime Coquelin wrote:
> This patch aims at simplifying the desc to mbuf and mbuf to desc
> copy functions. It performs the iova to hva translations at
> vectors fill time.
> 
> Doing this, in case desc buffer isn't contiguous in hva space,
> it gets split into multiple vectors.
> 
> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> ---
>  lib/librte_vhost/vhost.h      |   1 +
>  lib/librte_vhost/virtio_net.c | 340 ++++++++++++++++++------------------------
>  2 files changed, 144 insertions(+), 197 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 786a74f64..e3b2ed2ff 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -43,6 +43,7 @@
>   * from vring to do scatter RX.
>   */
>  struct buf_vector {
> +	uint64_t buf_iova;
>  	uint64_t buf_addr;
>  	uint32_t buf_len;
>  	uint32_t desc_idx;
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 4816e8003..1ab1edd67 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -225,12 +225,12 @@ static __rte_always_inline int
>  fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  			 uint32_t avail_idx, uint32_t *vec_idx,
>  			 struct buf_vector *buf_vec, uint16_t *desc_chain_head,
> -			 uint16_t *desc_chain_len)
> +			 uint16_t *desc_chain_len, uint8_t perm)
>  {
>  	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
>  	uint32_t vec_id = *vec_idx;
>  	uint32_t len    = 0;
> -	uint64_t dlen;
> +	uint64_t dlen, desc_avail, desc_iova;
>  	struct vring_desc *descs = vq->desc;
>  	struct vring_desc *idesc = NULL;
>  
> @@ -267,10 +267,31 @@ fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  		}
>  
>  		len += descs[idx].len;
> -		buf_vec[vec_id].buf_addr = descs[idx].addr;
> -		buf_vec[vec_id].buf_len  = descs[idx].len;
> -		buf_vec[vec_id].desc_idx = idx;
> -		vec_id++;
> +		desc_avail = descs[idx].len;
> +		desc_iova = descs[idx].addr;
> +
> +		while (desc_avail) {

We also need to check whether:

vec_id >= BUF_VECTOR_MAX
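
Something along those lines, perhaps (only a rough, self-contained sketch, not a proposed diff; BUF_VECTOR_MAX mirrors the size of buf_vec[] in vhost.h and the translation step is stubbed out):

#include <stdint.h>

#define BUF_VECTOR_MAX 256	/* size of buf_vec[] in vhost.h */

/* Sketch of the missing bound: stop splitting a descriptor into chunks
 * once vec_id would run past the end of buf_vec[]. */
static int fill_chunks_bounded(uint32_t *vec_id, uint64_t desc_avail)
{
	while (desc_avail) {
		uint64_t chunk_len = desc_avail > 4096 ? 4096 : desc_avail;

		if (*vec_id >= BUF_VECTOR_MAX)
			return -1;	/* buf_vec[] would overflow */

		/* translation and buf_vec[] fill would go here */
		desc_avail -= chunk_len;
		(*vec_id)++;
	}
	return 0;
}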

> +			uint64_t desc_addr;
> +			uint64_t desc_chunck_len = desc_avail;
> +
> +			desc_addr = vhost_iova_to_vva(dev, vq,
> +					desc_iova,
> +					&desc_chunck_len,
> +					perm);
> +			if (unlikely(!desc_addr)) {
> +				free_ind_table(idesc);
> +				return -1;
> +			}
> +
> +			buf_vec[vec_id].buf_iova = desc_iova;
> +			buf_vec[vec_id].buf_addr = desc_addr;
> +			buf_vec[vec_id].buf_len  = desc_chunck_len;
> +			buf_vec[vec_id].desc_idx = idx;
> +
> +			desc_avail -= desc_chunck_len;
> +			desc_iova += desc_chunck_len;
> +			vec_id++;
> +		}
>  
>  		if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
>  			break;
> @@ -293,7 +314,8 @@ fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  static inline int
>  reserve_avail_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  				uint32_t size, struct buf_vector *buf_vec,
> -				uint16_t *num_buffers, uint16_t avail_head)
> +				uint16_t *num_buffers, uint16_t avail_head,
> +				uint16_t *nr_vec)
>  {
>  	uint16_t cur_idx;
>  	uint32_t vec_idx = 0;
> @@ -315,7 +337,8 @@ reserve_avail_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  			return -1;
>  
>  		if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec,
> -						&head_idx, &len) < 0))
> +						&head_idx, &len,
> +						VHOST_ACCESS_RO) < 0))

reserve_avail_buf() is called by virtio_dev_rx(),
so the write perm is needed.

>  			return -1;
>  		len = RTE_MIN(len, size);
>  		update_shadow_used_ring(vq, head_idx, len);
> @@ -334,21 +357,22 @@ reserve_avail_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  			return -1;
>  	}
>  
> +	*nr_vec = vec_idx;
> +
>  	return 0;
>  }
[...]
> @@ -455,18 +454,12 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  				uint64_t len;
>  				uint64_t remain = dev->vhost_hlen;
>  				uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
> -				uint64_t guest_addr = hdr_phys_addr;
> +				uint64_t iova = buf_vec[0].buf_iova;
> +				uint16_t hdr_vec_idx = 0;
>  
>  				while (remain) {
>  					len = remain;
> -					dst = vhost_iova_to_vva(dev, vq,
> -							guest_addr, &len,
> -							VHOST_ACCESS_RW);
> -					if (unlikely(!dst || !len)) {
> -						error = -1;
> -						goto out;
> -					}
> -
> +					dst =  buf_vec[hdr_vec_idx].buf_addr;

There is no need to have two ' ' after '='.

>  					rte_memcpy((void *)(uintptr_t)dst,
>  							(void *)(uintptr_t)src,
>  							len);
[...]
>  
>  		/*
> @@ -1175,7 +1120,8 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>  		if (unlikely(fill_vec_buf(dev, vq,
>  						vq->last_avail_idx + i,
>  						&nr_vec, buf_vec,
> -						&head_idx, &dummy_len) < 0))
> +						&head_idx, &dummy_len,
> +						VHOST_ACCESS_RW) < 0))

This is the dequeue path, so _RO should be used.

>  			break;
>  
>  		if (likely(dev->dequeue_zero_copy == 0))
> -- 
> 2.14.4
>
  
Maxime Coquelin June 25, 2018, 7:19 a.m. UTC | #2
On 06/25/2018 04:21 AM, Tiwei Bie wrote:
> On Sat, Jun 23, 2018 at 09:11:24AM +0200, Maxime Coquelin wrote:
>> This patch aims at simplifying the desc to mbuf and mbuf to desc
>> copy functions. It performs the iova to hva translations at
>> vectors fill time.
>>
>> Doing this, in case desc buffer isn't contiguous in hva space,
>> it gets split into multiple vectors.
>>
>> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>> ---
>>   lib/librte_vhost/vhost.h      |   1 +
>>   lib/librte_vhost/virtio_net.c | 340 ++++++++++++++++++------------------------
>>   2 files changed, 144 insertions(+), 197 deletions(-)
>>
>> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
>> index 786a74f64..e3b2ed2ff 100644
>> --- a/lib/librte_vhost/vhost.h
>> +++ b/lib/librte_vhost/vhost.h
>> @@ -43,6 +43,7 @@
>>    * from vring to do scatter RX.
>>    */
>>   struct buf_vector {
>> +	uint64_t buf_iova;
>>   	uint64_t buf_addr;
>>   	uint32_t buf_len;
>>   	uint32_t desc_idx;
>> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
>> index 4816e8003..1ab1edd67 100644
>> --- a/lib/librte_vhost/virtio_net.c
>> +++ b/lib/librte_vhost/virtio_net.c
>> @@ -225,12 +225,12 @@ static __rte_always_inline int
>>   fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>   			 uint32_t avail_idx, uint32_t *vec_idx,
>>   			 struct buf_vector *buf_vec, uint16_t *desc_chain_head,
>> -			 uint16_t *desc_chain_len)
>> +			 uint16_t *desc_chain_len, uint8_t perm)
>>   {
>>   	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
>>   	uint32_t vec_id = *vec_idx;
>>   	uint32_t len    = 0;
>> -	uint64_t dlen;
>> +	uint64_t dlen, desc_avail, desc_iova;
>>   	struct vring_desc *descs = vq->desc;
>>   	struct vring_desc *idesc = NULL;
>>   
>> @@ -267,10 +267,31 @@ fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>   		}
>>   
>>   		len += descs[idx].len;
>> -		buf_vec[vec_id].buf_addr = descs[idx].addr;
>> -		buf_vec[vec_id].buf_len  = descs[idx].len;
>> -		buf_vec[vec_id].desc_idx = idx;
>> -		vec_id++;
>> +		desc_avail = descs[idx].len;
>> +		desc_iova = descs[idx].addr;
>> +
>> +		while (desc_avail) {
> 
> We also need to check whether:
> 
> vec_id >= BUF_VECTOR_MAX

Right.

>> +			uint64_t desc_addr;
>> +			uint64_t desc_chunck_len = desc_avail;
>> +
>> +			desc_addr = vhost_iova_to_vva(dev, vq,
>> +					desc_iova,
>> +					&desc_chunck_len,
>> +					perm);
>> +			if (unlikely(!desc_addr)) {
>> +				free_ind_table(idesc);
>> +				return -1;
>> +			}
>> +
>> +			buf_vec[vec_id].buf_iova = desc_iova;
>> +			buf_vec[vec_id].buf_addr = desc_addr;
>> +			buf_vec[vec_id].buf_len  = desc_chunck_len;
>> +			buf_vec[vec_id].desc_idx = idx;
>> +
>> +			desc_avail -= desc_chunck_len;
>> +			desc_iova += desc_chunck_len;
>> +			vec_id++;
>> +		}
>>   
>>   		if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
>>   			break;
>> @@ -293,7 +314,8 @@ fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>   static inline int
>>   reserve_avail_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>   				uint32_t size, struct buf_vector *buf_vec,
>> -				uint16_t *num_buffers, uint16_t avail_head)
>> +				uint16_t *num_buffers, uint16_t avail_head,
>> +				uint16_t *nr_vec)
>>   {
>>   	uint16_t cur_idx;
>>   	uint32_t vec_idx = 0;
>> @@ -315,7 +337,8 @@ reserve_avail_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>   			return -1;
>>   
>>   		if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec,
>> -						&head_idx, &len) < 0))
>> +						&head_idx, &len,
>> +						VHOST_ACCESS_RO) < 0))
> 
> reserve_avail_buf() is called by virtio_dev_rx(),
> so the write perm is needed.
Right.

To avoid having to pass the perms, I wonder if it wouldn't be better to
rely on the descriptors' VRING_DESC_F_WRITE flag.
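
Roughly something like this (just a sketch of the idea, assuming the VHOST_ACCESS_* values from linux/vhost.h and VRING_DESC_F_WRITE from linux/virtio_ring.h):

#include <stdint.h>
#include <linux/virtio_ring.h>	/* VRING_DESC_F_WRITE */

#define VHOST_ACCESS_RO 0x1	/* as in struct vhost_iotlb_msg */
#define VHOST_ACCESS_RW 0x3

/* Sketch: derive the access permission from the descriptor itself
 * instead of passing it down from the caller. Device-writable
 * descriptors (RX path) would ask for RW, the others for RO. */
static inline uint8_t desc_to_perm(uint16_t desc_flags)
{
	return (desc_flags & VRING_DESC_F_WRITE) ?
			VHOST_ACCESS_RW : VHOST_ACCESS_RO;
}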

>>   			return -1;
>>   		len = RTE_MIN(len, size);
>>   		update_shadow_used_ring(vq, head_idx, len);
>> @@ -334,21 +357,22 @@ reserve_avail_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>   			return -1;
>>   	}
>>   
>> +	*nr_vec = vec_idx;
>> +
>>   	return 0;
>>   }
> [...]
>> @@ -455,18 +454,12 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>   				uint64_t len;
>>   				uint64_t remain = dev->vhost_hlen;
>>   				uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
>> -				uint64_t guest_addr = hdr_phys_addr;
>> +				uint64_t iova = buf_vec[0].buf_iova;
>> +				uint16_t hdr_vec_idx = 0;
>>   
>>   				while (remain) {
>>   					len = remain;
>> -					dst = vhost_iova_to_vva(dev, vq,
>> -							guest_addr, &len,
>> -							VHOST_ACCESS_RW);
>> -					if (unlikely(!dst || !len)) {
>> -						error = -1;
>> -						goto out;
>> -					}
>> -
>> +					dst =  buf_vec[hdr_vec_idx].buf_addr;
> 
> There is no need to have two ' ' after '='.

Agree.

>>   					rte_memcpy((void *)(uintptr_t)dst,
>>   							(void *)(uintptr_t)src,
>>   							len);
> [...]
>>   
>>   		/*
>> @@ -1175,7 +1120,8 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>>   		if (unlikely(fill_vec_buf(dev, vq,
>>   						vq->last_avail_idx + i,
>>   						&nr_vec, buf_vec,
>> -						&head_idx, &dummy_len) < 0))
>> +						&head_idx, &dummy_len,
>> +						VHOST_ACCESS_RW) < 0))
> 
> This is the dequeue path, so _RO should be used.

Right.

>>   			break;
>>   
>>   		if (likely(dev->dequeue_zero_copy == 0))
>> -- 
>> 2.14.4
>>

Thanks for the review,
Maxime
  
Tiwei Bie June 25, 2018, 10:31 a.m. UTC | #3
On Mon, Jun 25, 2018 at 09:19:34AM +0200, Maxime Coquelin wrote:
> On 06/25/2018 04:21 AM, Tiwei Bie wrote:
> > On Sat, Jun 23, 2018 at 09:11:24AM +0200, Maxime Coquelin wrote:
[...]
> > > @@ -293,7 +314,8 @@ fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > >   static inline int
> > >   reserve_avail_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > >   				uint32_t size, struct buf_vector *buf_vec,
> > > -				uint16_t *num_buffers, uint16_t avail_head)
> > > +				uint16_t *num_buffers, uint16_t avail_head,
> > > +				uint16_t *nr_vec)
> > >   {
> > >   	uint16_t cur_idx;
> > >   	uint32_t vec_idx = 0;
> > > @@ -315,7 +337,8 @@ reserve_avail_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
> > >   			return -1;
> > >   		if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec,
> > > -						&head_idx, &len) < 0))
> > > +						&head_idx, &len,
> > > +						VHOST_ACCESS_RO) < 0))
> > 
> > reserve_avail_buf() is called by virtio_dev_rx(),
> > so the write perm is needed.
> Right.
> 
> To avoid having to pass the perms, I wonder if it wouldn't be better to
> rely on the descriptors' VRING_DESC_F_WRITE flag.
> 

Currently, DPDK vhost net doesn't check this flag,
so it could cause problems in some cases. If we
want to rely on this flag, I think we still need
to pass something similar to tell fill_vec_buf()
whether the bufs will be written or read, so the
flag can be checked.
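
For illustration, a hypothetical sketch of such a check (keeping the perm argument and validating the descriptor flag against it; same VHOST_ACCESS_*/VRING_DESC_F_WRITE values as above):

#include <stdint.h>
#include <linux/virtio_ring.h>	/* VRING_DESC_F_WRITE */

#define VHOST_ACCESS_RO 0x1	/* as in struct vhost_iotlb_msg */

/* Hypothetical: return non-zero when the descriptor's writability is
 * compatible with what the caller intends to do with the buffer. */
static inline int desc_perm_ok(uint16_t desc_flags, uint8_t perm)
{
	int writable = !!(desc_flags & VRING_DESC_F_WRITE);

	if (perm == VHOST_ACCESS_RO)	/* dequeue: host only reads */
		return !writable;
	return writable;		/* enqueue: host writes */
}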

Best regards,
Tiwei Bie
  
Maxime Coquelin June 27, 2018, 8:38 a.m. UTC | #4
On 06/25/2018 12:31 PM, Tiwei Bie wrote:
> On Mon, Jun 25, 2018 at 09:19:34AM +0200, Maxime Coquelin wrote:
>> On 06/25/2018 04:21 AM, Tiwei Bie wrote:
>>> On Sat, Jun 23, 2018 at 09:11:24AM +0200, Maxime Coquelin wrote:
> [...]
>>>> @@ -293,7 +314,8 @@ fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>>>    static inline int
>>>>    reserve_avail_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>>>    				uint32_t size, struct buf_vector *buf_vec,
>>>> -				uint16_t *num_buffers, uint16_t avail_head)
>>>> +				uint16_t *num_buffers, uint16_t avail_head,
>>>> +				uint16_t *nr_vec)
>>>>    {
>>>>    	uint16_t cur_idx;
>>>>    	uint32_t vec_idx = 0;
>>>> @@ -315,7 +337,8 @@ reserve_avail_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>>>    			return -1;
>>>>    		if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec,
>>>> -						&head_idx, &len) < 0))
>>>> +						&head_idx, &len,
>>>> +						VHOST_ACCESS_RO) < 0))
>>>
>>> reserve_avail_buf() is called by virtio_dev_rx(),
>>> so the write perm is needed.
>> Right.
>>
>> To avoid having to pass the perms, I wonder if it wouldn't be better to
>> rely on the descriptors' VRING_DESC_F_WRITE flag.
>>
> 
> Currently, DPDK vhost net doesn't check this flag,
> so it could cause problems in some cases. If we
> want to rely on this flag, I think we still need
> to pass something similar to tell fill_vec_buf()
> whether the bufs will be written or read, so the
> flag can be checked.

Right, let's keep the perm parameter for now.

Thanks,
Maxime

> Best regards,
> Tiwei Bie
>
  

Patch

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 786a74f64..e3b2ed2ff 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -43,6 +43,7 @@ 
  * from vring to do scatter RX.
  */
 struct buf_vector {
+	uint64_t buf_iova;
 	uint64_t buf_addr;
 	uint32_t buf_len;
 	uint32_t desc_idx;
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 4816e8003..1ab1edd67 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -225,12 +225,12 @@  static __rte_always_inline int
 fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			 uint32_t avail_idx, uint32_t *vec_idx,
 			 struct buf_vector *buf_vec, uint16_t *desc_chain_head,
-			 uint16_t *desc_chain_len)
+			 uint16_t *desc_chain_len, uint8_t perm)
 {
 	uint16_t idx = vq->avail->ring[avail_idx & (vq->size - 1)];
 	uint32_t vec_id = *vec_idx;
 	uint32_t len    = 0;
-	uint64_t dlen;
+	uint64_t dlen, desc_avail, desc_iova;
 	struct vring_desc *descs = vq->desc;
 	struct vring_desc *idesc = NULL;
 
@@ -267,10 +267,31 @@  fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		}
 
 		len += descs[idx].len;
-		buf_vec[vec_id].buf_addr = descs[idx].addr;
-		buf_vec[vec_id].buf_len  = descs[idx].len;
-		buf_vec[vec_id].desc_idx = idx;
-		vec_id++;
+		desc_avail = descs[idx].len;
+		desc_iova = descs[idx].addr;
+
+		while (desc_avail) {
+			uint64_t desc_addr;
+			uint64_t desc_chunck_len = desc_avail;
+
+			desc_addr = vhost_iova_to_vva(dev, vq,
+					desc_iova,
+					&desc_chunck_len,
+					perm);
+			if (unlikely(!desc_addr)) {
+				free_ind_table(idesc);
+				return -1;
+			}
+
+			buf_vec[vec_id].buf_iova = desc_iova;
+			buf_vec[vec_id].buf_addr = desc_addr;
+			buf_vec[vec_id].buf_len  = desc_chunck_len;
+			buf_vec[vec_id].desc_idx = idx;
+
+			desc_avail -= desc_chunck_len;
+			desc_iova += desc_chunck_len;
+			vec_id++;
+		}
 
 		if ((descs[idx].flags & VRING_DESC_F_NEXT) == 0)
 			break;
@@ -293,7 +314,8 @@  fill_vec_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 static inline int
 reserve_avail_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 				uint32_t size, struct buf_vector *buf_vec,
-				uint16_t *num_buffers, uint16_t avail_head)
+				uint16_t *num_buffers, uint16_t avail_head,
+				uint16_t *nr_vec)
 {
 	uint16_t cur_idx;
 	uint32_t vec_idx = 0;
@@ -315,7 +337,8 @@  reserve_avail_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			return -1;
 
 		if (unlikely(fill_vec_buf(dev, vq, cur_idx, &vec_idx, buf_vec,
-						&head_idx, &len) < 0))
+						&head_idx, &len,
+						VHOST_ACCESS_RO) < 0))
 			return -1;
 		len = RTE_MIN(len, size);
 		update_shadow_used_ring(vq, head_idx, len);
@@ -334,21 +357,22 @@  reserve_avail_buf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			return -1;
 	}
 
+	*nr_vec = vec_idx;
+
 	return 0;
 }
 
 static __rte_always_inline int
 copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			    struct rte_mbuf *m, struct buf_vector *buf_vec,
-			    uint16_t num_buffers)
+			    uint16_t nr_vec, uint16_t num_buffers)
 {
 	uint32_t vec_idx = 0;
-	uint64_t desc_addr, desc_gaddr;
 	uint32_t mbuf_offset, mbuf_avail;
-	uint32_t desc_offset, desc_avail;
+	uint32_t buf_offset, buf_avail;
+	uint64_t buf_addr, buf_iova, buf_len;
 	uint32_t cpy_len;
-	uint64_t desc_chunck_len;
-	uint64_t hdr_addr, hdr_phys_addr;
+	uint64_t hdr_addr;
 	struct rte_mbuf *hdr_mbuf;
 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
 	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
@@ -359,82 +383,57 @@  copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		goto out;
 	}
 
-	desc_chunck_len = buf_vec[vec_idx].buf_len;
-	desc_gaddr = buf_vec[vec_idx].buf_addr;
-	desc_addr = vhost_iova_to_vva(dev, vq,
-					desc_gaddr,
-					&desc_chunck_len,
-					VHOST_ACCESS_RW);
-	if (buf_vec[vec_idx].buf_len < dev->vhost_hlen || !desc_addr) {
+	buf_addr = buf_vec[vec_idx].buf_addr;
+	buf_iova = buf_vec[vec_idx].buf_iova;
+	buf_len = buf_vec[vec_idx].buf_len;
+
+	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
 		error = -1;
 		goto out;
 	}
 
 	hdr_mbuf = m;
-	hdr_addr = desc_addr;
-	if (unlikely(desc_chunck_len < dev->vhost_hlen))
+	hdr_addr = buf_addr;
+	if (unlikely(buf_len < dev->vhost_hlen))
 		hdr = &tmp_hdr;
 	else
 		hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)hdr_addr;
-	hdr_phys_addr = desc_gaddr;
 	rte_prefetch0((void *)(uintptr_t)hdr_addr);
 
 	VHOST_LOG_DEBUG(VHOST_DATA, "(%d) RX: num merge buffers %d\n",
 		dev->vid, num_buffers);
 
-	desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
-	if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
-		desc_chunck_len = desc_avail;
-		desc_gaddr += dev->vhost_hlen;
-		desc_addr = vhost_iova_to_vva(dev, vq,
-				desc_gaddr,
-				&desc_chunck_len,
-				VHOST_ACCESS_RW);
-		if (unlikely(!desc_addr)) {
-			error = -1;
-			goto out;
-		}
-
-		desc_offset = 0;
+	if (unlikely(buf_len < dev->vhost_hlen)) {
+		buf_offset = dev->vhost_hlen - buf_len;
+		vec_idx++;
+		buf_addr = buf_vec[vec_idx].buf_addr;
+		buf_iova = buf_vec[vec_idx].buf_iova;
+		buf_len = buf_vec[vec_idx].buf_len;
+		buf_avail = buf_len - buf_offset;
 	} else {
-		desc_offset = dev->vhost_hlen;
-		desc_chunck_len -= dev->vhost_hlen;
+		buf_offset = dev->vhost_hlen;
+		buf_avail = buf_len - dev->vhost_hlen;
 	}
 
-
 	mbuf_avail  = rte_pktmbuf_data_len(m);
 	mbuf_offset = 0;
 	while (mbuf_avail != 0 || m->next != NULL) {
-		/* done with current desc buf, get the next one */
-		if (desc_avail == 0) {
+		/* done with current buf, get the next one */
+		if (buf_avail == 0) {
 			vec_idx++;
-			desc_chunck_len = buf_vec[vec_idx].buf_len;
-			desc_gaddr = buf_vec[vec_idx].buf_addr;
-			desc_addr =
-				vhost_iova_to_vva(dev, vq,
-					desc_gaddr,
-					&desc_chunck_len,
-					VHOST_ACCESS_RW);
-			if (unlikely(!desc_addr)) {
+			if (unlikely(vec_idx >= nr_vec)) {
 				error = -1;
 				goto out;
 			}
 
+			buf_addr = buf_vec[vec_idx].buf_addr;
+			buf_iova = buf_vec[vec_idx].buf_iova;
+			buf_len = buf_vec[vec_idx].buf_len;
+
 			/* Prefetch buffer address. */
-			rte_prefetch0((void *)(uintptr_t)desc_addr);
-			desc_offset = 0;
-			desc_avail  = buf_vec[vec_idx].buf_len;
-		} else if (unlikely(desc_chunck_len == 0)) {
-			desc_chunck_len = desc_avail;
-			desc_gaddr += desc_offset;
-			desc_addr = vhost_iova_to_vva(dev, vq,
-					desc_gaddr,
-					&desc_chunck_len, VHOST_ACCESS_RW);
-			if (unlikely(!desc_addr)) {
-				error = -1;
-				goto out;
-			}
-			desc_offset = 0;
+			rte_prefetch0((void *)(uintptr_t)buf_addr);
+			buf_offset = 0;
+			buf_avail  = buf_len;
 		}
 
 		/* done with current mbuf, get the next one */
@@ -455,18 +454,12 @@  copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 				uint64_t len;
 				uint64_t remain = dev->vhost_hlen;
 				uint64_t src = (uint64_t)(uintptr_t)hdr, dst;
-				uint64_t guest_addr = hdr_phys_addr;
+				uint64_t iova = buf_vec[0].buf_iova;
+				uint16_t hdr_vec_idx = 0;
 
 				while (remain) {
 					len = remain;
-					dst = vhost_iova_to_vva(dev, vq,
-							guest_addr, &len,
-							VHOST_ACCESS_RW);
-					if (unlikely(!dst || !len)) {
-						error = -1;
-						goto out;
-					}
-
+					dst =  buf_vec[hdr_vec_idx].buf_addr;
 					rte_memcpy((void *)(uintptr_t)dst,
 							(void *)(uintptr_t)src,
 							len);
@@ -474,50 +467,50 @@  copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 					PRINT_PACKET(dev, (uintptr_t)dst,
 							(uint32_t)len, 0);
 					vhost_log_cache_write(dev, vq,
-							guest_addr, len);
+							iova, len);
 
 					remain -= len;
-					guest_addr += len;
+					iova += len;
 					src += len;
+					hdr_vec_idx++;
 				}
 			} else {
 				PRINT_PACKET(dev, (uintptr_t)hdr_addr,
 						dev->vhost_hlen, 0);
-				vhost_log_cache_write(dev, vq, hdr_phys_addr,
+				vhost_log_cache_write(dev, vq,
+						buf_vec[0].buf_iova,
 						dev->vhost_hlen);
 			}
 
 			hdr_addr = 0;
 		}
 
-		cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
+		cpy_len = RTE_MIN(buf_len, mbuf_avail);
 
 		if (likely(cpy_len > MAX_BATCH_LEN ||
 					vq->batch_copy_nb_elems >= vq->size)) {
-			rte_memcpy((void *)((uintptr_t)(desc_addr +
-							desc_offset)),
+			rte_memcpy((void *)((uintptr_t)(buf_addr + buf_offset)),
 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
 				cpy_len);
-			vhost_log_cache_write(dev, vq, desc_gaddr + desc_offset,
+			vhost_log_cache_write(dev, vq, buf_iova + buf_offset,
 					cpy_len);
-			PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
+			PRINT_PACKET(dev, (uintptr_t)(buf_addr + buf_offset),
 				cpy_len, 0);
 		} else {
 			batch_copy[vq->batch_copy_nb_elems].dst =
-				(void *)((uintptr_t)(desc_addr + desc_offset));
+				(void *)((uintptr_t)(buf_addr + buf_offset));
 			batch_copy[vq->batch_copy_nb_elems].src =
 				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset);
 			batch_copy[vq->batch_copy_nb_elems].log_addr =
-				desc_gaddr + desc_offset;
+				buf_iova + buf_offset;
 			batch_copy[vq->batch_copy_nb_elems].len = cpy_len;
 			vq->batch_copy_nb_elems++;
 		}
 
 		mbuf_avail  -= cpy_len;
 		mbuf_offset += cpy_len;
-		desc_avail  -= cpy_len;
-		desc_offset += cpy_len;
-		desc_chunck_len -= cpy_len;
+		buf_avail  -= cpy_len;
+		buf_offset += cpy_len;
 	}
 
 out:
@@ -568,10 +561,11 @@  virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 	avail_head = *((volatile uint16_t *)&vq->avail->idx);
 	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
 		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
+		uint16_t nr_vec = 0;
 
 		if (unlikely(reserve_avail_buf(dev, vq,
 						pkt_len, buf_vec, &num_buffers,
-						avail_head) < 0)) {
+						avail_head, &nr_vec) < 0)) {
 			VHOST_LOG_DEBUG(VHOST_DATA,
 				"(%d) failed to get enough desc from vring\n",
 				dev->vid);
@@ -584,7 +578,8 @@  virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 			vq->last_avail_idx + num_buffers);
 
 		if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx],
-						buf_vec, num_buffers) < 0) {
+						buf_vec, nr_vec,
+						num_buffers) < 0) {
 			vq->shadow_used_idx -= num_buffers;
 			break;
 		}
@@ -753,11 +748,10 @@  copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		  struct buf_vector *buf_vec, uint16_t nr_vec,
 		  struct rte_mbuf *m, struct rte_mempool *mbuf_pool)
 {
-	uint64_t desc_addr, desc_gaddr;
-	uint32_t desc_avail, desc_offset;
+	uint32_t buf_avail, buf_offset;
+	uint64_t buf_addr, buf_iova, buf_len;
 	uint32_t mbuf_avail, mbuf_offset;
 	uint32_t cpy_len;
-	uint64_t desc_chunck_len;
 	struct rte_mbuf *cur = m, *prev = m;
 	struct virtio_net_hdr tmp_hdr;
 	struct virtio_net_hdr *hdr = NULL;
@@ -766,25 +760,25 @@  copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
 	int error = 0;
 
-	desc_chunck_len = buf_vec[vec_idx].buf_len;
-	desc_gaddr = buf_vec[vec_idx].buf_addr;
-	desc_addr = vhost_iova_to_vva(dev,
-					vq, desc_gaddr,
-					&desc_chunck_len,
-					VHOST_ACCESS_RO);
-	if (unlikely(buf_vec[vec_idx].buf_len < dev->vhost_hlen ||
-				!desc_addr)) {
+	buf_addr = buf_vec[vec_idx].buf_addr;
+	buf_iova = buf_vec[vec_idx].buf_iova;
+	buf_len = buf_vec[vec_idx].buf_len;
+
+	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
 		error = -1;
 		goto out;
 	}
 
+	if (likely(nr_vec > 1))
+		rte_prefetch0((void *)(uintptr_t)buf_vec[1].buf_addr);
+
 	if (virtio_net_with_host_offload(dev)) {
-		if (unlikely(desc_chunck_len < sizeof(struct virtio_net_hdr))) {
-			uint64_t len = desc_chunck_len;
+		if (unlikely(buf_len < sizeof(struct virtio_net_hdr))) {
+			uint64_t len;
 			uint64_t remain = sizeof(struct virtio_net_hdr);
-			uint64_t src = desc_addr;
+			uint64_t src;
 			uint64_t dst = (uint64_t)(uintptr_t)&tmp_hdr;
-			uint64_t guest_addr = desc_gaddr;
+			uint16_t hdr_vec_idx = 0;
 
 			/*
 			 * No luck, the virtio-net header doesn't fit
@@ -792,25 +786,18 @@  copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			 */
 			while (remain) {
 				len = remain;
-				src = vhost_iova_to_vva(dev, vq,
-						guest_addr, &len,
-						VHOST_ACCESS_RO);
-				if (unlikely(!src || !len)) {
-					error = -1;
-					goto out;
-				}
-
+				src = buf_vec[hdr_vec_idx].buf_addr;
 				rte_memcpy((void *)(uintptr_t)dst,
 						   (void *)(uintptr_t)src, len);
 
-				guest_addr += len;
 				remain -= len;
 				dst += len;
+				hdr_vec_idx++;
 			}
 
 			hdr = &tmp_hdr;
 		} else {
-			hdr = (struct virtio_net_hdr *)((uintptr_t)desc_addr);
+			hdr = (struct virtio_net_hdr *)((uintptr_t)buf_addr);
 			rte_prefetch0(hdr);
 		}
 	}
@@ -820,68 +807,51 @@  copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	 * for Tx: the first for storing the header, and others
 	 * for storing the data.
 	 */
-	if (likely(buf_vec[vec_idx].buf_len == dev->vhost_hlen)) {
+	if (unlikely(buf_len < dev->vhost_hlen)) {
+		buf_offset = dev->vhost_hlen - buf_len;
+		vec_idx++;
+		buf_addr = buf_vec[vec_idx].buf_addr;
+		buf_iova = buf_vec[vec_idx].buf_iova;
+		buf_len = buf_vec[vec_idx].buf_len;
+		buf_avail  = buf_len - buf_offset;
+	} else if (buf_len == dev->vhost_hlen) {
 		if (unlikely(++vec_idx >= nr_vec))
 			goto out;
+		buf_addr = buf_vec[vec_idx].buf_addr;
+		buf_iova = buf_vec[vec_idx].buf_iova;
+		buf_len = buf_vec[vec_idx].buf_len;
 
-		desc_chunck_len = buf_vec[vec_idx].buf_len;
-		desc_gaddr = buf_vec[vec_idx].buf_addr;
-		desc_addr = vhost_iova_to_vva(dev,
-							vq, desc_gaddr,
-							&desc_chunck_len,
-							VHOST_ACCESS_RO);
-		if (unlikely(!desc_addr)) {
-			error = -1;
-			goto out;
-		}
-
-		desc_offset = 0;
-		desc_avail  = buf_vec[vec_idx].buf_len;
+		buf_offset = 0;
+		buf_avail  = buf_len;
 	} else {
-		desc_avail  = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
-
-		if (unlikely(desc_chunck_len < dev->vhost_hlen)) {
-			desc_chunck_len = desc_avail;
-			desc_gaddr += dev->vhost_hlen;
-			desc_addr = vhost_iova_to_vva(dev,
-					vq, desc_gaddr,
-					&desc_chunck_len,
-					VHOST_ACCESS_RO);
-			if (unlikely(!desc_addr)) {
-				error = -1;
-				goto out;
-			}
-
-			desc_offset = 0;
-		} else {
-			desc_offset = dev->vhost_hlen;
-			desc_chunck_len -= dev->vhost_hlen;
-		}
+		buf_offset = dev->vhost_hlen;
+		buf_avail = buf_vec[vec_idx].buf_len - dev->vhost_hlen;
 	}
 
-	rte_prefetch0((void *)(uintptr_t)(desc_addr + desc_offset));
+	rte_prefetch0((void *)(uintptr_t)
+			(buf_addr + buf_offset));
 
-	PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
-			(uint32_t)desc_chunck_len, 0);
+	PRINT_PACKET(dev,
+			(uintptr_t)(buf_addr + buf_offset),
+			(uint32_t)buf_avail, 0);
 
 	mbuf_offset = 0;
 	mbuf_avail  = m->buf_len - RTE_PKTMBUF_HEADROOM;
 	while (1) {
 		uint64_t hpa;
 
-		cpy_len = RTE_MIN(desc_chunck_len, mbuf_avail);
+		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
 
 		if (unlikely(dev->dequeue_zero_copy)) {
-			hpa = gpa_to_hpa(dev,
-					desc_gaddr + desc_offset, cpy_len);
+			hpa = gpa_to_hpa(dev, buf_iova + buf_offset, cpy_len);
 			if (unlikely(!hpa)) {
 				error = -1;
 				goto out;
 			}
 			cur->data_len = cpy_len;
 			cur->data_off = 0;
-			cur->buf_addr = (void *)(uintptr_t)(desc_addr
-				+ desc_offset);
+			cur->buf_addr =
+				(void *)(uintptr_t)(buf_addr + buf_offset);
 			cur->buf_iova = hpa;
 
 			/*
@@ -892,21 +862,19 @@  copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 		} else {
 			if (likely(cpy_len > MAX_BATCH_LEN ||
 				   vq->batch_copy_nb_elems >= vq->size ||
-				   (hdr && cur == m) ||
-				   buf_vec[vec_idx].buf_len !=
-						desc_chunck_len)) {
+				   (hdr && cur == m))) {
 				rte_memcpy(rte_pktmbuf_mtod_offset(cur, void *,
 								   mbuf_offset),
-					   (void *)((uintptr_t)(desc_addr +
-								desc_offset)),
+					   (void *)((uintptr_t)(buf_addr +
+							   buf_offset)),
 					   cpy_len);
 			} else {
 				batch_copy[vq->batch_copy_nb_elems].dst =
 					rte_pktmbuf_mtod_offset(cur, void *,
 								mbuf_offset);
 				batch_copy[vq->batch_copy_nb_elems].src =
-					(void *)((uintptr_t)(desc_addr +
-							     desc_offset));
+					(void *)((uintptr_t)(buf_addr +
+								buf_offset));
 				batch_copy[vq->batch_copy_nb_elems].len =
 					cpy_len;
 				vq->batch_copy_nb_elems++;
@@ -915,48 +883,25 @@  copy_desc_to_mbuf(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 		mbuf_avail  -= cpy_len;
 		mbuf_offset += cpy_len;
-		desc_avail  -= cpy_len;
-		desc_chunck_len -= cpy_len;
-		desc_offset += cpy_len;
+		buf_avail  -= cpy_len;
+		buf_offset += cpy_len;
 
-		/* This desc reaches to its end, get the next one */
-		if (desc_avail == 0) {
+		/* This buf reaches to its end, get the next one */
+		if (buf_avail == 0) {
 			if (++vec_idx >= nr_vec)
 				break;
 
-			desc_chunck_len = buf_vec[vec_idx].buf_len;
-			desc_gaddr = buf_vec[vec_idx].buf_addr;
-			desc_addr = vhost_iova_to_vva(dev,
-							vq, desc_gaddr,
-							&desc_chunck_len,
-							VHOST_ACCESS_RO);
-			if (unlikely(!desc_addr)) {
-				error = -1;
-				goto out;
-			}
-
-			rte_prefetch0((void *)(uintptr_t)desc_addr);
+			buf_addr = buf_vec[vec_idx].buf_addr;
+			buf_iova = buf_vec[vec_idx].buf_iova;
+			buf_len = buf_vec[vec_idx].buf_len;
 
-			desc_offset = 0;
-			desc_avail  = buf_vec[vec_idx].buf_len;
+			rte_prefetch0((void *)(uintptr_t)buf_addr);
 
-			PRINT_PACKET(dev, (uintptr_t)desc_addr,
-					(uint32_t)desc_chunck_len, 0);
-		} else if (unlikely(desc_chunck_len == 0)) {
-			desc_chunck_len = desc_avail;
-			desc_gaddr += desc_offset;
-			desc_addr = vhost_iova_to_vva(dev, vq,
-					desc_gaddr,
-					&desc_chunck_len,
-					VHOST_ACCESS_RO);
-			if (unlikely(!desc_addr)) {
-				error = -1;
-				goto out;
-			}
-			desc_offset = 0;
+			buf_offset = 0;
+			buf_avail  = buf_len;
 
-			PRINT_PACKET(dev, (uintptr_t)desc_addr,
-					(uint32_t)desc_chunck_len, 0);
+			PRINT_PACKET(dev, (uintptr_t)buf_addr,
+					(uint32_t)buf_avail, 0);
 		}
 
 		/*
@@ -1175,7 +1120,8 @@  rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 		if (unlikely(fill_vec_buf(dev, vq,
 						vq->last_avail_idx + i,
 						&nr_vec, buf_vec,
-						&head_idx, &dummy_len) < 0))
+						&head_idx, &dummy_len,
+						VHOST_ACCESS_RW) < 0))
 			break;
 
 		if (likely(dev->dequeue_zero_copy == 0))