[v4] vhost: fix async copy fail on multi-page buffers

Message ID 20200728032851.3704083-1-patrick.fu@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Maxime Coquelin
Series [v4] vhost: fix async copy fail on multi-page buffers

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/Intel-compilation success Compilation OK
ci/iol-testing success Testing PASS
ci/travis-robot success Travis build: errored
ci/iol-mellanox-Performance success Performance Testing PASS

Commit Message

Patrick Fu July 28, 2020, 3:28 a.m. UTC
  From: Patrick Fu <patrick.fu@intel.com>

Async copy fails when a single ring buffer vector is split across multiple
physical pages. This happens because the current hpa address translation
function doesn't handle multi-page buffers. This patch implements a new gpa
to hpa address conversion function, which returns the hpa of the first host
page hit. The async data path calls this new function in a loop to construct
a multi-segment async copy descriptor for ring buffers that cross physical
page boundaries.

Fixes: cd6760da1076 ("vhost: introduce async enqueue for split ring")

Signed-off-by: Patrick Fu <patrick.fu@intel.com>
---
v2:
 - change commit message and title
 - v1 patch used CPU to copy multi-page buffers; v2 patch split the
copy into multiple async copy segments whenever possible

v3:
 - added fixline

v4:
 - fix missed translation of a gpa whose length equals the host
   page size

 lib/librte_vhost/vhost.h      | 50 +++++++++++++++++++++++++++++++++++
 lib/librte_vhost/virtio_net.c | 40 +++++++++++++++++-----------
 2 files changed, 75 insertions(+), 15 deletions(-)
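
To make the page-splitting idea concrete, here is a minimal, standalone C model of the approach. This is not DPDK code: the page table, addresses, and the _model names are invented for illustration. Each call translates only up to the end of the first host page containing the gpa, and the caller advances by the returned mapped length until the whole buffer is covered, emitting one copy segment per host page.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical guest-to-host page map, for illustration only (4 KiB pages). */
struct page_map { uint64_t gpa, hpa, size; };

static const struct page_map pages[] = {
	{ 0x1000, 0x70000, 0x1000 },
	{ 0x2000, 0x93000, 0x1000 },	/* physically discontiguous */
};

/* Return the hpa of the first page hit; *mapped is how much of [gpa, gpa+len)
 * fits in that page. Returns 0 when the gpa is not mapped at all. */
static uint64_t
gpa_to_first_hpa_model(uint64_t gpa, uint64_t len, uint64_t *mapped)
{
	for (unsigned int i = 0; i < sizeof(pages) / sizeof(pages[0]); i++) {
		const struct page_map *p = &pages[i];

		if (gpa >= p->gpa && gpa < p->gpa + p->size) {
			uint64_t in_page = p->gpa + p->size - gpa;

			*mapped = len < in_page ? len : in_page;
			return gpa - p->gpa + p->hpa;
		}
	}
	*mapped = 0;
	return 0;
}

int main(void)
{
	uint64_t gpa = 0x1800, len = 0x1000, mapped;

	/* The caller loop mirrors what the async data path does: one copy
	 * segment per host page the buffer spans. */
	while (len) {
		uint64_t hpa = gpa_to_first_hpa_model(gpa, len, &mapped);

		if (!hpa)
			return 1;	/* unmapped region */
		printf("segment: gpa 0x%llx -> hpa 0x%llx, len 0x%llx\n",
		       (unsigned long long)gpa, (unsigned long long)hpa,
		       (unsigned long long)mapped);
		gpa += mapped;
		len -= mapped;
	}
	return 0;
}

With the sample map above, a 0x1000-byte buffer starting at gpa 0x1800 is reported as two segments (0x800 bytes ending page one, 0x800 bytes starting page two), which is exactly the case the old single-page translation could not express.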
  

Comments

Maxime Coquelin July 28, 2020, 1:55 p.m. UTC | #1
On 7/28/20 5:28 AM, patrick.fu@intel.com wrote:
> From: Patrick Fu <patrick.fu@intel.com>
> 
> Async copy fails when single ring buffer vector is splited on multiple
> physical pages. This happens because current hpa address translation
> function doesn't handle multi-page buffers. A new gpa to hpa address
> conversion function, which returns the hpa on the first hitting host
> pages, is implemented in this patch. Async data path recursively calls
> this new function to construct a multi-segments async copy descriptor
> for ring buffers crossing physical page boundaries.
> 
> Fixes: cd6760da1076 ("vhost: introduce async enqueue for split ring")
> 
> Signed-off-by: Patrick Fu <patrick.fu@intel.com>
> ---
> v2:
>  - change commit message and title
>  - v1 patch used CPU to copy multi-page buffers; v2 patch split the
> copy into multiple async copy segments whenever possible
> 
> v3:
>  - added fixline
> 
> v4:
>  - fix miss translation of the gpa which is the same length with host
>    page size
> 
>  lib/librte_vhost/vhost.h      | 50 +++++++++++++++++++++++++++++++++++
>  lib/librte_vhost/virtio_net.c | 40 +++++++++++++++++-----------
>  2 files changed, 75 insertions(+), 15 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 0f7212f88..05c202a57 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -616,6 +616,56 @@ gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size)
>  	return 0;
>  }
>  
> +static __rte_always_inline rte_iova_t
> +gpa_to_first_hpa(struct virtio_net *dev, uint64_t gpa,
> +	uint64_t gpa_size, uint64_t *hpa_size)
> +{
> +	uint32_t i;
> +	struct guest_page *page;
> +	struct guest_page key;
> +
> +	*hpa_size = gpa_size;
> +	if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) {
> +		key.guest_phys_addr = gpa & ~(dev->guest_pages[0].size - 1);
> +		page = bsearch(&key, dev->guest_pages, dev->nr_guest_pages,
> +			       sizeof(struct guest_page), guest_page_addrcmp);
> +		if (page) {
> +			if (gpa + gpa_size <=
> +					page->guest_phys_addr + page->size) {
> +				return gpa - page->guest_phys_addr +
> +					page->host_phys_addr;
> +			} else if (gpa < page->guest_phys_addr +
> +						page->size) {
> +				*hpa_size = page->guest_phys_addr +
> +					page->size - gpa;
> +				return gpa - page->guest_phys_addr +
> +					page->host_phys_addr;
> +			}
> +		}
> +	} else {
> +		for (i = 0; i < dev->nr_guest_pages; i++) {
> +			page = &dev->guest_pages[i];
> +
> +			if (gpa >= page->guest_phys_addr) {
> +				if (gpa + gpa_size <=
> +					page->guest_phys_addr + page->size) {
> +					return gpa - page->guest_phys_addr +
> +						page->host_phys_addr;
> +				} else if (gpa < page->guest_phys_addr +
> +							page->size) {
> +					*hpa_size = page->guest_phys_addr +
> +						page->size - gpa;
> +					return gpa - page->guest_phys_addr +
> +						page->host_phys_addr;
> +				}
> +			}
> +		}
> +	}
> +
> +	*hpa_size = 0;
> +	return 0;
> +}
> +
>  static __rte_always_inline uint64_t
>  hva_to_gpa(struct virtio_net *dev, uint64_t vva, uint64_t len)
>  {
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 95a0bc19f..124a33a10 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -980,6 +980,7 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
>  	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
>  	int error = 0;
> +	uint64_t mapped_len;
>  
>  	uint32_t tlen = 0;
>  	int tvec_idx = 0;
> @@ -1072,24 +1073,31 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  
>  		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
>  
> -		if (unlikely(cpy_len >= cpy_threshold)) {
> -			hpa = (void *)(uintptr_t)gpa_to_hpa(dev,
> -					buf_iova + buf_offset, cpy_len);
> +		while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
> +			hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
> +					buf_iova + buf_offset,
> +					cpy_len, &mapped_len);
>  
> -			if (unlikely(!hpa)) {
> -				error = -1;
> -				goto out;
> -			}
> +			if (unlikely(!hpa || mapped_len < cpy_threshold))
> +				break;
>  
>  			async_fill_vec(src_iovec + tvec_idx,
>  				(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
> -						mbuf_offset), cpy_len);
> +				mbuf_offset), (size_t)mapped_len);
>  
> -			async_fill_vec(dst_iovec + tvec_idx, hpa, cpy_len);
> +			async_fill_vec(dst_iovec + tvec_idx,
> +					hpa, (size_t)mapped_len);
>  
> -			tlen += cpy_len;
> +			tlen += (uint32_t)mapped_len;
> +			cpy_len -= (uint32_t)mapped_len;
> +			mbuf_avail  -= (uint32_t)mapped_len;
> +			mbuf_offset += (uint32_t)mapped_len;
> +			buf_avail  -= (uint32_t)mapped_len;
> +			buf_offset += (uint32_t)mapped_len;
>  			tvec_idx++;
> -		} else {
> +		}
> +
> +		if (likely(cpy_len)) {
>  			if (unlikely(vq->batch_copy_nb_elems >= vq->size)) {
>  				rte_memcpy(
>  				(void *)((uintptr_t)(buf_addr + buf_offset)),
> @@ -1112,10 +1120,12 @@ async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  			}
>  		}
>  
> -		mbuf_avail  -= cpy_len;
> -		mbuf_offset += cpy_len;
> -		buf_avail  -= cpy_len;
> -		buf_offset += cpy_len;
> +		if (cpy_len) {
> +			mbuf_avail  -= cpy_len;
> +			mbuf_offset += cpy_len;
> +			buf_avail  -= cpy_len;
> +			buf_offset += cpy_len;
> +		}

Is that really necessary to check if copy length is not 0?

Thanks,
Maxime

>  	}
>  
>  out:
>
  
Patrick Fu July 29, 2020, 1:40 a.m. UTC | #2
Hi Maxime,

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Tuesday, July 28, 2020 9:55 PM
> To: Fu, Patrick <patrick.fu@intel.com>; dev@dpdk.org; Xia, Chenbo
> <chenbo.xia@intel.com>
> Subject: Re: [PATCH v4] vhost: fix async copy fail on multi-page buffers
> 
> 
> 
> On 7/28/20 5:28 AM, patrick.fu@intel.com wrote:
> > From: Patrick Fu <patrick.fu@intel.com>
> >
> > Async copy fails when single ring buffer vector is splited on multiple
> > physical pages. This happens because current hpa address translation
> > function doesn't handle multi-page buffers. A new gpa to hpa address
> > conversion function, which returns the hpa on the first hitting host
> > pages, is implemented in this patch. Async data path recursively calls
> > this new function to construct a multi-segments async copy descriptor
> > for ring buffers crossing physical page boundaries.
> >
> > Fixes: cd6760da1076 ("vhost: introduce async enqueue for split ring")
> >
> > Signed-off-by: Patrick Fu <patrick.fu@intel.com>
> > ---
> > v2:
> >  - change commit message and title
> >  - v1 patch used CPU to copy multi-page buffers; v2 patch split the
> > copy into multiple async copy segments whenever possible
> >
> > v3:
> >  - added fixline
> >
> > v4:
> >  - fix miss translation of the gpa which is the same length with host
> >    page size
> >
> >  lib/librte_vhost/vhost.h      | 50 +++++++++++++++++++++++++++++++++++
> >  lib/librte_vhost/virtio_net.c | 40 +++++++++++++++++-----------
> >  2 files changed, 75 insertions(+), 15 deletions(-)
> >
> > diff --git a/lib/librte_vhost/virtio_net.c
> > b/lib/librte_vhost/virtio_net.c index 95a0bc19f..124a33a10 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -980,6 +980,7 @@ async_mbuf_to_desc(struct virtio_net *dev, struct
> vhost_virtqueue *vq,
> >  	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
> >  	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
> >  	int error = 0;
> > +	uint64_t mapped_len;
> >
> >  	uint32_t tlen = 0;
> >  	int tvec_idx = 0;
> > @@ -1072,24 +1073,31 @@ async_mbuf_to_desc(struct virtio_net *dev,
> > struct vhost_virtqueue *vq,
> >
> >  		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
> >
> > -		if (unlikely(cpy_len >= cpy_threshold)) {
> > -			hpa = (void *)(uintptr_t)gpa_to_hpa(dev,
> > -					buf_iova + buf_offset, cpy_len);
> > +		while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
> > +			hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
> > +					buf_iova + buf_offset,
> > +					cpy_len, &mapped_len);
> >
> > -			if (unlikely(!hpa)) {
> > -				error = -1;
> > -				goto out;
> > -			}
> > +			if (unlikely(!hpa || mapped_len < cpy_threshold))
> > +				break;
> >
> >  			async_fill_vec(src_iovec + tvec_idx,
> >  				(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
> > -						mbuf_offset), cpy_len);
> > +				mbuf_offset), (size_t)mapped_len);
> >
> > -			async_fill_vec(dst_iovec + tvec_idx, hpa, cpy_len);
> > +			async_fill_vec(dst_iovec + tvec_idx,
> > +					hpa, (size_t)mapped_len);
> >
> > -			tlen += cpy_len;
> > +			tlen += (uint32_t)mapped_len;
> > +			cpy_len -= (uint32_t)mapped_len;
> > +			mbuf_avail  -= (uint32_t)mapped_len;
> > +			mbuf_offset += (uint32_t)mapped_len;
> > +			buf_avail  -= (uint32_t)mapped_len;
> > +			buf_offset += (uint32_t)mapped_len;
> >  			tvec_idx++;
> > -		} else {
> > +		}
> > +
> > +		if (likely(cpy_len)) {
> >  			if (unlikely(vq->batch_copy_nb_elems >= vq->size)) {
> >  				rte_memcpy(
> >  				(void *)((uintptr_t)(buf_addr + buf_offset)),
> @@ -1112,10
> > +1120,12 @@ async_mbuf_to_desc(struct virtio_net *dev, struct
> vhost_virtqueue *vq,
> >  			}
> >  		}
> >
> > -		mbuf_avail  -= cpy_len;
> > -		mbuf_offset += cpy_len;
> > -		buf_avail  -= cpy_len;
> > -		buf_offset += cpy_len;
> > +		if (cpy_len) {
> > +			mbuf_avail  -= cpy_len;
> > +			mbuf_offset += cpy_len;
> > +			buf_avail  -= cpy_len;
> > +			buf_offset += cpy_len;
> > +		}
> 
> Is that really necessary to check if copy length is not 0?
> 
The intention is to optimize for the case where ring buffers are NOT split (which should be the most common case). In that case, cpy_len will be zero, and with this "if" statement we can save a couple of cycles. With that said, the actual difference is minor. I'm open to either adding an "unlikely" to the "if" or removing the "if" altogether. I would like to hear your opinion and submit a modified patch.


> Thanks,
> Maxime
> 
> >  	}
> >
> >  out:
> >
  
Patrick Fu July 29, 2020, 2:05 a.m. UTC | #3
Hi Maxime,

> -----Original Message-----
> From: Fu, Patrick
> Sent: Wednesday, July 29, 2020 9:40 AM
> To: 'Maxime Coquelin' <maxime.coquelin@redhat.com>; dev@dpdk.org; Xia,
> Chenbo <Chenbo.Xia@intel.com>
> Subject: RE: [PATCH v4] vhost: fix async copy fail on multi-page buffers
> 
> Hi Maxime,
> 
> > -----Original Message-----
> > From: Maxime Coquelin <maxime.coquelin@redhat.com>
> > Sent: Tuesday, July 28, 2020 9:55 PM
> > To: Fu, Patrick <patrick.fu@intel.com>; dev@dpdk.org; Xia, Chenbo
> > <chenbo.xia@intel.com>
> > Subject: Re: [PATCH v4] vhost: fix async copy fail on multi-page
> > buffers
> >
> >
> >
> > On 7/28/20 5:28 AM, patrick.fu@intel.com wrote:
> > > From: Patrick Fu <patrick.fu@intel.com>
> > >
> > > Async copy fails when single ring buffer vector is splited on
> > > multiple physical pages. This happens because current hpa address
> > > translation function doesn't handle multi-page buffers. A new gpa to
> > > hpa address conversion function, which returns the hpa on the first
> > > hitting host pages, is implemented in this patch. Async data path
> > > recursively calls this new function to construct a multi-segments
> > > async copy descriptor for ring buffers crossing physical page boundaries.
> > >
> > > Fixes: cd6760da1076 ("vhost: introduce async enqueue for split
> > > ring")
> > >
> > > Signed-off-by: Patrick Fu <patrick.fu@intel.com>
> > > ---
> > > v2:
> > >  - change commit message and title
> > >  - v1 patch used CPU to copy multi-page buffers; v2 patch split the
> > > copy into multiple async copy segments whenever possible
> > >
> > > v3:
> > >  - added fixline
> > >
> > > v4:
> > >  - fix miss translation of the gpa which is the same length with host
> > >    page size
> > >
> > >  lib/librte_vhost/vhost.h      | 50
> +++++++++++++++++++++++++++++++++++
> > >  lib/librte_vhost/virtio_net.c | 40 +++++++++++++++++-----------
> > >  2 files changed, 75 insertions(+), 15 deletions(-)
> > >
> > > diff --git a/lib/librte_vhost/virtio_net.c
> > > b/lib/librte_vhost/virtio_net.c index 95a0bc19f..124a33a10 100644
> > > --- a/lib/librte_vhost/virtio_net.c
> > > +++ b/lib/librte_vhost/virtio_net.c
> > > @@ -980,6 +980,7 @@ async_mbuf_to_desc(struct virtio_net *dev,
> > > struct
> > vhost_virtqueue *vq,
> > >  	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
> > >  	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
> > >  	int error = 0;
> > > +	uint64_t mapped_len;
> > >
> > >  	uint32_t tlen = 0;
> > >  	int tvec_idx = 0;
> > > @@ -1072,24 +1073,31 @@ async_mbuf_to_desc(struct virtio_net *dev,
> > > struct vhost_virtqueue *vq,
> > >
> > >  		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
> > >
> > > -		if (unlikely(cpy_len >= cpy_threshold)) {
> > > -			hpa = (void *)(uintptr_t)gpa_to_hpa(dev,
> > > -					buf_iova + buf_offset, cpy_len);
> > > +		while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
> > > +			hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
> > > +					buf_iova + buf_offset,
> > > +					cpy_len, &mapped_len);
> > >
> > > -			if (unlikely(!hpa)) {
> > > -				error = -1;
> > > -				goto out;
> > > -			}
> > > +			if (unlikely(!hpa || mapped_len < cpy_threshold))
> > > +				break;
> > >
> > >  			async_fill_vec(src_iovec + tvec_idx,
> > >  				(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
> > > -						mbuf_offset), cpy_len);
> > > +				mbuf_offset), (size_t)mapped_len);
> > >
> > > -			async_fill_vec(dst_iovec + tvec_idx, hpa, cpy_len);
> > > +			async_fill_vec(dst_iovec + tvec_idx,
> > > +					hpa, (size_t)mapped_len);
> > >
> > > -			tlen += cpy_len;
> > > +			tlen += (uint32_t)mapped_len;
> > > +			cpy_len -= (uint32_t)mapped_len;
> > > +			mbuf_avail  -= (uint32_t)mapped_len;
> > > +			mbuf_offset += (uint32_t)mapped_len;
> > > +			buf_avail  -= (uint32_t)mapped_len;
> > > +			buf_offset += (uint32_t)mapped_len;
> > >  			tvec_idx++;
> > > -		} else {
> > > +		}
> > > +
> > > +		if (likely(cpy_len)) {
> > >  			if (unlikely(vq->batch_copy_nb_elems >= vq->size)) {
> > >  				rte_memcpy(
> > >  				(void *)((uintptr_t)(buf_addr + buf_offset)),
> > @@ -1112,10
> > > +1120,12 @@ async_mbuf_to_desc(struct virtio_net *dev, struct
> > vhost_virtqueue *vq,
> > >  			}
> > >  		}
> > >
> > > -		mbuf_avail  -= cpy_len;
> > > -		mbuf_offset += cpy_len;
> > > -		buf_avail  -= cpy_len;
> > > -		buf_offset += cpy_len;
> > > +		if (cpy_len) {
> > > +			mbuf_avail  -= cpy_len;
> > > +			mbuf_offset += cpy_len;
> > > +			buf_avail  -= cpy_len;
> > > +			buf_offset += cpy_len;
> > > +		}
> >
> > Is that really necessary to check if copy length is not 0?
> >
> The intention is to optimize for the case where ring buffers are NOT split
> (which should be the most common case). In that case, cpy_len will be zero,
> and with this "if" statement we can save a couple of cycles. With that said,
> the actual difference is minor. I'm open to either adding an "unlikely" to
> the "if" or removing the "if" altogether. I would like to hear your opinion
> and submit a modified patch.
> 

I have a better way to handle this case (combining this "if" logic with the previous one).
Please review my v5 patch for the code change.

Thanks,

Patrick
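
To make the control flow discussed above concrete, here is a small, self-contained C model of the inner copy loop. Plain memcpy stands in for both the async engine and the batched CPU path, and the threshold, buffer sizes, and helper names are invented. It shows why cpy_len is usually zero after the per-page segment loop, so the residual CPU copy and the offset updates can be folded under a single if (cpy_len) branch; this is only one possible shape of that folding, not necessarily what the v5 patch actually does.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define CPY_THRESHOLD 256	/* illustrative threshold, not the DPDK default */
#define PAGE_SIZE     4096

/* Model: how many bytes of [off, off+len) fit in the current 4 KiB page. */
static uint32_t first_segment_len(uint32_t off, uint32_t len)
{
	uint32_t in_page = PAGE_SIZE - (off % PAGE_SIZE);

	return len < in_page ? len : in_page;
}

int main(void)
{
	uint8_t src[8192], dst[8192];
	uint32_t cpy_len = 6000, src_off = 300, dst_off = 300;

	memset(src, 0xab, sizeof(src));

	/* "Async" path: one segment per host page, as long as the segment is
	 * large enough to be worth offloading. */
	while (cpy_len >= CPY_THRESHOLD) {
		uint32_t seg = first_segment_len(dst_off, cpy_len);

		if (seg < CPY_THRESHOLD)
			break;
		memcpy(dst + dst_off, src + src_off, seg); /* stand-in for async copy */
		src_off += seg;
		dst_off += seg;
		cpy_len -= seg;
	}

	/* Residual path: only runs when something was left over (a short tail
	 * or a segment below the threshold). In the common no-leftover case,
	 * cpy_len is 0 and this whole block, offset updates included, is
	 * skipped. */
	if (cpy_len) {
		memcpy(dst + dst_off, src + src_off, cpy_len); /* CPU copy */
		src_off += cpy_len;
		dst_off += cpy_len;
		cpy_len = 0;
	}

	printf("final offset: %u, leftover: %u\n",
	       (unsigned int)dst_off, (unsigned int)cpy_len);
	return 0;
}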
  

Patch

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 0f7212f88..05c202a57 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -616,6 +616,56 @@  gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size)
 	return 0;
 }
 
+static __rte_always_inline rte_iova_t
+gpa_to_first_hpa(struct virtio_net *dev, uint64_t gpa,
+	uint64_t gpa_size, uint64_t *hpa_size)
+{
+	uint32_t i;
+	struct guest_page *page;
+	struct guest_page key;
+
+	*hpa_size = gpa_size;
+	if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) {
+		key.guest_phys_addr = gpa & ~(dev->guest_pages[0].size - 1);
+		page = bsearch(&key, dev->guest_pages, dev->nr_guest_pages,
+			       sizeof(struct guest_page), guest_page_addrcmp);
+		if (page) {
+			if (gpa + gpa_size <=
+					page->guest_phys_addr + page->size) {
+				return gpa - page->guest_phys_addr +
+					page->host_phys_addr;
+			} else if (gpa < page->guest_phys_addr +
+						page->size) {
+				*hpa_size = page->guest_phys_addr +
+					page->size - gpa;
+				return gpa - page->guest_phys_addr +
+					page->host_phys_addr;
+			}
+		}
+	} else {
+		for (i = 0; i < dev->nr_guest_pages; i++) {
+			page = &dev->guest_pages[i];
+
+			if (gpa >= page->guest_phys_addr) {
+				if (gpa + gpa_size <=
+					page->guest_phys_addr + page->size) {
+					return gpa - page->guest_phys_addr +
+						page->host_phys_addr;
+				} else if (gpa < page->guest_phys_addr +
+							page->size) {
+					*hpa_size = page->guest_phys_addr +
+						page->size - gpa;
+					return gpa - page->guest_phys_addr +
+						page->host_phys_addr;
+				}
+			}
+		}
+	}
+
+	*hpa_size = 0;
+	return 0;
+}
+
 static __rte_always_inline uint64_t
 hva_to_gpa(struct virtio_net *dev, uint64_t vva, uint64_t len)
 {
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 95a0bc19f..124a33a10 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -980,6 +980,7 @@  async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct batch_copy_elem *batch_copy = vq->batch_copy_elems;
 	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
 	int error = 0;
+	uint64_t mapped_len;
 
 	uint32_t tlen = 0;
 	int tvec_idx = 0;
@@ -1072,24 +1073,31 @@  async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 
 		cpy_len = RTE_MIN(buf_avail, mbuf_avail);
 
-		if (unlikely(cpy_len >= cpy_threshold)) {
-			hpa = (void *)(uintptr_t)gpa_to_hpa(dev,
-					buf_iova + buf_offset, cpy_len);
+		while (unlikely(cpy_len && cpy_len >= cpy_threshold)) {
+			hpa = (void *)(uintptr_t)gpa_to_first_hpa(dev,
+					buf_iova + buf_offset,
+					cpy_len, &mapped_len);
 
-			if (unlikely(!hpa)) {
-				error = -1;
-				goto out;
-			}
+			if (unlikely(!hpa || mapped_len < cpy_threshold))
+				break;
 
 			async_fill_vec(src_iovec + tvec_idx,
 				(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
-						mbuf_offset), cpy_len);
+				mbuf_offset), (size_t)mapped_len);
 
-			async_fill_vec(dst_iovec + tvec_idx, hpa, cpy_len);
+			async_fill_vec(dst_iovec + tvec_idx,
+					hpa, (size_t)mapped_len);
 
-			tlen += cpy_len;
+			tlen += (uint32_t)mapped_len;
+			cpy_len -= (uint32_t)mapped_len;
+			mbuf_avail  -= (uint32_t)mapped_len;
+			mbuf_offset += (uint32_t)mapped_len;
+			buf_avail  -= (uint32_t)mapped_len;
+			buf_offset += (uint32_t)mapped_len;
 			tvec_idx++;
-		} else {
+		}
+
+		if (likely(cpy_len)) {
 			if (unlikely(vq->batch_copy_nb_elems >= vq->size)) {
 				rte_memcpy(
 				(void *)((uintptr_t)(buf_addr + buf_offset)),
@@ -1112,10 +1120,12 @@  async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			}
 		}
 
-		mbuf_avail  -= cpy_len;
-		mbuf_offset += cpy_len;
-		buf_avail  -= cpy_len;
-		buf_offset += cpy_len;
+		if (cpy_len) {
+			mbuf_avail  -= cpy_len;
+			mbuf_offset += cpy_len;
+			buf_avail  -= cpy_len;
+			buf_offset += cpy_len;
+		}
 	}
 
 out: