[v1,08/14] vhost: improve IO vector logic

Message ID 20211018130229.308694-9-maxime.coquelin@redhat.com (mailing list archive)
State Superseded, archived
Delegated to: Maxime Coquelin
Series: vhost: clean-up and simplify async implementation

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Maxime Coquelin Oct. 18, 2021, 1:02 p.m. UTC
  The IO vector and iterator arrays were part of the
async metadata, but their indexes were not.

To make this more consistent, this patch adds the
indexes to the async metadata. Doing so, we can avoid
triggering DMA transfers within the loop, as IO vector
index overflow is now prevented in the async_mbuf_to_desc()
function.

Note that the previous detection mechanism was broken:
by the time the overflow was detected, the out-of-bounds
memory access had already happened.

With these changes done, virtio_dev_rx_async_submit_split()
and virtio_dev_rx_async_submit_packed() can be further
simplified.

Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
 lib/vhost/vhost.h      |   2 +
 lib/vhost/virtio_net.c | 291 ++++++++++++++++++-----------------------
 2 files changed, 131 insertions(+), 162 deletions(-)
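For readers skimming the diff below, the lifecycle of the new iterator API can be sketched with simplified stand-in types. This is a minimal sketch, not the lib/vhost code: `MAX_VEC`/`MAX_IT`, the struct layouts, and the shortened function names are illustrative assumptions that mirror `async_iter_initialize()`, `async_iter_add_iovec()`, `async_iter_finalize()`, and `async_iter_cancel()` in the patch.

```c
/* Simplified sketch of the iterator API added by this patch.
 * Flow, as in async_mbuf_to_desc(): initialize an iterator, append iovecs,
 * then finalize on success or cancel on failure so the shared pool rewinds. */
#include <assert.h>
#include <stddef.h>

#define MAX_VEC 8   /* stand-in for VHOST_MAX_ASYNC_VEC */
#define MAX_IT  4   /* stand-in for VHOST_MAX_ASYNC_IT  */

struct iovec_s { void *src, *dst; size_t len; };
struct iter_s  { struct iovec_s *iov; unsigned long nr_segs; };

struct async_s {
	struct iter_s  iov_iter[MAX_IT];
	struct iovec_s iovec[MAX_VEC];
	unsigned iter_idx, iovec_idx;
};

static int iter_init(struct async_s *a)
{
	if (a->iovec_idx >= MAX_VEC)
		return -1;              /* pool exhausted: fail early */
	a->iov_iter[a->iter_idx].iov = a->iovec + a->iovec_idx;
	a->iov_iter[a->iter_idx].nr_segs = 0;
	return 0;
}

static int iter_add(struct async_s *a, void *src, void *dst, size_t len)
{
	if (a->iovec_idx >= MAX_VEC)
		return -1;              /* overflow checked BEFORE writing */
	a->iovec[a->iovec_idx] = (struct iovec_s){ src, dst, len };
	a->iov_iter[a->iter_idx].nr_segs++;
	a->iovec_idx++;
	return 0;
}

static void iter_finalize(struct async_s *a) { a->iter_idx++; }

static void iter_cancel(struct async_s *a)
{
	/* rewind the iovecs consumed by the current (failed) iterator */
	a->iovec_idx -= a->iov_iter[a->iter_idx].nr_segs;
	a->iov_iter[a->iter_idx].nr_segs = 0;
}
```

Because the bound is checked before each write, overflow is rejected up front rather than detected after the out-of-bounds store, which is the fix the commit message describes.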
  

Comments

Hu, Jiayu Oct. 25, 2021, 7:22 a.m. UTC | #1
Hi Maxime,

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Monday, October 18, 2021 9:02 PM
> To: dev@dpdk.org; Xia, Chenbo <chenbo.xia@intel.com>; Hu, Jiayu
> <jiayu.hu@intel.com>; Wang, YuanX <yuanx.wang@intel.com>; Ma,
> WenwuX <wenwux.ma@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Mcnamara, John
> <john.mcnamara@intel.com>; david.marchand@redhat.com
> Cc: Maxime Coquelin <maxime.coquelin@redhat.com>
> Subject: [PATCH v1 08/14] vhost: improve IO vector logic
> 
> IO vectors and their iterators arrays were part of the async metadata but not
> their indexes.
> 
> In order to makes this more consistent, the patch adds the indexes to the
> async metadata. Doing that, we can avoid triggering DMA transfer within the
> loop as it IO vector index overflow is now prevented in the
> async_mbuf_to_desc() function.
> 
> Note that previous detection mechanism was broken since the overflow
> already happened when detected, so OOB memory access would already
> have happened.
> 
> With this changes done, virtio_dev_rx_async_submit_split()
> and virtio_dev_rx_async_submit_packed() can be further simplified.
> 
> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> ---
>  lib/vhost/vhost.h      |   2 +
>  lib/vhost/virtio_net.c | 291 ++++++++++++++++++-----------------------
>  2 files changed, 131 insertions(+), 162 deletions(-)
> 
> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h index
> dae9a1ac2d..812d4c55a5 100644
> --- a/lib/vhost/vhost.h
> +++ b/lib/vhost/vhost.h
> @@ -134,6 +134,8 @@ struct vhost_async {
> 
>  	struct rte_vhost_iov_iter iov_iter[VHOST_MAX_ASYNC_IT];
>  	struct rte_vhost_iovec iovec[VHOST_MAX_ASYNC_VEC];
> +	uint16_t iter_idx;
> +	uint16_t iovec_idx;
> 
>  	/* data transfer status */
>  	struct async_inflight_info *pkts_info; diff --git a/lib/vhost/virtio_net.c
> b/lib/vhost/virtio_net.c index ae7dded979..c80823a8de 100644
> --- a/lib/vhost/virtio_net.c
> +++ b/lib/vhost/virtio_net.c
> @@ -924,33 +924,86 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct
> vhost_virtqueue *vq,
>  	return error;
>  }
> 
> +static __rte_always_inline int
> +async_iter_initialize(struct vhost_async *async) {
> +	struct rte_vhost_iov_iter *iter;
> +
> +	if (unlikely(async->iovec_idx >= VHOST_MAX_ASYNC_VEC)) {
> +		VHOST_LOG_DATA(ERR, "no more async iovec available\n");
> +		return -1;
> +	}
> +
> +	iter = async->iov_iter + async->iter_idx;
> +	iter->iov = async->iovec + async->iovec_idx;
> +	iter->nr_segs = 0;
> +
> +	return 0;
> +}
> +
> +static __rte_always_inline int
> +async_iter_add_iovec(struct vhost_async *async, void *src, void *dst,
> +size_t len) {
> +	struct rte_vhost_iov_iter *iter;
> +	struct rte_vhost_iovec *iovec;
> +
> +	if (unlikely(async->iovec_idx >= VHOST_MAX_ASYNC_VEC)) {
> +		VHOST_LOG_DATA(ERR, "no more async iovec available\n");
> +		return -1;
> +	}

For large packets, like 64KB packets in an iperf test, async_iter_add_iovec()
frequently reports the log above, as we run out of iovecs. I think it's better
to change the log level from ERR to DEBUG.

In addition, the iovec array is too small. With a burst of 32 and 64KB packets,
it's easy to run out of iovecs, and we drop the packets to enqueue when that
happens, which hurts performance. Enlarging the array is one way to mitigate the
issue, but another solution is to reallocate the iovec array once we run out of
it. What do you think?

Thanks,
Jiayu
> +
> +	iter = async->iov_iter + async->iter_idx;
> +	iovec = async->iovec + async->iovec_idx;
> +
> +	iovec->src_addr = src;
> +	iovec->dst_addr = dst;
> +	iovec->len = len;
> +
> +	iter->nr_segs++;
> +	async->iovec_idx++;
> +
> +	return 0;
> +}
  
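Jiayu's exhaustion report can be quantified with a rough back-of-envelope. Assuming copies are split at host-page granularity (the 4 KiB page size is an assumption here, as are both helper names), a 64KB packet alone consumes about 17 iovecs, so a burst of 32 such packets needs well over 500:

```c
/* Back-of-envelope iovec demand, treating each host page as one segment.
 * All constants here are illustrative assumptions, not values from the patch. */
#include <assert.h>

static unsigned segs_per_pkt(unsigned pkt_bytes, unsigned page_bytes)
{
	/* +1 allows a misaligned start to spill into one extra page */
	return pkt_bytes / page_bytes + 1;
}

static unsigned burst_demand(unsigned burst, unsigned pkt_bytes,
		unsigned page_bytes)
{
	return burst * segs_per_pkt(pkt_bytes, page_bytes);
}
```

A figure in this range is consistent with the 2048 array size the thread converges on later.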
Maxime Coquelin Oct. 25, 2021, 10:02 a.m. UTC | #2
Hi Jiayu,

On 10/25/21 09:22, Hu, Jiayu wrote:
> Hi Maxime,
> 
>> -----Original Message-----
>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Sent: Monday, October 18, 2021 9:02 PM
>> To: dev@dpdk.org; Xia, Chenbo <chenbo.xia@intel.com>; Hu, Jiayu
>> <jiayu.hu@intel.com>; Wang, YuanX <yuanx.wang@intel.com>; Ma,
>> WenwuX <wenwux.ma@intel.com>; Richardson, Bruce
>> <bruce.richardson@intel.com>; Mcnamara, John
>> <john.mcnamara@intel.com>; david.marchand@redhat.com
>> Cc: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Subject: [PATCH v1 08/14] vhost: improve IO vector logic
>>
>> IO vectors and their iterators arrays were part of the async metadata but not
>> their indexes.
>>
>> In order to makes this more consistent, the patch adds the indexes to the
>> async metadata. Doing that, we can avoid triggering DMA transfer within the
>> loop as it IO vector index overflow is now prevented in the
>> async_mbuf_to_desc() function.
>>
>> Note that previous detection mechanism was broken since the overflow
>> already happened when detected, so OOB memory access would already
>> have happened.
>>
>> With this changes done, virtio_dev_rx_async_submit_split()
>> and virtio_dev_rx_async_submit_packed() can be further simplified.
>>
>> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>> ---
>>   lib/vhost/vhost.h      |   2 +
>>   lib/vhost/virtio_net.c | 291 ++++++++++++++++++-----------------------
>>   2 files changed, 131 insertions(+), 162 deletions(-)
>>
>> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h index
>> dae9a1ac2d..812d4c55a5 100644
>> --- a/lib/vhost/vhost.h
>> +++ b/lib/vhost/vhost.h
>> @@ -134,6 +134,8 @@ struct vhost_async {
>>
>>   	struct rte_vhost_iov_iter iov_iter[VHOST_MAX_ASYNC_IT];
>>   	struct rte_vhost_iovec iovec[VHOST_MAX_ASYNC_VEC];
>> +	uint16_t iter_idx;
>> +	uint16_t iovec_idx;
>>
>>   	/* data transfer status */
>>   	struct async_inflight_info *pkts_info; diff --git a/lib/vhost/virtio_net.c
>> b/lib/vhost/virtio_net.c index ae7dded979..c80823a8de 100644
>> --- a/lib/vhost/virtio_net.c
>> +++ b/lib/vhost/virtio_net.c
>> @@ -924,33 +924,86 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct
>> vhost_virtqueue *vq,
>>   	return error;
>>   }
>>
>> +static __rte_always_inline int
>> +async_iter_initialize(struct vhost_async *async) {
>> +	struct rte_vhost_iov_iter *iter;
>> +
>> +	if (unlikely(async->iovec_idx >= VHOST_MAX_ASYNC_VEC)) {
>> +		VHOST_LOG_DATA(ERR, "no more async iovec available\n");
>> +		return -1;
>> +	}
>> +
>> +	iter = async->iov_iter + async->iter_idx;
>> +	iter->iov = async->iovec + async->iovec_idx;
>> +	iter->nr_segs = 0;
>> +
>> +	return 0;
>> +}
>> +
>> +static __rte_always_inline int
>> +async_iter_add_iovec(struct vhost_async *async, void *src, void *dst,
>> +size_t len) {
>> +	struct rte_vhost_iov_iter *iter;
>> +	struct rte_vhost_iovec *iovec;
>> +
>> +	if (unlikely(async->iovec_idx >= VHOST_MAX_ASYNC_VEC)) {
>> +		VHOST_LOG_DATA(ERR, "no more async iovec available\n");
>> +		return -1;
>> +	}
> 
> For large packets, like 64KB in iperf test, async_iter_add_iovec() frequently
> reports the log above, as we run out of iovecs. I think it's better to change
> the log from ERR to DEBUG.

I think it is better to keep it as an error; we want to see it when it
happens without requiring the user to enable debug logging.

But maybe we can print it only once, so as not to flood the logs.

> In addition, the size of iovec is too small. For burst 32 and 64KB pkts, it's
> easy to run out of iovecs and we will drop the pkts to enqueue if it happens,
> which hurts performance. Enlarging the array is a choice to mitigate the
> issue, but another solution is to reallocate iovec once we run out of it. How do
> you think?

I would prefer we enlarge the array; reallocating it when the issue
happens sounds like over-engineering to me.

Any idea what size it should be, based on your experiments?

Thanks,
Maxime

> Thanks,
> Jiayu
>> +
>> +	iter = async->iov_iter + async->iter_idx;
>> +	iovec = async->iovec + async->iovec_idx;
>> +
>> +	iovec->src_addr = src;
>> +	iovec->dst_addr = dst;
>> +	iovec->len = len;
>> +
>> +	iter->nr_segs++;
>> +	async->iovec_idx++;
>> +
>> +	return 0;
>> +}
>
  
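One possible shape for Maxime's "print it once" suggestion above is a sticky flag checked before logging. `vhost_log_once()` and the call counter are hypothetical, not DPDK APIs, and a real datapath version would likely want the flag per-lcore or atomic:

```c
/* Hypothetical print-once guard: the exhaustion error is logged on its
 * first occurrence only, so a sustained overload cannot flood the logs. */
#include <stdbool.h>
#include <stdio.h>

static bool iovec_warned;
static int log_calls;   /* exposed only so the behavior is observable */

static void vhost_log_once(const char *msg)
{
	if (iovec_warned)
		return;         /* already reported: stay silent */
	iovec_warned = true;
	log_calls++;
	fprintf(stderr, "%s", msg);
}
```

This keeps the ERR severity Maxime wants while bounding the log volume to a single line per process lifetime.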
Hu, Jiayu Oct. 26, 2021, 7:07 a.m. UTC | #3
Hi Maxime,

> -----Original Message-----
> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> Sent: Monday, October 25, 2021 6:03 PM
> To: Hu, Jiayu <jiayu.hu@intel.com>; dev@dpdk.org; Xia, Chenbo
> <chenbo.xia@intel.com>; Wang, YuanX <yuanx.wang@intel.com>; Ma,
> WenwuX <wenwux.ma@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Mcnamara, John
> <john.mcnamara@intel.com>; david.marchand@redhat.com
> Subject: Re: [PATCH v1 08/14] vhost: improve IO vector logic
> 
> Hi Jiayu,
> 
> On 10/25/21 09:22, Hu, Jiayu wrote:
> > Hi Maxime,
> >
> >> -----Original Message-----
> >> From: Maxime Coquelin <maxime.coquelin@redhat.com>
> >> Sent: Monday, October 18, 2021 9:02 PM
> >> To: dev@dpdk.org; Xia, Chenbo <chenbo.xia@intel.com>; Hu, Jiayu
> >> <jiayu.hu@intel.com>; Wang, YuanX <yuanx.wang@intel.com>; Ma,
> WenwuX
> >> <wenwux.ma@intel.com>; Richardson, Bruce
> >> <bruce.richardson@intel.com>; Mcnamara, John
> >> <john.mcnamara@intel.com>; david.marchand@redhat.com
> >> Cc: Maxime Coquelin <maxime.coquelin@redhat.com>
> >> Subject: [PATCH v1 08/14] vhost: improve IO vector logic
> >>
> >> IO vectors and their iterators arrays were part of the async metadata
> >> but not their indexes.
> >>
> >> In order to makes this more consistent, the patch adds the indexes to
> >> the async metadata. Doing that, we can avoid triggering DMA transfer
> >> within the loop as it IO vector index overflow is now prevented in
> >> the
> >> async_mbuf_to_desc() function.
> >>
> >> Note that previous detection mechanism was broken since the overflow
> >> already happened when detected, so OOB memory access would already
> >> have happened.
> >>
> >> With this changes done, virtio_dev_rx_async_submit_split()
> >> and virtio_dev_rx_async_submit_packed() can be further simplified.
> >>
> >> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> >> ---
> >>   lib/vhost/vhost.h      |   2 +
> >>   lib/vhost/virtio_net.c | 291 ++++++++++++++++++-----------------------
> >>   2 files changed, 131 insertions(+), 162 deletions(-)
> >>
> >> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h index
> >> dae9a1ac2d..812d4c55a5 100644
> >> --- a/lib/vhost/vhost.h
> >> +++ b/lib/vhost/vhost.h
> >> @@ -134,6 +134,8 @@ struct vhost_async {
> >>
> >>   	struct rte_vhost_iov_iter iov_iter[VHOST_MAX_ASYNC_IT];
> >>   	struct rte_vhost_iovec iovec[VHOST_MAX_ASYNC_VEC];
> >> +	uint16_t iter_idx;
> >> +	uint16_t iovec_idx;
> >>
> >>   	/* data transfer status */
> >>   	struct async_inflight_info *pkts_info; diff --git
> >> a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c index
> >> ae7dded979..c80823a8de 100644
> >> --- a/lib/vhost/virtio_net.c
> >> +++ b/lib/vhost/virtio_net.c
> >> @@ -924,33 +924,86 @@ copy_mbuf_to_desc(struct virtio_net *dev,
> >> struct vhost_virtqueue *vq,
> >>   	return error;
> >>   }
> >>
> >> +static __rte_always_inline int
> >> +async_iter_initialize(struct vhost_async *async) {
> >> +	struct rte_vhost_iov_iter *iter;
> >> +
> >> +	if (unlikely(async->iovec_idx >= VHOST_MAX_ASYNC_VEC)) {
> >> +		VHOST_LOG_DATA(ERR, "no more async iovec available\n");
> >> +		return -1;
> >> +	}
> >> +
> >> +	iter = async->iov_iter + async->iter_idx;
> >> +	iter->iov = async->iovec + async->iovec_idx;
> >> +	iter->nr_segs = 0;
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static __rte_always_inline int
> >> +async_iter_add_iovec(struct vhost_async *async, void *src, void
> >> +*dst, size_t len) {
> >> +	struct rte_vhost_iov_iter *iter;
> >> +	struct rte_vhost_iovec *iovec;
> >> +
> >> +	if (unlikely(async->iovec_idx >= VHOST_MAX_ASYNC_VEC)) {
> >> +		VHOST_LOG_DATA(ERR, "no more async iovec available\n");
> >> +		return -1;
> >> +	}
> >
> > For large packets, like 64KB in iperf test, async_iter_add_iovec()
> > frequently reports the log above, as we run out of iovecs. I think
> > it's better to change the log from ERR to DEBUG.
> 
> I think it is better to keep it as an error, we want to see it if it happens
> without having the user to enable debug.
> 
> But maybe we can only print it once, not to flood the logs.

OK.

> 
> > In addition, the size of iovec is too small. For burst 32 and 64KB
> > pkts, it's easy to run out of iovecs and we will drop the pkts to
> > enqueue if it happens, which hurts performance. Enlarging the array is
> > a choice to mitigate the issue, but another solution is to reallocate
> > iovec once we run out of it. How do you think?
> 
> I would prefer we enlarge the array, reallocating the array when the issue
> happens sounds like over-engineering to me.
> 
> Any idea what size it should be based on your experiments?

2048 is enough for iperf and 64KB pkts.

Thanks,
Jiayu
> 
> Thanks,
> Maxime
> 
> > Thanks,
> > Jiayu
> >> +
> >> +	iter = async->iov_iter + async->iter_idx;
> >> +	iovec = async->iovec + async->iovec_idx;
> >> +
> >> +	iovec->src_addr = src;
> >> +	iovec->dst_addr = dst;
> >> +	iovec->len = len;
> >> +
> >> +	iter->nr_segs++;
> >> +	async->iovec_idx++;
> >> +
> >> +	return 0;
> >> +}
> >
  
Maxime Coquelin Oct. 26, 2021, 7:27 a.m. UTC | #4
On 10/26/21 09:07, Hu, Jiayu wrote:
> Hi Maxime,
> 
>> -----Original Message-----
>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>> Sent: Monday, October 25, 2021 6:03 PM
>> To: Hu, Jiayu <jiayu.hu@intel.com>; dev@dpdk.org; Xia, Chenbo
>> <chenbo.xia@intel.com>; Wang, YuanX <yuanx.wang@intel.com>; Ma,
>> WenwuX <wenwux.ma@intel.com>; Richardson, Bruce
>> <bruce.richardson@intel.com>; Mcnamara, John
>> <john.mcnamara@intel.com>; david.marchand@redhat.com
>> Subject: Re: [PATCH v1 08/14] vhost: improve IO vector logic
>>
>> Hi Jiayu,
>>
>> On 10/25/21 09:22, Hu, Jiayu wrote:
>>> Hi Maxime,
>>>
>>>> -----Original Message-----
>>>> From: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>> Sent: Monday, October 18, 2021 9:02 PM
>>>> To: dev@dpdk.org; Xia, Chenbo <chenbo.xia@intel.com>; Hu, Jiayu
>>>> <jiayu.hu@intel.com>; Wang, YuanX <yuanx.wang@intel.com>; Ma,
>> WenwuX
>>>> <wenwux.ma@intel.com>; Richardson, Bruce
>>>> <bruce.richardson@intel.com>; Mcnamara, John
>>>> <john.mcnamara@intel.com>; david.marchand@redhat.com
>>>> Cc: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>> Subject: [PATCH v1 08/14] vhost: improve IO vector logic
>>>>
>>>> IO vectors and their iterators arrays were part of the async metadata
>>>> but not their indexes.
>>>>
>>>> In order to makes this more consistent, the patch adds the indexes to
>>>> the async metadata. Doing that, we can avoid triggering DMA transfer
>>>> within the loop as it IO vector index overflow is now prevented in
>>>> the
>>>> async_mbuf_to_desc() function.
>>>>
>>>> Note that previous detection mechanism was broken since the overflow
>>>> already happened when detected, so OOB memory access would already
>>>> have happened.
>>>>
>>>> With this changes done, virtio_dev_rx_async_submit_split()
>>>> and virtio_dev_rx_async_submit_packed() can be further simplified.
>>>>
>>>> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>> ---
>>>>    lib/vhost/vhost.h      |   2 +
>>>>    lib/vhost/virtio_net.c | 291 ++++++++++++++++++-----------------------
>>>>    2 files changed, 131 insertions(+), 162 deletions(-)
>>>>
>>>> diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h index
>>>> dae9a1ac2d..812d4c55a5 100644
>>>> --- a/lib/vhost/vhost.h
>>>> +++ b/lib/vhost/vhost.h
>>>> @@ -134,6 +134,8 @@ struct vhost_async {
>>>>
>>>>    	struct rte_vhost_iov_iter iov_iter[VHOST_MAX_ASYNC_IT];
>>>>    	struct rte_vhost_iovec iovec[VHOST_MAX_ASYNC_VEC];
>>>> +	uint16_t iter_idx;
>>>> +	uint16_t iovec_idx;
>>>>
>>>>    	/* data transfer status */
>>>>    	struct async_inflight_info *pkts_info; diff --git
>>>> a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c index
>>>> ae7dded979..c80823a8de 100644
>>>> --- a/lib/vhost/virtio_net.c
>>>> +++ b/lib/vhost/virtio_net.c
>>>> @@ -924,33 +924,86 @@ copy_mbuf_to_desc(struct virtio_net *dev,
>>>> struct vhost_virtqueue *vq,
>>>>    	return error;
>>>>    }
>>>>
>>>> +static __rte_always_inline int
>>>> +async_iter_initialize(struct vhost_async *async) {
>>>> +	struct rte_vhost_iov_iter *iter;
>>>> +
>>>> +	if (unlikely(async->iovec_idx >= VHOST_MAX_ASYNC_VEC)) {
>>>> +		VHOST_LOG_DATA(ERR, "no more async iovec available\n");
>>>> +		return -1;
>>>> +	}
>>>> +
>>>> +	iter = async->iov_iter + async->iter_idx;
>>>> +	iter->iov = async->iovec + async->iovec_idx;
>>>> +	iter->nr_segs = 0;
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +static __rte_always_inline int
>>>> +async_iter_add_iovec(struct vhost_async *async, void *src, void
>>>> +*dst, size_t len) {
>>>> +	struct rte_vhost_iov_iter *iter;
>>>> +	struct rte_vhost_iovec *iovec;
>>>> +
>>>> +	if (unlikely(async->iovec_idx >= VHOST_MAX_ASYNC_VEC)) {
>>>> +		VHOST_LOG_DATA(ERR, "no more async iovec available\n");
>>>> +		return -1;
>>>> +	}
>>>
>>> For large packets, like 64KB in iperf test, async_iter_add_iovec()
>>> frequently reports the log above, as we run out of iovecs. I think
>>> it's better to change the log from ERR to DEBUG.
>>
>> I think it is better to keep it as an error, we want to see it if it happens
>> without having the user to enable debug.
>>
>> But maybe we can only print it once, not to flood the logs.
> 
> OK.
> 
>>
>>> In addition, the size of iovec is too small. For burst 32 and 64KB
>>> pkts, it's easy to run out of iovecs and we will drop the pkts to
>>> enqueue if it happens, which hurts performance. Enlarging the array is
>>> a choice to mitigate the issue, but another solution is to reallocate
>>> iovec once we run out of it. How do you think?
>>
>> I would prefer we enlarge the array, reallocating the array when the issue
>> happens sounds like over-engineering to me.
>>
>> Any idea what size it should be based on your experiments?
> 
> 2048 is enough for iperf and 64KB pkts.

Thanks for the insight, I will change it to 2048 in the next revision.

Maxime

> 
> Thanks,
> Jiayu
>>
>> Thanks,
>> Maxime
>>
>>> Thanks,
>>> Jiayu
>>>> +
>>>> +	iter = async->iov_iter + async->iter_idx;
>>>> +	iovec = async->iovec + async->iovec_idx;
>>>> +
>>>> +	iovec->src_addr = src;
>>>> +	iovec->dst_addr = dst;
>>>> +	iovec->len = len;
>>>> +
>>>> +	iter->nr_segs++;
>>>> +	async->iovec_idx++;
>>>> +
>>>> +	return 0;
>>>> +}
>>>
>
  

Patch

diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index dae9a1ac2d..812d4c55a5 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -134,6 +134,8 @@  struct vhost_async {
 
 	struct rte_vhost_iov_iter iov_iter[VHOST_MAX_ASYNC_IT];
 	struct rte_vhost_iovec iovec[VHOST_MAX_ASYNC_VEC];
+	uint16_t iter_idx;
+	uint16_t iovec_idx;
 
 	/* data transfer status */
 	struct async_inflight_info *pkts_info;
diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index ae7dded979..c80823a8de 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -924,33 +924,86 @@  copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	return error;
 }
 
+static __rte_always_inline int
+async_iter_initialize(struct vhost_async *async)
+{
+	struct rte_vhost_iov_iter *iter;
+
+	if (unlikely(async->iovec_idx >= VHOST_MAX_ASYNC_VEC)) {
+		VHOST_LOG_DATA(ERR, "no more async iovec available\n");
+		return -1;
+	}
+
+	iter = async->iov_iter + async->iter_idx;
+	iter->iov = async->iovec + async->iovec_idx;
+	iter->nr_segs = 0;
+
+	return 0;
+}
+
+static __rte_always_inline int
+async_iter_add_iovec(struct vhost_async *async, void *src, void *dst, size_t len)
+{
+	struct rte_vhost_iov_iter *iter;
+	struct rte_vhost_iovec *iovec;
+
+	if (unlikely(async->iovec_idx >= VHOST_MAX_ASYNC_VEC)) {
+		VHOST_LOG_DATA(ERR, "no more async iovec available\n");
+		return -1;
+	}
+
+	iter = async->iov_iter + async->iter_idx;
+	iovec = async->iovec + async->iovec_idx;
+
+	iovec->src_addr = src;
+	iovec->dst_addr = dst;
+	iovec->len = len;
+
+	iter->nr_segs++;
+	async->iovec_idx++;
+
+	return 0;
+}
+
 static __rte_always_inline void
-async_fill_vec(struct rte_vhost_iovec *v, void *src, void *dst, size_t len)
+async_iter_finalize(struct vhost_async *async)
 {
-	v->src_addr = src;
-	v->dst_addr = dst;
-	v->len = len;
+	async->iter_idx++;
 }
 
 static __rte_always_inline void
-async_fill_iter(struct rte_vhost_iov_iter *it, struct rte_vhost_iovec *vec, unsigned long nr_seg)
+async_iter_cancel(struct vhost_async *async)
 {
-	it->iov = vec;
-	it->nr_segs = nr_seg;
+	struct rte_vhost_iov_iter *iter;
+
+	iter = async->iov_iter + async->iter_idx;
+	async->iovec_idx -= iter->nr_segs;
+	iter->nr_segs = 0;
+	iter->iov = NULL;
 }
 
 static __rte_always_inline void
-async_fill_desc(struct rte_vhost_async_desc *desc, struct rte_vhost_iov_iter *iter)
+async_iter_reset(struct vhost_async *async)
 {
-	desc->iter = iter;
+	async->iter_idx = 0;
+	async->iovec_idx = 0;
+}
+
+static __rte_always_inline void
+async_fill_descs(struct vhost_async *async, struct rte_vhost_async_desc *descs)
+{
+	int i;
+
+	for (i = 0; i < async->iter_idx; i++)
+		descs[i].iter = async->iov_iter + i;
 }
 
 static __rte_always_inline int
 async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			struct rte_mbuf *m, struct buf_vector *buf_vec,
-			uint16_t nr_vec, uint16_t num_buffers,
-			struct rte_vhost_iovec *iovec, struct rte_vhost_iov_iter *iter)
+			uint16_t nr_vec, uint16_t num_buffers)
 {
+	struct vhost_async *async = vq->async;
 	struct rte_mbuf *hdr_mbuf;
 	struct virtio_net_hdr_mrg_rxbuf tmp_hdr, *hdr = NULL;
 	uint64_t buf_addr, buf_iova;
@@ -960,24 +1013,18 @@  async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	uint32_t mbuf_offset, mbuf_avail;
 	uint32_t buf_offset, buf_avail;
 	uint32_t cpy_len, buf_len;
-	int error = 0;
 
-	int tvec_idx = 0;
 	void *hpa;
 
-	if (unlikely(m == NULL)) {
-		error = -1;
-		goto out;
-	}
+	if (unlikely(m == NULL))
+		return -1;
 
 	buf_addr = buf_vec[vec_idx].buf_addr;
 	buf_iova = buf_vec[vec_idx].buf_iova;
 	buf_len = buf_vec[vec_idx].buf_len;
 
-	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1)) {
-		error = -1;
-		goto out;
-	}
+	if (unlikely(buf_len < dev->vhost_hlen && nr_vec <= 1))
+		return -1;
 
 	hdr_mbuf = m;
 	hdr_addr = buf_addr;
@@ -1005,14 +1052,15 @@  async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	mbuf_avail  = rte_pktmbuf_data_len(m);
 	mbuf_offset = 0;
 
+	if (async_iter_initialize(async))
+		return -1;
+
 	while (mbuf_avail != 0 || m->next != NULL) {
 		/* done with current buf, get the next one */
 		if (buf_avail == 0) {
 			vec_idx++;
-			if (unlikely(vec_idx >= nr_vec)) {
-				error = -1;
-				goto out;
-			}
+			if (unlikely(vec_idx >= nr_vec))
+				goto error;
 
 			buf_addr = buf_vec[vec_idx].buf_addr;
 			buf_iova = buf_vec[vec_idx].buf_iova;
@@ -1058,26 +1106,30 @@  async_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
 			if (unlikely(!hpa)) {
 				VHOST_LOG_DATA(ERR, "(%d) %s: failed to get hpa.\n",
 				dev->vid, __func__);
-				error = -1;
-				goto out;
+				goto error;
 			}
 
-			async_fill_vec(iovec + tvec_idx,
-				(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
-				mbuf_offset), hpa, (size_t)mapped_len);
+			if (unlikely(async_iter_add_iovec(async,
+					(void *)(uintptr_t)rte_pktmbuf_iova_offset(m,
+						mbuf_offset),
+					hpa, (size_t)mapped_len)))
+				goto error;
 
 			cpy_len -= (uint32_t)mapped_len;
 			mbuf_avail  -= (uint32_t)mapped_len;
 			mbuf_offset += (uint32_t)mapped_len;
 			buf_avail  -= (uint32_t)mapped_len;
 			buf_offset += (uint32_t)mapped_len;
-			tvec_idx++;
 		}
 	}
 
-	async_fill_iter(iter, iovec, tvec_idx);
-out:
-	return error;
+	async_iter_finalize(async);
+
+	return 0;
+error:
+	async_iter_cancel(async);
+
+	return -1;
 }
 
 static __rte_always_inline int
@@ -1487,18 +1539,16 @@  virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 	struct rte_mbuf **pkts, uint32_t count)
 {
 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
-	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
+	uint32_t pkt_idx = 0;
 	uint16_t num_buffers;
 	uint16_t avail_head;
 
 	struct vhost_async *async = vq->async;
-	struct rte_vhost_iov_iter *iter = async->iov_iter;
-	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
-	struct rte_vhost_iovec *iovec = async->iovec;
+	struct rte_vhost_async_desc async_descs[MAX_PKT_BURST];
 	struct async_inflight_info *pkts_info = async->pkts_info;
-	uint32_t n_pkts = 0, pkt_err = 0;
+	uint32_t pkt_err = 0;
 	int32_t n_xfer;
-	uint16_t iovec_idx = 0, it_idx = 0, slot_idx = 0;
+	uint16_t slot_idx = 0;
 
 	/*
 	 * The ordering between avail index and desc reads need to be enforced.
@@ -1507,95 +1557,53 @@  virtio_dev_rx_async_submit_split(struct virtio_net *dev,
 
 	rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
 
+	async_iter_reset(async);
+
 	for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
 		uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
 		uint16_t nr_vec = 0;
 
-		if (unlikely(reserve_avail_buf_split(dev, vq,
-						pkt_len, buf_vec, &num_buffers,
-						avail_head, &nr_vec) < 0)) {
-			VHOST_LOG_DATA(DEBUG,
-				"(%d) failed to get enough desc from vring\n",
-				dev->vid);
+		if (unlikely(reserve_avail_buf_split(dev, vq, pkt_len, buf_vec,
+						&num_buffers, avail_head, &nr_vec) < 0)) {
+			VHOST_LOG_DATA(DEBUG, "(%d) failed to get enough desc from vring\n",
+					dev->vid);
 			vq->shadow_used_idx -= num_buffers;
 			break;
 		}
 
 		VHOST_LOG_DATA(DEBUG, "(%d) current index %d | end index %d\n",
-			dev->vid, vq->last_avail_idx,
-			vq->last_avail_idx + num_buffers);
+			dev->vid, vq->last_avail_idx, vq->last_avail_idx + num_buffers);
 
-		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx], buf_vec, nr_vec, num_buffers,
-				&iovec[iovec_idx], &iter[it_idx]) < 0) {
+		if (async_mbuf_to_desc(dev, vq, pkts[pkt_idx], buf_vec, nr_vec, num_buffers) < 0) {
 			vq->shadow_used_idx -= num_buffers;
 			break;
 		}
 
-		async_fill_desc(&tdes[pkt_burst_idx++], &iter[it_idx]);
-
 		slot_idx = (async->pkts_idx + pkt_idx) & (vq->size - 1);
 		pkts_info[slot_idx].descs = num_buffers;
 		pkts_info[slot_idx].mbuf = pkts[pkt_idx];
 
-		iovec_idx += iter[it_idx].nr_segs;
-		it_idx++;
-
 		vq->last_avail_idx += num_buffers;
+	}
 
-		/*
-		 * condition to trigger async device transfer:
-		 * - unused async iov number is less than max vhost vector
-		 */
-		if (unlikely(VHOST_MAX_ASYNC_VEC - iovec_idx < BUF_VECTOR_MAX)) {
-			n_xfer = async->ops.transfer_data(dev->vid,
-					queue_id, tdes, 0, pkt_burst_idx);
-			if (likely(n_xfer >= 0)) {
-				n_pkts = n_xfer;
-			} else {
-				VHOST_LOG_DATA(ERR,
-					"(%d) %s: failed to transfer data for queue id %d.\n",
-					dev->vid, __func__, queue_id);
-				n_pkts = 0;
-			}
-
-			iovec_idx = 0;
-			it_idx = 0;
-
-			if (unlikely(n_pkts < pkt_burst_idx)) {
-				/*
-				 * log error packets number here and do actual
-				 * error processing when applications poll
-				 * completion
-				 */
-				pkt_err = pkt_burst_idx - n_pkts;
-				pkt_idx++;
-				pkt_burst_idx = 0;
-				break;
-			}
+	if (unlikely(pkt_idx == 0))
+		return 0;
 
-			pkt_burst_idx = 0;
-		}
-	}
+	async_fill_descs(async, async_descs);
 
-	if (pkt_burst_idx) {
-		n_xfer = async->ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx);
-		if (likely(n_xfer >= 0)) {
-			n_pkts = n_xfer;
-		} else {
-			VHOST_LOG_DATA(ERR, "(%d) %s: failed to transfer data for queue id %d.\n",
+	n_xfer = async->ops.transfer_data(dev->vid, queue_id, async_descs, 0, pkt_idx);
+	if (unlikely(n_xfer < 0)) {
+		VHOST_LOG_DATA(ERR, "(%d) %s: failed to transfer data for queue id %d.\n",
 				dev->vid, __func__, queue_id);
-			n_pkts = 0;
-		}
-
-		if (unlikely(n_pkts < pkt_burst_idx))
-			pkt_err = pkt_burst_idx - n_pkts;
+		n_xfer = 0;
 	}
 
+	pkt_err = pkt_idx - n_xfer;
 	if (unlikely(pkt_err)) {
 		uint16_t num_descs = 0;
 
 		/* update number of completed packets */
-		pkt_idx -= pkt_err;
+		pkt_idx = n_xfer;
 
 		/* calculate the sum of descriptors to revert */
 		while (pkt_err-- > 0) {
@@ -1686,9 +1694,7 @@  vhost_enqueue_async_packed(struct virtio_net *dev,
 			    struct rte_mbuf *pkt,
 			    struct buf_vector *buf_vec,
 			    uint16_t *nr_descs,
-			    uint16_t *nr_buffers,
-			    struct rte_vhost_iovec *iovec,
-			    struct rte_vhost_iov_iter *iter)
+			    uint16_t *nr_buffers)
 {
 	uint16_t nr_vec = 0;
 	uint16_t avail_idx = vq->last_avail_idx;
@@ -1736,7 +1742,7 @@  vhost_enqueue_async_packed(struct virtio_net *dev,
 	}
 
 	if (unlikely(async_mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec,
-					*nr_buffers, iovec, iter) < 0))
+					*nr_buffers) < 0))
 		return -1;
 
 	vhost_shadow_enqueue_packed(vq, buffer_len, buffer_buf_id, buffer_desc_count, *nr_buffers);
@@ -1746,13 +1752,12 @@  vhost_enqueue_async_packed(struct virtio_net *dev,
 
 static __rte_always_inline int16_t
 virtio_dev_rx_async_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
-			    struct rte_mbuf *pkt, uint16_t *nr_descs, uint16_t *nr_buffers,
-			    struct rte_vhost_iovec *iovec, struct rte_vhost_iov_iter *iter)
+			    struct rte_mbuf *pkt, uint16_t *nr_descs, uint16_t *nr_buffers)
 {
 	struct buf_vector buf_vec[BUF_VECTOR_MAX];
 
-	if (unlikely(vhost_enqueue_async_packed(dev, vq, pkt, buf_vec, nr_descs, nr_buffers,
-						 iovec, iter) < 0)) {
+	if (unlikely(vhost_enqueue_async_packed(dev, vq, pkt, buf_vec,
+					nr_descs, nr_buffers) < 0)) {
 		VHOST_LOG_DATA(DEBUG, "(%d) failed to get enough desc from vring\n", dev->vid);
 		return -1;
 	}
@@ -1794,20 +1799,17 @@  virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 	struct vhost_virtqueue *vq, uint16_t queue_id,
 	struct rte_mbuf **pkts, uint32_t count)
 {
-	uint32_t pkt_idx = 0, pkt_burst_idx = 0;
+	uint32_t pkt_idx = 0;
 	uint32_t remained = count;
 	int32_t n_xfer;
 	uint16_t num_buffers;
 	uint16_t num_descs;
 
 	struct vhost_async *async = vq->async;
-	struct rte_vhost_iov_iter *iter = async->iov_iter;
-	struct rte_vhost_async_desc tdes[MAX_PKT_BURST];
-	struct rte_vhost_iovec *iovec = async->iovec;
+	struct rte_vhost_async_desc async_descs[MAX_PKT_BURST];
 	struct async_inflight_info *pkts_info = async->pkts_info;
-	uint32_t n_pkts = 0, pkt_err = 0;
+	uint32_t pkt_err = 0;
 	uint16_t slot_idx = 0;
-	uint16_t iovec_idx = 0, it_idx = 0;
 
 	do {
 		rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
@@ -1815,71 +1817,36 @@  virtio_dev_rx_async_submit_packed(struct virtio_net *dev,
 		num_buffers = 0;
 		num_descs = 0;
 		if (unlikely(virtio_dev_rx_async_packed(dev, vq, pkts[pkt_idx],
-						&num_descs, &num_buffers,
-						&iovec[iovec_idx], &iter[it_idx]) < 0))
+						&num_descs, &num_buffers) < 0))
 			break;
 
 		slot_idx = (async->pkts_idx + pkt_idx) % vq->size;
 
-		async_fill_desc(&tdes[pkt_burst_idx++], &iter[it_idx]);
 		pkts_info[slot_idx].descs = num_descs;
 		pkts_info[slot_idx].nr_buffers = num_buffers;
 		pkts_info[slot_idx].mbuf = pkts[pkt_idx];
-		iovec_idx += iter[it_idx].nr_segs;
-		it_idx++;
 
 		pkt_idx++;
 		remained--;
 		vq_inc_last_avail_packed(vq, num_descs);
+	} while (pkt_idx < count);
 
-		/*
-		 * condition to trigger async device transfer:
-		 * - unused async iov number is less than max vhost vector
-		 */
-		if (unlikely(VHOST_MAX_ASYNC_VEC - iovec_idx < BUF_VECTOR_MAX)) {
-			n_xfer = async->ops.transfer_data(dev->vid,
-					queue_id, tdes, 0, pkt_burst_idx);
-			if (likely(n_xfer >= 0)) {
-				n_pkts = n_xfer;
-			} else {
-				VHOST_LOG_DATA(ERR,
-					"(%d) %s: failed to transfer data for queue id %d.\n",
-					dev->vid, __func__, queue_id);
-				n_pkts = 0;
-			}
-
-			iovec_idx = 0;
-			it_idx = 0;
-
-			if (unlikely(n_pkts < pkt_burst_idx)) {
-				/*
-				 * log error packets number here and do actual
-				 * error processing when applications poll
-				 * completion
-				 */
-				pkt_err = pkt_burst_idx - n_pkts;
-				pkt_burst_idx = 0;
-				break;
-			}
+	if (unlikely(pkt_idx == 0))
+		return 0;
 
-			pkt_burst_idx = 0;
-		}
-	} while (pkt_idx < count);
+	async_fill_descs(async, async_descs);
 
-	if (pkt_burst_idx) {
-		n_xfer = async->ops.transfer_data(dev->vid, queue_id, tdes, 0, pkt_burst_idx);
-		if (likely(n_xfer >= 0)) {
-			n_pkts = n_xfer;
-		} else {
-			VHOST_LOG_DATA(ERR, "(%d) %s: failed to transfer data for queue id %d.\n",
+	n_xfer = async->ops.transfer_data(dev->vid, queue_id, async_descs, 0, pkt_idx);
+	if (unlikely(n_xfer < 0)) {
+		VHOST_LOG_DATA(ERR, "(%d) %s: failed to transfer data for queue id %d.\n",
 				dev->vid, __func__, queue_id);
-			n_pkts = 0;
-		}
-
-		if (unlikely(n_pkts < pkt_burst_idx))
-			pkt_err = pkt_burst_idx - n_pkts;
+		n_xfer = 0;
 	}
 
+	pkt_err = pkt_idx - n_xfer;
+
+	async_iter_reset(async);
+
 	if (unlikely(pkt_err))
 		dma_error_handler_packed(vq, slot_idx, pkt_err, &pkt_idx);
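After the refactoring, the tail of both submit functions reduces to one arithmetic step: clamp a failed transfer to zero accepted packets and derive the count to revert. A minimal sketch of that logic (`pkt_err_from_xfer()` is a hypothetical helper mirroring the diff, not a lib/vhost function):

```c
/* Error accounting at the end of virtio_dev_rx_async_submit_split/packed:
 * transfer_data() returns the number of accepted packets, or a negative
 * value on hard failure, which is treated as zero accepted. */
#include <assert.h>
#include <stdint.h>

static uint32_t pkt_err_from_xfer(int32_t n_xfer, uint32_t pkt_idx)
{
	if (n_xfer < 0)
		n_xfer = 0;             /* hard failure: nothing accepted */
	return pkt_idx - (uint32_t)n_xfer;      /* packets to revert */
}
```

The caller then rewinds the shadow ring and avail index for that many packets, exactly as the `if (unlikely(pkt_err))` branches in the diff do.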