vhost: batch used descriptor chains write-back with packed ring
Commit Message
Instead of writing back descriptor chains in order, let's
write the first chain's flags last in order to improve batching.

With the kernel's pktgen benchmark, a ~3% performance gain is measured.
Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
lib/librte_vhost/virtio_net.c | 39 +++++++++++++++++++++--------------
1 file changed, 24 insertions(+), 15 deletions(-)
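Before the discussion below, a compilable toy sketch of the scheme the commit message describes. The types and helper names here are simplified stand-ins for the vhost structures, not the patch itself: each chain's .id/.len is written first, but the head chain's flags are held back and stored only after every other chain, so the guest, which polls the head descriptor's flags, sees the whole batch become used at once.

#include <stdint.h>

/* Hypothetical, simplified stand-ins for vring_packed_desc and the
 * shadow used ring; wrap-counter and ring-size handling are elided. */
struct pdesc { uint16_t id; uint32_t len; uint16_t flags; };
struct shadow { uint16_t id; uint32_t len; uint16_t flags; uint16_t count; };

static inline void wmb(void) { __sync_synchronize(); }

static void
flush_used_sketch(struct pdesc *desc, const struct shadow *sh, int n)
{
	uint16_t head_idx = 0, head_flags = 0, idx = 0;
	int i;

	if (n == 0)
		return;

	for (i = 0; i < n; i++) {
		desc[idx].id  = sh[i].id;
		desc[idx].len = sh[i].len;
		wmb();		/* .id/.len must be visible before .flags */

		if (i > 0) {
			desc[idx].flags = sh[i].flags;
		} else {
			/* Hold back the head chain's flags... */
			head_idx = idx;
			head_flags = sh[i].flags;
		}
		idx += sh[i].count;
	}

	/* ...and write them last: the guest polls the head descriptor's
	 * flags, so the whole batch becomes visible in one go. */
	desc[head_idx].flags = head_flags;
}
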
Comments
Sorry, I just noticed I forgot to add the v2 prefix to the commit message.
On 12.12.2018 11:24, Maxime Coquelin wrote:
> Instead of writing back descriptor chains in order, let's
> write the first chain's flags last in order to improve batching.
Hi.
I did some rough testing on my ARMv8 system with this patch and with v1 of it.
Here is the performance difference relative to the current master:

v1: +1.1 %
v2: -3.6 %

So, write barriers are quite heavy in practice.

My test case is three instances of testpmd on the same host (with v11 from Jens):
txonly (virtio_user0) --> fwd mode io (vhost0, vhost1) --> rxonly (virtio_user1)
Best regards, Ilya Maximets.
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 5e1a1a727..c0b3d1137 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -135,19 +135,10 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>  			struct vhost_virtqueue *vq)
>  {
>  	int i;
> -	uint16_t used_idx = vq->last_used_idx;
> +	uint16_t head_flags, head_idx = vq->last_used_idx;
>  
> -	/* Split loop in two to save memory barriers */
> -	for (i = 0; i < vq->shadow_used_idx; i++) {
> -		vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
> -		vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
> -
> -		used_idx += vq->shadow_used_packed[i].count;
> -		if (used_idx >= vq->size)
> -			used_idx -= vq->size;
> -	}
> -
> -	rte_smp_wmb();
> +	if (unlikely(vq->shadow_used_idx == 0))
> +		return;
>  
>  	for (i = 0; i < vq->shadow_used_idx; i++) {
>  		uint16_t flags;
> @@ -165,12 +156,24 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>  			flags &= ~VRING_DESC_F_AVAIL;
>  		}
>  
> -		vq->desc_packed[vq->last_used_idx].flags = flags;
> +		vq->desc_packed[vq->last_used_idx].id =
> +			vq->shadow_used_packed[i].id;
> +		vq->desc_packed[vq->last_used_idx].len =
> +			vq->shadow_used_packed[i].len;
> +
> +		rte_smp_wmb();
>  
> -		vhost_log_cache_used_vring(dev, vq,
> +		if (i > 0) {
> +			vq->desc_packed[vq->last_used_idx].flags = flags;
> +
> +			vhost_log_cache_used_vring(dev, vq,
>  					vq->last_used_idx *
>  					sizeof(struct vring_packed_desc),
>  					sizeof(struct vring_packed_desc));
> +		} else {
> +			head_idx = vq->last_used_idx;
> +			head_flags = flags;
> +		}
>  
>  		vq->last_used_idx += vq->shadow_used_packed[i].count;
>  		if (vq->last_used_idx >= vq->size) {
> @@ -179,8 +182,14 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>  		}
>  	}
>  
> -	rte_smp_wmb();
> +	vq->desc_packed[head_idx].flags = head_flags;
>  	vq->shadow_used_idx = 0;
> +
> +	vhost_log_cache_used_vring(dev, vq,
> +				   head_idx *
> +				   sizeof(struct vring_packed_desc),
> +				   sizeof(struct vring_packed_desc));
> +
>  	vhost_log_cache_sync(dev, vq);
>  }
Hi Ilya,
On 12/12/18 4:23 PM, Ilya Maximets wrote:
> I did some rough testing on my ARMv8 system with this patch and with v1 of it.
> Here is the performance difference relative to the current master:
>
> v1: +1.1 %
> v2: -3.6 %
>
> So, write barriers are quite heavy in practice.
Thanks for testing it on ARM. Indeed, SMP WMB is heavier on ARM.

To reduce the number of WMBs, I propose reverting to the original
implementation: first write all the .len and .id fields, issue the write
barrier, then write all the flags, writing the first one last:
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 5e1a1a727..58a277c53 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -136,6 +136,8 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
 {
 	int i;
 	uint16_t used_idx = vq->last_used_idx;
+	uint16_t head_idx = vq->last_used_idx;
+	uint16_t head_flags = 0;
 
 	/* Split loop in two to save memory barriers */
 	for (i = 0; i < vq->shadow_used_idx; i++) {
@@ -165,12 +167,17 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
 			flags &= ~VRING_DESC_F_AVAIL;
 		}
 
-		vq->desc_packed[vq->last_used_idx].flags = flags;
+		if (i > 0) {
+			vq->desc_packed[vq->last_used_idx].flags = flags;
 
-		vhost_log_cache_used_vring(dev, vq,
+			vhost_log_cache_used_vring(dev, vq,
 					vq->last_used_idx *
 					sizeof(struct vring_packed_desc),
 					sizeof(struct vring_packed_desc));
+		} else {
+			head_idx = vq->last_used_idx;
+			head_flags = flags;
+		}
 
 		vq->last_used_idx += vq->shadow_used_packed[i].count;
 		if (vq->last_used_idx >= vq->size) {
@@ -179,7 +186,13 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
 		}
 	}
 
-	rte_smp_wmb();
+	vq->desc_packed[head_idx].flags = head_flags;
+
+	vhost_log_cache_used_vring(dev, vq,
+				   head_idx *
+				   sizeof(struct vring_packed_desc),
+				   sizeof(struct vring_packed_desc));
+
 	vq->shadow_used_idx = 0;
 	vhost_log_cache_sync(dev, vq);
 }
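For reference, here is a compilable toy sketch (helper names are mine, not DPDK's) of why the two variants differ so much on ARM: v2 issues a write barrier per used chain, while the rework above goes back to a single barrier for the whole batch.

/* Toy comparison of the barrier counts at stake; write_id_len() and
 * write_flags() are hypothetical stand-ins for the descriptor stores,
 * and wmb() for rte_smp_wmb(), which is roughly a compiler barrier on
 * x86 but a "dmb ishst" on ARMv8. */
static void write_id_len(int i) { (void)i; }
static void write_flags(int i) { (void)i; }
static void wmb(void) { __sync_synchronize(); }

/* v2: one barrier per used chain -- n barriers for n chains. */
static void flush_v2_order(int n)
{
	int i;

	for (i = 0; i < n; i++) {
		write_id_len(i);
		wmb();
		write_flags(i);
	}
}

/* Reworked proposal: two loops, a single barrier for the whole batch. */
static void flush_two_loops(int n)
{
	int i;

	for (i = 0; i < n; i++)
		write_id_len(i);
	wmb();
	for (i = 0; i < n; i++)
		write_flags(i);
}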
On Wed, Dec 12, 2018 at 05:34:31PM +0100, Maxime Coquelin wrote:
> Thanks for testing it on ARM. Indeed, SMP WMB is heavier on ARM.
Besides your ideas for improving packed rings, maybe we should switch to
load_acquire/store_release?

See

  virtio: use smp_load_acquire/smp_store_release

which worked fine but, as I only tested on x86, did not result in any gains.
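For illustration, a minimal sketch of what that could look like in this code, using GCC's __atomic builtins (the helper names are assumptions, not part of any posted patch): a release store on the flags word would replace the explicit rte_smp_wmb() before it, and an acquire load on it would replace the read barrier on the consumer side.

#include <stdint.h>

/* Sketch only: .flags is the synchronization point of a packed
 * descriptor, so a release store on it publishes the earlier .id/.len
 * stores, and an acquire load on it orders the later .id/.len reads. */
static inline void
desc_store_flags_release(uint16_t *flags, uint16_t val)
{
	__atomic_store_n(flags, val, __ATOMIC_RELEASE);
}

static inline uint16_t
desc_load_flags_acquire(const uint16_t *flags)
{
	return __atomic_load_n(flags, __ATOMIC_ACQUIRE);
}

On x86 both map to plain loads and stores plus a compiler barrier, which is consistent with seeing no gain there; on ARMv8 they map to ldarh/stlrh rather than full dmb barriers.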
On 12/12/18 7:53 PM, Michael S. Tsirkin wrote:
> Besides your ideas for improving packed rings, maybe we should switch to
> load_acquire/store_release?
>
> See
>
>   virtio: use smp_load_acquire/smp_store_release
>
> which worked fine but, as I only tested on x86, did not result in any gains.
Thanks for the pointer.

We'll look into it for v19.05, as -rc1 for v19.02 is planned for the end of
the week, so it is too late to introduce such changes now.

Regards,
Maxime
On Wed, Dec 19, 2018 at 10:16:24AM +0100, Maxime Coquelin wrote:
> Thanks for the pointer.
>
> We'll look into it for v19.05, as -rc1 for v19.02 is planned for the end of
> the week, so it is too late to introduce such changes now.
That's not the only option, BTW. For loads, another option is to work
the value into an indirect dependency, which does not need a barrier.

For example:

#define OPTIMIZER_HIDE_VAR(var) \
	__asm__ ("" : "=r" (var) : "0" (var))

	unsigned empty = last_used == idx->used;
	if (!empty) {
		OPTIMIZER_HIDE_VAR(empty);
		desc = used->ring[last_used + empty];
	}

See Linux for the definition of OPTIMIZER_HIDE_VAR.

One side effect is that this also blocks speculation, which can be a good
or a bad thing for performance, but can be a good thing for security.
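To make the fragment concrete, here is a self-contained sketch under assumed types: the ring layout below is hypothetical, and OPTIMIZER_HIDE_VAR is as in Linux's compiler.h.

#include <stdint.h>

/* As in Linux: make the compiler forget what it knows about 'var', so
 * the dependency carried through it cannot be optimized away. */
#define OPTIMIZER_HIDE_VAR(var) \
	__asm__ ("" : "=r" (var) : "0" (var))

/* Hypothetical used-ring layout, just to make the fragment compile. */
struct used_ring {
	uint16_t idx;
	uint16_t ring[256];
};

static uint16_t
next_used(const volatile struct used_ring *used, uint16_t last_used)
{
	unsigned int empty = (last_used == used->idx);
	uint16_t desc = 0;

	if (!empty) {
		/* 'empty' is known to be 0 here, but hiding that from the
		 * optimizer turns the index below into a load that depends
		 * on the load of used->idx, so the CPU orders the two
		 * accesses without an explicit read barrier. */
		OPTIMIZER_HIDE_VAR(empty);
		desc = used->ring[(last_used + empty) & 255];
	}
	return desc;
}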