vhost: batch used descriptor chains write-back with packed ring

Message ID 20181212082403.12002-1-maxime.coquelin@redhat.com (mailing list archive)
State Superseded, archived
Delegated to: Maxime Coquelin
Series vhost: batch used descriptor chains write-back with packed ring

Checks

Context                          Check    Description
ci/Intel-compilation             success  Compilation OK
ci/mellanox-Performance-Testing  success  Performance Testing PASS
ci/intel-Performance-Testing     success  Performance Testing PASS

Commit Message

Maxime Coquelin Dec. 12, 2018, 8:24 a.m. UTC
  Instead of writing back descriptor chains in order, let's
write the first chain's flags last in order to improve batching.

With the kernel's pktgen benchmark, a ~3% performance gain is measured.

Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
 lib/librte_vhost/virtio_net.c | 39 +++++++++++++++++++++--------------
 1 file changed, 24 insertions(+), 15 deletions(-)
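
The core idea: a used descriptor chain only becomes visible to the guest once
its flags are written, and the guest polls at the head chain, so every chain
except the first can be written back eagerly. A condensed sketch of the
resulting control flow, assuming the vq/shadow structures from
lib/librte_vhost/virtio_net.c (compute_used_flags() is a hypothetical
stand-in for the AVAIL/USED wrap-counter logic; the actual diff is at the
bottom of the page):

static void
flush_shadow_used_ring_packed_sketch(struct vhost_virtqueue *vq)
{
	uint16_t i, flags, head_idx = vq->last_used_idx, head_flags = 0;

	if (vq->shadow_used_idx == 0)
		return;

	for (i = 0; i < vq->shadow_used_idx; i++) {
		flags = compute_used_flags(vq, i); /* hypothetical helper */

		vq->desc_packed[vq->last_used_idx].id =
			vq->shadow_used_packed[i].id;
		vq->desc_packed[vq->last_used_idx].len =
			vq->shadow_used_packed[i].len;

		if (i > 0) {
			/* Non-head chains can be exposed right away: the
			 * guest will not consume past the head chain. */
			vq->desc_packed[vq->last_used_idx].flags = flags;
		} else {
			/* Hold back the head chain's flags until the end. */
			head_idx = vq->last_used_idx;
			head_flags = flags;
		}

		vq->last_used_idx += vq->shadow_used_packed[i].count;
		if (vq->last_used_idx >= vq->size)
			vq->last_used_idx -= vq->size;
	}

	rte_smp_wmb();
	/* Writing the head flags last publishes the whole batch at once. */
	vq->desc_packed[head_idx].flags = head_flags;
	vq->shadow_used_idx = 0;
}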
  

Comments

Maxime Coquelin Dec. 12, 2018, 9:41 a.m. UTC | #1
Sorry, I just noticed that I forgot to add the v2 prefix to the commit message.

On 12/12/18 9:24 AM, Maxime Coquelin wrote:
> Instead of writing back descriptor chains in order, let's
> write the first chain's flags last in order to improve batching.
> 
> With the kernel's pktgen benchmark, a ~3% performance gain is measured.
> 
> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> ---
>   lib/librte_vhost/virtio_net.c | 39 +++++++++++++++++++++--------------
>   1 file changed, 24 insertions(+), 15 deletions(-)
>
  
Ilya Maximets Dec. 12, 2018, 3:23 p.m. UTC | #2
On 12.12.2018 11:24, Maxime Coquelin wrote:
> Instead of writing back descriptor chains in order, let's
> write the first chain's flags last in order to improve batching.
> 
> With the kernel's pktgen benchmark, a ~3% performance gain is measured.
> 
> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> ---
>  lib/librte_vhost/virtio_net.c | 39 +++++++++++++++++++++--------------
>  1 file changed, 24 insertions(+), 15 deletions(-)
> 

Hi.
I did some rough testing on my ARMv8 system with this patch and v1 of it.
Here is the performance difference relative to current master:
    v1: +1.1 %
    v2: -3.6 %

So, write barriers are quite heavy in practice.

My testcase is three instances of testpmd on the same host (with v11 from Jens):

    txonly (virtio_user0) --> fwd mode io (vhost0, vhost1) --> rxonly (virtio_user1)

Best regards, Ilya Maximets.
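
For context, rte_smp_wmb() compiles down very differently per architecture,
which lines up with the numbers above. A sketch paraphrasing DPDK's per-arch
rte_atomic headers (suffixes added here to show both variants side by side;
check the tree for the authoritative definitions):

/* x86: stores are already ordered with each other, so only the
 * compiler is fenced -- essentially free at run time. */
#define rte_smp_wmb_x86()	asm volatile ("" : : : "memory")

/* armv8: a real store-store barrier instruction is emitted, so one
 * barrier per descriptor write-back is comparatively expensive. */
#define rte_smp_wmb_armv8()	asm volatile ("dmb ishst" : : : "memory")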

> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 5e1a1a727..c0b3d1137 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -135,19 +135,10 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>  			struct vhost_virtqueue *vq)
>  {
>  	int i;
> -	uint16_t used_idx = vq->last_used_idx;
> +	uint16_t head_flags, head_idx = vq->last_used_idx;
>  
> -	/* Split loop in two to save memory barriers */
> -	for (i = 0; i < vq->shadow_used_idx; i++) {
> -		vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
> -		vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
> -
> -		used_idx += vq->shadow_used_packed[i].count;
> -		if (used_idx >= vq->size)
> -			used_idx -= vq->size;
> -	}
> -
> -	rte_smp_wmb();
> +	if (unlikely(vq->shadow_used_idx == 0))
> +		return;
>  
>  	for (i = 0; i < vq->shadow_used_idx; i++) {
>  		uint16_t flags;
> @@ -165,12 +156,24 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>  			flags &= ~VRING_DESC_F_AVAIL;
>  		}
>  
> -		vq->desc_packed[vq->last_used_idx].flags = flags;
> +		vq->desc_packed[vq->last_used_idx].id =
> +			vq->shadow_used_packed[i].id;
> +		vq->desc_packed[vq->last_used_idx].len =
> +			vq->shadow_used_packed[i].len;
> +
> +		rte_smp_wmb();
>  
> -		vhost_log_cache_used_vring(dev, vq,
> +		if (i > 0) {
> +			vq->desc_packed[vq->last_used_idx].flags = flags;
> +
> +			vhost_log_cache_used_vring(dev, vq,
>  					vq->last_used_idx *
>  					sizeof(struct vring_packed_desc),
>  					sizeof(struct vring_packed_desc));
> +		} else {
> +			head_idx = vq->last_used_idx;
> +			head_flags = flags;
> +		}
>  
>  		vq->last_used_idx += vq->shadow_used_packed[i].count;
>  		if (vq->last_used_idx >= vq->size) {
> @@ -179,8 +182,14 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>  		}
>  	}
>  
> -	rte_smp_wmb();
> +	vq->desc_packed[head_idx].flags = head_flags;
>  	vq->shadow_used_idx = 0;
> +
> +	vhost_log_cache_used_vring(dev, vq,
> +				head_idx *
> +				sizeof(struct vring_packed_desc),
> +				sizeof(struct vring_packed_desc));
> +
>  	vhost_log_cache_sync(dev, vq);
>  }
>  
>
  
Maxime Coquelin Dec. 12, 2018, 4:34 p.m. UTC | #3
Hi Ilya,

On 12/12/18 4:23 PM, Ilya Maximets wrote:
> On 12.12.2018 11:24, Maxime Coquelin wrote:
>> Instead of writing back descriptor chains in order, let's
>> write the first chain's flags last in order to improve batching.
>>
>> With the kernel's pktgen benchmark, a ~3% performance gain is measured.
>>
>> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>> ---
>>   lib/librte_vhost/virtio_net.c | 39 +++++++++++++++++++++--------------
>>   1 file changed, 24 insertions(+), 15 deletions(-)
>>
> 
> Hi.
> I did some rough testing on my ARMv8 system with this patch and v1 of it.
> Here is the performance difference relative to current master:
>      v1: +1.1 %
>      v2: -3.6 %
> 
> So, write barriers are quite heavy in practice.

Thanks for testing it on ARM. Indeed, SMP WMB is heavier on ARM.

To reduce the number of WMBs, I propose reverting to the original
implementation: first write all the .len and .id fields, then do the write
barrier, then write all the flags, with the first one written last:

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 5e1a1a727..58a277c53 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -136,6 +136,8 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
  {
         int i;
         uint16_t used_idx = vq->last_used_idx;
+       uint16_t head_idx = vq->last_used_idx;
+       uint16_t head_flags = 0;

         /* Split loop in two to save memory barriers */
         for (i = 0; i < vq->shadow_used_idx; i++) {
@@ -165,12 +167,17 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
                         flags &= ~VRING_DESC_F_AVAIL;
                 }

-               vq->desc_packed[vq->last_used_idx].flags = flags;
+               if (i > 0) {
+                       vq->desc_packed[vq->last_used_idx].flags = flags;

-               vhost_log_cache_used_vring(dev, vq,
+                       vhost_log_cache_used_vring(dev, vq,
                                         vq->last_used_idx *
                                         sizeof(struct vring_packed_desc),
                                         sizeof(struct vring_packed_desc));
+               } else {
+                       head_idx = vq->last_used_idx;
+                       head_flags = flags;
+               }

                 vq->last_used_idx += vq->shadow_used_packed[i].count;
                 if (vq->last_used_idx >= vq->size) {
@@ -179,7 +186,13 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
                 }
         }

-       rte_smp_wmb();
+       vq->desc_packed[head_idx].flags = head_flags;
+
+       vhost_log_cache_used_vring(dev, vq,
+                               head_idx *
+                               sizeof(struct vring_packed_desc),
+                               sizeof(struct vring_packed_desc));
+
         vq->shadow_used_idx = 0;
         vhost_log_cache_sync(dev, vq);
  }


> My testcase is three instances of testpmd on the same host (with v11 from Jens):
> 
>      txonly (virtio_user0) --> fwd mode io (vhost0, vhost1) --> rxonly (virtio_user1)
> 
> Best regards, Ilya Maximets.
> 
>> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
>> index 5e1a1a727..c0b3d1137 100644
>> --- a/lib/librte_vhost/virtio_net.c
>> +++ b/lib/librte_vhost/virtio_net.c
>> @@ -135,19 +135,10 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>>   			struct vhost_virtqueue *vq)
>>   {
>>   	int i;
>> -	uint16_t used_idx = vq->last_used_idx;
>> +	uint16_t head_flags, head_idx = vq->last_used_idx;
>>   
>> -	/* Split loop in two to save memory barriers */
>> -	for (i = 0; i < vq->shadow_used_idx; i++) {
>> -		vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
>> -		vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
>> -
>> -		used_idx += vq->shadow_used_packed[i].count;
>> -		if (used_idx >= vq->size)
>> -			used_idx -= vq->size;
>> -	}
>> -
>> -	rte_smp_wmb();
>> +	if (unlikely(vq->shadow_used_idx == 0))
>> +		return;
>>   
>>   	for (i = 0; i < vq->shadow_used_idx; i++) {
>>   		uint16_t flags;
>> @@ -165,12 +156,24 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>>   			flags &= ~VRING_DESC_F_AVAIL;
>>   		}
>>   
>> -		vq->desc_packed[vq->last_used_idx].flags = flags;
>> +		vq->desc_packed[vq->last_used_idx].id =
>> +			vq->shadow_used_packed[i].id;
>> +		vq->desc_packed[vq->last_used_idx].len =
>> +			vq->shadow_used_packed[i].len;
>> +
>> +		rte_smp_wmb();
>>   
>> -		vhost_log_cache_used_vring(dev, vq,
>> +		if (i > 0) {
>> +			vq->desc_packed[vq->last_used_idx].flags = flags;
>> +
>> +			vhost_log_cache_used_vring(dev, vq,
>>   					vq->last_used_idx *
>>   					sizeof(struct vring_packed_desc),
>>   					sizeof(struct vring_packed_desc));
>> +		} else {
>> +			head_idx = vq->last_used_idx;
>> +			head_flags = flags;
>> +		}
>>   
>>   		vq->last_used_idx += vq->shadow_used_packed[i].count;
>>   		if (vq->last_used_idx >= vq->size) {
>> @@ -179,8 +182,14 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>>   		}
>>   	}
>>   
>> -	rte_smp_wmb();
>> +	vq->desc_packed[head_idx].flags = head_flags;
>>   	vq->shadow_used_idx = 0;
>> +
>> +	vhost_log_cache_used_vring(dev, vq,
>> +				head_idx *
>> +				sizeof(struct vring_packed_desc),
>> +				sizeof(struct vring_packed_desc));
>> +
>>   	vhost_log_cache_sync(dev, vq);
>>   }
>>   
>>
  
Michael S. Tsirkin Dec. 12, 2018, 6:53 p.m. UTC | #4
On Wed, Dec 12, 2018 at 05:34:31PM +0100, Maxime Coquelin wrote:
> Hi Ilya,
> 
> On 12/12/18 4:23 PM, Ilya Maximets wrote:
> > On 12.12.2018 11:24, Maxime Coquelin wrote:
> > > Instead of writing back descriptor chains in order, let's
> > > write the first chain's flags last in order to improve batching.
> > > 
> > > With the kernel's pktgen benchmark, a ~3% performance gain is measured.
> > > 
> > > Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> > > ---
> > >   lib/librte_vhost/virtio_net.c | 39 +++++++++++++++++++++--------------
> > >   1 file changed, 24 insertions(+), 15 deletions(-)
> > > 
> > 
> > Hi.
> > I did some rough testing on my ARMv8 system with this patch and v1 of it.
> > Here is the performance difference relative to current master:
> >      v1: +1.1 %
> >      v2: -3.6 %
> > 
> > So, write barriers are quite heavy in practice.
> 
> Thanks for testing it on ARM. Indeed, SMP WMB is heavier on ARM.

Besides your ideas for improving packed rings, maybe we should switch to
load_acquire/store_release?

See
	virtio: use smp_load_acquire/smp_store_release

which worked fine but, as I only tested on x86, did not result in any gains.
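
A minimal sketch of what that could look like for packed ring descriptors,
using GCC __atomic built-ins (the struct layout follows the virtio 1.1 spec;
the function names are illustrative, not an existing DPDK API):

#include <stdint.h>

struct vring_packed_desc {
	uint64_t addr;
	uint32_t len;
	uint16_t id;
	uint16_t flags;
};

/* Publish one used descriptor: the release store to .flags orders the
 * preceding .id/.len stores without a standalone rte_smp_wmb(). */
static inline void
used_desc_store_release(struct vring_packed_desc *desc,
			uint16_t id, uint32_t len, uint16_t flags)
{
	desc->id = id;
	desc->len = len;
	__atomic_store_n(&desc->flags, flags, __ATOMIC_RELEASE);
}

/* Check availability: the acquire load of .flags orders the subsequent
 * reads of the descriptor body after it, replacing an explicit rmb. */
static inline int
desc_is_avail_acquire(struct vring_packed_desc *desc, int wrap_counter)
{
	uint16_t flags = __atomic_load_n(&desc->flags, __ATOMIC_ACQUIRE);
	int avail = !!(flags & (1 << 7));	/* VRING_DESC_F_AVAIL */
	int used = !!(flags & (1 << 15));	/* VRING_DESC_F_USED */

	return avail == wrap_counter && used != wrap_counter;
}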
  
Maxime Coquelin Dec. 19, 2018, 9:16 a.m. UTC | #5
On 12/12/18 7:53 PM, Michael S. Tsirkin wrote:
> On Wed, Dec 12, 2018 at 05:34:31PM +0100, Maxime Coquelin wrote:
>> Hi Ilya,
>>
>> On 12/12/18 4:23 PM, Ilya Maximets wrote:
>>> On 12.12.2018 11:24, Maxime Coquelin wrote:
>>>> Instead of writing back descriptor chains in order, let's
>>>> write the first chain's flags last in order to improve batching.
>>>>
>>>> With the kernel's pktgen benchmark, a ~3% performance gain is measured.
>>>>
>>>> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>>>> ---
>>>>    lib/librte_vhost/virtio_net.c | 39 +++++++++++++++++++++--------------
>>>>    1 file changed, 24 insertions(+), 15 deletions(-)
>>>>
>>>
>>> Hi.
>>> I did some rough testing on my ARMv8 system with this patch and v1 of it.
>>> Here is the performance difference relative to current master:
>>>       v1: +1.1 %
>>>       v2: -3.6 %
>>>
>>> So, write barriers are quite heavy in practice.
>>
>> Thanks for testing it on ARM. Indeed, SMP WMB is heavier on ARM.
> 
> Besides your ideas for improving packed rings, maybe we should switch to
> load_acquire/store_release?
> 
> See
> 	virtio: use smp_load_acquire/smp_store_release
> 
> which worked fine but, as I only tested on x86, did not result in any gains.
> 

Thanks for the pointer.
We'll look into it for v19.05, as -rc1 for v19.02 is planned for the end of
the week, so it is too late to introduce such changes now.

Regards,
Maxime
  
Michael S. Tsirkin Dec. 19, 2018, 4:10 p.m. UTC | #6
On Wed, Dec 19, 2018 at 10:16:24AM +0100, Maxime Coquelin wrote:
> 
> 
> On 12/12/18 7:53 PM, Michael S. Tsirkin wrote:
> > On Wed, Dec 12, 2018 at 05:34:31PM +0100, Maxime Coquelin wrote:
> > > Hi Ilya,
> > > 
> > > On 12/12/18 4:23 PM, Ilya Maximets wrote:
> > > > On 12.12.2018 11:24, Maxime Coquelin wrote:
> > > > > Instead of writing back descriptor chains in order, let's
> > > > > write the first chain's flags last in order to improve batching.
> > > > > 
> > > > > With the kernel's pktgen benchmark, a ~3% performance gain is measured.
> > > > > 
> > > > > Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> > > > > ---
> > > > >    lib/librte_vhost/virtio_net.c | 39 +++++++++++++++++++++--------------
> > > > >    1 file changed, 24 insertions(+), 15 deletions(-)
> > > > > 
> > > > 
> > > > Hi.
> > > > I did some rough testing on my ARMv8 system with this patch and v1 of it.
> > > > Here is the performance difference relative to current master:
> > > >       v1: +1.1 %
> > > >       v2: -3.6 %
> > > > 
> > > > So, write barriers are quite heavy in practice.
> > > 
> > > Thanks for testing it on ARM. Indeed, SMP WMB is heavier on ARM.
> > 
> > Besides your ideas for improving packed rings, maybe we should switch to
> > load_acquire/store_release?
> > 
> > See
> > 	virtio: use smp_load_acquire/smp_store_release
> > 
> > which worked fine but, as I only tested on x86, did not result in any gains.
> > 
> 
> Thanks for the pointer.
> We'll look into it for v19.05, as -rc1 for v19.02 is planned for the end of
> the week, so it is too late to introduce such changes now.
> 
> Regards,
> Maxime

That's not the only option, BTW. For loads, another option is to work
the value into an indirect dependency, which does not need a barrier.

For example:

/* Launder 'var' through an empty asm statement so the compiler can no
 * longer track its value (same definition as in Linux's compiler.h). */
#define OPTIMIZER_HIDE_VAR(var)                                         \
        __asm__ ("" : "=r" (var) : "0" (var))

/* Inside the branch 'empty' is necessarily 0, but since its value is
 * hidden, 'last_used + empty' must be computed from the result of the
 * comparison with idx->used. That gives the descriptor load an address
 * dependency on the index load, so no barrier is needed between them. */
unsigned empty = last_used == idx->used;
if (!empty) {
	OPTIMIZER_HIDE_VAR(empty);
	desc = used->ring[last_used + empty];
}


See Linux for the definitions of OPTIMIZER_HIDE_VAR.

One side effect is that this also blocks code speculation, which can be
a good or a bad thing for performance, but can be a good thing for
security.
  

Patch

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 5e1a1a727..c0b3d1137 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -135,19 +135,10 @@  flush_shadow_used_ring_packed(struct virtio_net *dev,
 			struct vhost_virtqueue *vq)
 {
 	int i;
-	uint16_t used_idx = vq->last_used_idx;
+	uint16_t head_flags, head_idx = vq->last_used_idx;
 
-	/* Split loop in two to save memory barriers */
-	for (i = 0; i < vq->shadow_used_idx; i++) {
-		vq->desc_packed[used_idx].id = vq->shadow_used_packed[i].id;
-		vq->desc_packed[used_idx].len = vq->shadow_used_packed[i].len;
-
-		used_idx += vq->shadow_used_packed[i].count;
-		if (used_idx >= vq->size)
-			used_idx -= vq->size;
-	}
-
-	rte_smp_wmb();
+	if (unlikely(vq->shadow_used_idx == 0))
+		return;
 
 	for (i = 0; i < vq->shadow_used_idx; i++) {
 		uint16_t flags;
@@ -165,12 +156,24 @@  flush_shadow_used_ring_packed(struct virtio_net *dev,
 			flags &= ~VRING_DESC_F_AVAIL;
 		}
 
-		vq->desc_packed[vq->last_used_idx].flags = flags;
+		vq->desc_packed[vq->last_used_idx].id =
+			vq->shadow_used_packed[i].id;
+		vq->desc_packed[vq->last_used_idx].len =
+			vq->shadow_used_packed[i].len;
+
+		rte_smp_wmb();
 
-		vhost_log_cache_used_vring(dev, vq,
+		if (i > 0) {
+			vq->desc_packed[vq->last_used_idx].flags = flags;
+
+			vhost_log_cache_used_vring(dev, vq,
 					vq->last_used_idx *
 					sizeof(struct vring_packed_desc),
 					sizeof(struct vring_packed_desc));
+		} else {
+			head_idx = vq->last_used_idx;
+			head_flags = flags;
+		}
 
 		vq->last_used_idx += vq->shadow_used_packed[i].count;
 		if (vq->last_used_idx >= vq->size) {
@@ -179,8 +182,14 @@  flush_shadow_used_ring_packed(struct virtio_net *dev,
 		}
 	}
 
-	rte_smp_wmb();
+	vq->desc_packed[head_idx].flags = head_flags;
 	vq->shadow_used_idx = 0;
+
+	vhost_log_cache_used_vring(dev, vq,
+				head_idx *
+				sizeof(struct vring_packed_desc),
+				sizeof(struct vring_packed_desc));
+
 	vhost_log_cache_sync(dev, vq);
 }