[v1,2/2] lib/vhost: restrict pointer aliasing for packed path

Message ID 20200611033248.39049-3-joyce.kong@arm.com (mailing list archive)
State Superseded, archived
Delegated to: Maxime Coquelin
Headers
Series virtio: restrict pointer aliasing for loops vectorization |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/travis-robot success Travis build: passed
ci/Intel-compilation success Compilation OK

Commit Message

Joyce Kong June 11, 2020, 3:32 a.m. UTC
  Restrict pointer aliasing to allow the compiler to vectorize
loops more aggressively.

With this patch, a 9.6% improvement is observed in throughput for the
packed virtio-net PVP case, and a 2.8% improvement in throughput for
the packed virtio-user PVP case. All performance data are measured
under 0.001% acceptable packet loss with 1 core on both vhost and
virtio side.

Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Phil Yang <phil.yang@arm.com>
---
 lib/librte_vhost/virtio_net.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)
  

Comments

Adrian Moreno July 7, 2020, 4:25 p.m. UTC | #1
On 6/11/20 5:32 AM, Joyce Kong wrote:
> Restrict pointer aliasing to allow the compiler to vectorize
> loops more aggressively.
> 
> With this patch, a 9.6% improvement is observed in throughput for the
> packed virtio-net PVP case, and a 2.8% improvement in throughput for
> the packed virtio-user PVP case. All performance data are measured
> under 0.001% acceptable packet loss with 1 core on both vhost and
> virtio side.

Is the performance gain related solely to this patch or is it the result of the
combined series?

> 
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Phil Yang <phil.yang@arm.com>
> ---
>  lib/librte_vhost/virtio_net.c | 14 +++++++-------
>  1 file changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 751c1f373..39c92e7e1 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -1133,8 +1133,8 @@ virtio_dev_rx_single_packed(struct virtio_net *dev,
>  
>  static __rte_noinline uint32_t
>  virtio_dev_rx_packed(struct virtio_net *dev,
> -		     struct vhost_virtqueue *vq,
> -		     struct rte_mbuf **pkts,
> +		     struct vhost_virtqueue *__restrict vq,
> +		     struct rte_mbuf **__restrict pkts,
>  		     uint32_t count)
>  {
>  	uint32_t pkt_idx = 0;

I wonder if we're extracting the full potential of "restrict" considering that
the heavy lifting is done by the inner functions:
(virtio_dev_rx_batch_packed and virtio_dev_rx_single_packed)

> @@ -1219,7 +1219,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
>  
>  uint16_t
>  rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
> -	struct rte_mbuf **pkts, uint16_t count)
> +	struct rte_mbuf **__restrict pkts, uint16_t count)
>  {
>  	struct virtio_net *dev = get_device(vid);
>  
Is this considered an api change?

> @@ -2124,9 +2124,9 @@ free_zmbuf(struct vhost_virtqueue *vq)
>  
>  static __rte_noinline uint16_t
>  virtio_dev_tx_packed_zmbuf(struct virtio_net *dev,
> -			   struct vhost_virtqueue *vq,
> +			   struct vhost_virtqueue *__restrict vq,
>  			   struct rte_mempool *mbuf_pool,
> -			   struct rte_mbuf **pkts,
> +			   struct rte_mbuf **__restrict pkts,
>  			   uint32_t count)
>  {
>  	uint32_t pkt_idx = 0;
> @@ -2160,9 +2160,9 @@ virtio_dev_tx_packed_zmbuf(struct virtio_net *dev,
>  
>  static __rte_noinline uint16_t
>  virtio_dev_tx_packed(struct virtio_net *dev,
> -		     struct vhost_virtqueue *vq,
> +		     struct vhost_virtqueue *__restrict vq,
>  		     struct rte_mempool *mbuf_pool,
> -		     struct rte_mbuf **pkts,
> +		     struct rte_mbuf **__restrict pkts,
>  		     uint32_t count)
>  {
>  	uint32_t pkt_idx = 0;
>
  
Joyce Kong July 10, 2020, 3:15 a.m. UTC | #2
> -----Original Message-----
> From: Adrian Moreno <amorenoz@redhat.com>
> Sent: Wednesday, July 8, 2020 12:26 AM
> To: Joyce Kong <Joyce.Kong@arm.com>; maxime.coquelin@redhat.com;
> jerinj@marvell.com; zhihong.wang@intel.com; xiaolong.ye@intel.com;
> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Phil Yang
> <Phil.Yang@arm.com>; Ruifeng Wang <Ruifeng.Wang@arm.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v1 2/2] lib/vhost: restrict pointer aliasing for
> packed path
> 
> 
> On 6/11/20 5:32 AM, Joyce Kong wrote:
> > Restrict pointer aliasing to allow the compiler to vectorize loops
> > more aggressively.
> >
> > With this patch, a 9.6% improvement is observed in throughput for the
> > packed virtio-net PVP case, and a 2.8% improvement in throughput for
> > the packed virtio-user PVP case. All performance data are measured
> > under 0.001% acceptable packet loss with 1 core on both vhost and
> > virtio side.
> 
> Is the performance gain related solely to this patch or is it the result of the
> combined series?
> 

The performance gain is solely related to this patch on the ThunderX-2 platform.
To test the performance of the vhost patch, I use 1 core on both the vhost and virtio sides.
To test the performance of the virtio patch, I use 2 cores on the vhost side and 1 core on the virtio side
to saturate the vCPU cycles; in this way, the benefit of the patch can be demonstrated.

> >
> > Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > ---
> >  lib/librte_vhost/virtio_net.c | 14 +++++++-------
> >  1 file changed, 7 insertions(+), 7 deletions(-)
> >
> > diff --git a/lib/librte_vhost/virtio_net.c
> > b/lib/librte_vhost/virtio_net.c index 751c1f373..39c92e7e1 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -1133,8 +1133,8 @@ virtio_dev_rx_single_packed(struct virtio_net
> > *dev,
> >
> >  static __rte_noinline uint32_t
> >  virtio_dev_rx_packed(struct virtio_net *dev,
> > -		     struct vhost_virtqueue *vq,
> > -		     struct rte_mbuf **pkts,
> > +		     struct vhost_virtqueue *__restrict vq,
> > +		     struct rte_mbuf **__restrict pkts,
> >  		     uint32_t count)
> >  {
> >  	uint32_t pkt_idx = 0;
> 
> I wonder if we're extracting the full potential of "restrict" considering that the
> heavy lifting is done by the inner functions:
> (virtio_dev_rx_batch_packed and virtio_dev_rx_single_packed)
>

Yes, 'restrict'-qualified parameters of noinline functions will still keep the 'restrict'
property when they are passed into the inner functions.
 
> > @@ -1219,7 +1219,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t
> > queue_id,
> >
> >  uint16_t
> >  rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
> > -	struct rte_mbuf **pkts, uint16_t count)
> > +	struct rte_mbuf **__restrict pkts, uint16_t count)
> >  {
> >  	struct virtio_net *dev = get_device(vid);
> >
> Is this considered an api change?
> 
Yes.

> > @@ -2124,9 +2124,9 @@ free_zmbuf(struct vhost_virtqueue *vq)
> >
> >  static __rte_noinline uint16_t
> >  virtio_dev_tx_packed_zmbuf(struct virtio_net *dev,
> > -			   struct vhost_virtqueue *vq,
> > +			   struct vhost_virtqueue *__restrict vq,
> >  			   struct rte_mempool *mbuf_pool,
> > -			   struct rte_mbuf **pkts,
> > +			   struct rte_mbuf **__restrict pkts,
> >  			   uint32_t count)
> >  {
> >  	uint32_t pkt_idx = 0;
> > @@ -2160,9 +2160,9 @@ virtio_dev_tx_packed_zmbuf(struct virtio_net
> > *dev,
> >
> >  static __rte_noinline uint16_t
> >  virtio_dev_tx_packed(struct virtio_net *dev,
> > -		     struct vhost_virtqueue *vq,
> > +		     struct vhost_virtqueue *__restrict vq,
> >  		     struct rte_mempool *mbuf_pool,
> > -		     struct rte_mbuf **pkts,
> > +		     struct rte_mbuf **__restrict pkts,
> >  		     uint32_t count)
> >  {
> >  	uint32_t pkt_idx = 0;
> >
> 
> --
> Adrián Moreno
  

Patch

diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 751c1f373..39c92e7e1 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -1133,8 +1133,8 @@  virtio_dev_rx_single_packed(struct virtio_net *dev,
 
 static __rte_noinline uint32_t
 virtio_dev_rx_packed(struct virtio_net *dev,
-		     struct vhost_virtqueue *vq,
-		     struct rte_mbuf **pkts,
+		     struct vhost_virtqueue *__restrict vq,
+		     struct rte_mbuf **__restrict pkts,
 		     uint32_t count)
 {
 	uint32_t pkt_idx = 0;
@@ -1219,7 +1219,7 @@  virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 
 uint16_t
 rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
-	struct rte_mbuf **pkts, uint16_t count)
+	struct rte_mbuf **__restrict pkts, uint16_t count)
 {
 	struct virtio_net *dev = get_device(vid);
 
@@ -2124,9 +2124,9 @@  free_zmbuf(struct vhost_virtqueue *vq)
 
 static __rte_noinline uint16_t
 virtio_dev_tx_packed_zmbuf(struct virtio_net *dev,
-			   struct vhost_virtqueue *vq,
+			   struct vhost_virtqueue *__restrict vq,
 			   struct rte_mempool *mbuf_pool,
-			   struct rte_mbuf **pkts,
+			   struct rte_mbuf **__restrict pkts,
 			   uint32_t count)
 {
 	uint32_t pkt_idx = 0;
@@ -2160,9 +2160,9 @@  virtio_dev_tx_packed_zmbuf(struct virtio_net *dev,
 
 static __rte_noinline uint16_t
 virtio_dev_tx_packed(struct virtio_net *dev,
-		     struct vhost_virtqueue *vq,
+		     struct vhost_virtqueue *__restrict vq,
 		     struct rte_mempool *mbuf_pool,
-		     struct rte_mbuf **pkts,
+		     struct rte_mbuf **__restrict pkts,
 		     uint32_t count)
 {
 	uint32_t pkt_idx = 0;