[v1,2/2] lib/vhost: restrict pointer aliasing for packed path
Checks
Commit Message
Restrict pointer aliasing to allow the compiler to vectorize
loops more aggressively.
With this patch, a 9.6% improvement is observed in throughput for the
packed virtio-net PVP case, and a 2.8% improvement in throughput for
the packed virtio-user PVP case. All performance data are measured
under 0.001% acceptable packet loss with 1 core on both vhost and
virtio side.
Signed-off-by: Joyce Kong <joyce.kong@arm.com>
Reviewed-by: Phil Yang <phil.yang@arm.com>
---
lib/librte_vhost/virtio_net.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
Comments
On 6/11/20 5:32 AM, Joyce Kong wrote:
> Restrict pointer aliasing to allow the compiler to vectorize
> loops more aggressively.
>
> With this patch, a 9.6% improvement is observed in throughput for the
> packed virtio-net PVP case, and a 2.8% improvement in throughput for
> the packed virtio-user PVP case. All performance data are measured
> under 0.001% acceptable packet loss with 1 core on both vhost and
> virtio side.
Is the performance gain related solely to this patch or is it the result of the
combined series?
>
> Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> Reviewed-by: Phil Yang <phil.yang@arm.com>
> ---
> lib/librte_vhost/virtio_net.c | 14 +++++++-------
> 1 file changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 751c1f373..39c92e7e1 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -1133,8 +1133,8 @@ virtio_dev_rx_single_packed(struct virtio_net *dev,
>
> static __rte_noinline uint32_t
> virtio_dev_rx_packed(struct virtio_net *dev,
> - struct vhost_virtqueue *vq,
> - struct rte_mbuf **pkts,
> + struct vhost_virtqueue *__restrict vq,
> + struct rte_mbuf **__restrict pkts,
> uint32_t count)
> {
> uint32_t pkt_idx = 0;
I wonder if we're extracting the full potential of "restrict" considering that
the heavy lifting is done by the inner functions:
(virtio_dev_rx_batch_packed and virtio_dev_rx_single_packed)
> @@ -1219,7 +1219,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
>
> uint16_t
> rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
> - struct rte_mbuf **pkts, uint16_t count)
> + struct rte_mbuf **__restrict pkts, uint16_t count)
> {
> struct virtio_net *dev = get_device(vid);
>
Is this considered an api change?
> @@ -2124,9 +2124,9 @@ free_zmbuf(struct vhost_virtqueue *vq)
>
> static __rte_noinline uint16_t
> virtio_dev_tx_packed_zmbuf(struct virtio_net *dev,
> - struct vhost_virtqueue *vq,
> + struct vhost_virtqueue *__restrict vq,
> struct rte_mempool *mbuf_pool,
> - struct rte_mbuf **pkts,
> + struct rte_mbuf **__restrict pkts,
> uint32_t count)
> {
> uint32_t pkt_idx = 0;
> @@ -2160,9 +2160,9 @@ virtio_dev_tx_packed_zmbuf(struct virtio_net *dev,
>
> static __rte_noinline uint16_t
> virtio_dev_tx_packed(struct virtio_net *dev,
> - struct vhost_virtqueue *vq,
> + struct vhost_virtqueue *__restrict vq,
> struct rte_mempool *mbuf_pool,
> - struct rte_mbuf **pkts,
> + struct rte_mbuf **__restrict pkts,
> uint32_t count)
> {
> uint32_t pkt_idx = 0;
>
> -----Original Message-----
> From: Adrian Moreno <amorenoz@redhat.com>
> Sent: Wednesday, July 8, 2020 12:26 AM
> To: Joyce Kong <Joyce.Kong@arm.com>; maxime.coquelin@redhat.com;
> jerinj@marvell.com; zhihong.wang@intel.com; xiaolong.ye@intel.com;
> Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Phil Yang
> <Phil.Yang@arm.com>; Ruifeng Wang <Ruifeng.Wang@arm.com>
> Cc: dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v1 2/2] lib/vhost: restrict pointer aliasing for
> packed path
>
>
> On 6/11/20 5:32 AM, Joyce Kong wrote:
> > Restrict pointer aliasing to allow the compiler to vectorize loops
> > more aggressively.
> >
> > With this patch, a 9.6% improvement is observed in throughput for the
> > packed virtio-net PVP case, and a 2.8% improvement in throughput for
> > the packed virtio-user PVP case. All performance data are measured
> > under 0.001% acceptable packet loss with 1 core on both vhost and
> > virtio side.
>
> Is the performance gain related solely to this patch or is it the result of the
> combined series?
>
The performance gain is solely from this patch, measured on a ThunderX2 platform.
To test the performance of the vhost patch, I use 1 core on both the vhost and virtio sides.
To test the performance of the virtio patch, I use 2 cores on the vhost side and 1 core on the
virtio side to saturate the vCPU cycles; this way the benefit of the patch can be manifested.
> >
> > Signed-off-by: Joyce Kong <joyce.kong@arm.com>
> > Reviewed-by: Phil Yang <phil.yang@arm.com>
> > ---
> > lib/librte_vhost/virtio_net.c | 14 +++++++-------
> > 1 file changed, 7 insertions(+), 7 deletions(-)
> >
> > diff --git a/lib/librte_vhost/virtio_net.c
> > b/lib/librte_vhost/virtio_net.c index 751c1f373..39c92e7e1 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -1133,8 +1133,8 @@ virtio_dev_rx_single_packed(struct virtio_net
> > *dev,
> >
> > static __rte_noinline uint32_t
> > virtio_dev_rx_packed(struct virtio_net *dev,
> > - struct vhost_virtqueue *vq,
> > - struct rte_mbuf **pkts,
> > + struct vhost_virtqueue *__restrict vq,
> > + struct rte_mbuf **__restrict pkts,
> > uint32_t count)
> > {
> > uint32_t pkt_idx = 0;
>
> I wonder if we're extracting the full potential of "restrict" considering that the
> heavy lifting is done by the inner functions:
> (virtio_dev_rx_batch_packed and virtio_dev_rx_single_packed)
>
Yes — 'restrict'-qualified parameters of the noinline functions keep their 'restrict'
property when the pointers are passed into the inner functions.
> > @@ -1219,7 +1219,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t
> > queue_id,
> >
> > uint16_t
> > rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
> > - struct rte_mbuf **pkts, uint16_t count)
> > + struct rte_mbuf **__restrict pkts, uint16_t count)
> > {
> > struct virtio_net *dev = get_device(vid);
> >
> Is this considered an api change?
>
Yes.
> > @@ -2124,9 +2124,9 @@ free_zmbuf(struct vhost_virtqueue *vq)
> >
> > static __rte_noinline uint16_t
> > virtio_dev_tx_packed_zmbuf(struct virtio_net *dev,
> > - struct vhost_virtqueue *vq,
> > + struct vhost_virtqueue *__restrict vq,
> > struct rte_mempool *mbuf_pool,
> > - struct rte_mbuf **pkts,
> > + struct rte_mbuf **__restrict pkts,
> > uint32_t count)
> > {
> > uint32_t pkt_idx = 0;
> > @@ -2160,9 +2160,9 @@ virtio_dev_tx_packed_zmbuf(struct virtio_net
> > *dev,
> >
> > static __rte_noinline uint16_t
> > virtio_dev_tx_packed(struct virtio_net *dev,
> > - struct vhost_virtqueue *vq,
> > + struct vhost_virtqueue *__restrict vq,
> > struct rte_mempool *mbuf_pool,
> > - struct rte_mbuf **pkts,
> > + struct rte_mbuf **__restrict pkts,
> > uint32_t count)
> > {
> > uint32_t pkt_idx = 0;
> >
>
> --
> Adrián Moreno
@@ -1133,8 +1133,8 @@ virtio_dev_rx_single_packed(struct virtio_net *dev,
static __rte_noinline uint32_t
virtio_dev_rx_packed(struct virtio_net *dev,
- struct vhost_virtqueue *vq,
- struct rte_mbuf **pkts,
+ struct vhost_virtqueue *__restrict vq,
+ struct rte_mbuf **__restrict pkts,
uint32_t count)
{
uint32_t pkt_idx = 0;
@@ -1219,7 +1219,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
uint16_t
rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
- struct rte_mbuf **pkts, uint16_t count)
+ struct rte_mbuf **__restrict pkts, uint16_t count)
{
struct virtio_net *dev = get_device(vid);
@@ -2124,9 +2124,9 @@ free_zmbuf(struct vhost_virtqueue *vq)
static __rte_noinline uint16_t
virtio_dev_tx_packed_zmbuf(struct virtio_net *dev,
- struct vhost_virtqueue *vq,
+ struct vhost_virtqueue *__restrict vq,
struct rte_mempool *mbuf_pool,
- struct rte_mbuf **pkts,
+ struct rte_mbuf **__restrict pkts,
uint32_t count)
{
uint32_t pkt_idx = 0;
@@ -2160,9 +2160,9 @@ virtio_dev_tx_packed_zmbuf(struct virtio_net *dev,
static __rte_noinline uint16_t
virtio_dev_tx_packed(struct virtio_net *dev,
- struct vhost_virtqueue *vq,
+ struct vhost_virtqueue *__restrict vq,
struct rte_mempool *mbuf_pool,
- struct rte_mbuf **pkts,
+ struct rte_mbuf **__restrict pkts,
uint32_t count)
{
uint32_t pkt_idx = 0;