[v11,06/16] eal: use prefetch intrinsics

Message ID 1691781658-32520-7-git-send-email-roretzla@linux.microsoft.com (mailing list archive)
State Accepted, archived
Delegated to: David Marchand
Headers
Series msvc integration changes |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Tyler Retzlaff Aug. 11, 2023, 7:20 p.m. UTC
  Inline assembly is not supported for MSVC x64 instead use _mm_prefetch
and _mm_cldemote intrinsics.

Signed-off-by: Tyler Retzlaff <roretzla@linux.microsoft.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: Morten Brørup <mb@smartsharesystems.com>
Acked-by: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
---
 lib/eal/x86/include/rte_prefetch.h | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)
  

Comments

David Marchand Aug. 24, 2023, 12:06 p.m. UTC | #1
On Fri, Aug 11, 2023 at 9:21 PM Tyler Retzlaff
<roretzla@linux.microsoft.com> wrote:
>
> Inline assembly is not supported for MSVC x64 instead use _mm_prefetch
> and _mm_cldemote intrinsics.
>
> Signed-off-by: Tyler Retzlaff <roretzla@linux.microsoft.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> Acked-by: Morten Brørup <mb@smartsharesystems.com>
> Acked-by: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
> ---
>  lib/eal/x86/include/rte_prefetch.h | 25 +++++++++++++++++++++----
>  1 file changed, 21 insertions(+), 4 deletions(-)
>
> diff --git a/lib/eal/x86/include/rte_prefetch.h b/lib/eal/x86/include/rte_prefetch.h
> index 7fd01c4..7a6988e 100644
> --- a/lib/eal/x86/include/rte_prefetch.h
> +++ b/lib/eal/x86/include/rte_prefetch.h
> @@ -9,30 +9,38 @@
>  extern "C" {
>  #endif
>
> +#include <emmintrin.h>
> +
>  #include <rte_compat.h>
>  #include <rte_common.h>
>  #include "generic/rte_prefetch.h"
>
> +#pragma GCC diagnostic push
> +#pragma GCC diagnostic ignored "-Wcast-qual"
> +
>  static inline void rte_prefetch0(const volatile void *p)
>  {
> -       asm volatile ("prefetcht0 %[p]" : : [p] "m" (*(const volatile char *)p));
> +       _mm_prefetch((const void *)p, _MM_HINT_T0);
>  }
>

Quite surprisingly, this part (exactly) breaks 32bits build in the
vhost library:

ccache cc -Ilib/librte_vhost.a.p -Ilib
-I../../../git/pub/dpdk.org/main/lib -Ilib/vhost
-I../../../git/pub/dpdk.org/main/lib/vhost -I.
-I../../../git/pub/dpdk.org/main -Iconfig
-I../../../git/pub/dpdk.org/main/config -Ilib/eal/include
-I../../../git/pub/dpdk.org/main/lib/eal/include
-Ilib/eal/linux/include
-I../../../git/pub/dpdk.org/main/lib/eal/linux/include
-Ilib/eal/x86/include
-I../../../git/pub/dpdk.org/main/lib/eal/x86/include -Ilib/eal/common
-I../../../git/pub/dpdk.org/main/lib/eal/common -Ilib/eal
-I../../../git/pub/dpdk.org/main/lib/eal -Ilib/kvargs
-I../../../git/pub/dpdk.org/main/lib/kvargs -Ilib/log
-I../../../git/pub/dpdk.org/main/lib/log -Ilib/metrics
-I../../../git/pub/dpdk.org/main/lib/metrics -Ilib/telemetry
-I../../../git/pub/dpdk.org/main/lib/telemetry -Ilib/ethdev
-I../../../git/pub/dpdk.org/main/lib/ethdev -Ilib/net
-I../../../git/pub/dpdk.org/main/lib/net -Ilib/mbuf
-I../../../git/pub/dpdk.org/main/lib/mbuf -Ilib/mempool
-I../../../git/pub/dpdk.org/main/lib/mempool -Ilib/ring
-I../../../git/pub/dpdk.org/main/lib/ring -Ilib/meter
-I../../../git/pub/dpdk.org/main/lib/meter -Ilib/cryptodev
-I../../../git/pub/dpdk.org/main/lib/cryptodev -Ilib/rcu
-I../../../git/pub/dpdk.org/main/lib/rcu -Ilib/hash
-I../../../git/pub/dpdk.org/main/lib/hash -Ilib/pci
-I../../../git/pub/dpdk.org/main/lib/pci -Ilib/dmadev
-I../../../git/pub/dpdk.org/main/lib/dmadev -fdiagnostics-color=always
-D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -Werror -std=c11
-O2 -g -include rte_config.h -Wcast-qual -Wdeprecated -Wformat
-Wformat-nonliteral -Wformat-security -Wmissing-declarations
-Wmissing-prototypes -Wnested-externs -Wold-style-definition
-Wpointer-arith -Wsign-compare -Wstrict-prototypes -Wundef
-Wwrite-strings -Wno-address-of-packed-member -Wno-packed-not-aligned
-Wno-missing-field-initializers -Wno-zero-length-bounds
-Wno-pointer-to-int-cast -D_GNU_SOURCE -m32 -fPIC -march=native -mrtm
-DALLOW_EXPERIMENTAL_API -DALLOW_INTERNAL_API -Wno-format-truncation
-DVHOST_GCC_UNROLL_PRAGMA -fno-strict-aliasing -DVHOST_HAS_VDUSE
-DRTE_LOG_DEFAULT_LOGTYPE=lib.vhost -MD -MQ
lib/librte_vhost.a.p/vhost_virtio_net.c.o -MF
lib/librte_vhost.a.p/vhost_virtio_net.c.o.d -o
lib/librte_vhost.a.p/vhost_virtio_net.c.o -c
../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c
In function ‘mbuf_to_desc’,
    inlined from ‘vhost_enqueue_async_packed’ at
../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1948:6,
    inlined from ‘virtio_dev_rx_async_packed’ at
../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1965:6,
    inlined from ‘virtio_dev_rx_async_submit_packed’ at
../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:2127:7:
../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1230:18: error:
‘buf_vec[0].buf_addr’ may be used uninitialized
[-Werror=maybe-uninitialized]
 1230 |         buf_addr = buf_vec[vec_idx].buf_addr;
      |         ~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~
../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c: In function
‘virtio_dev_rx_async_submit_packed’:
../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1963:27: note:
‘buf_vec’ declared here
 1963 |         struct buf_vector buf_vec[BUF_VECTOR_MAX];
      |                           ^~~~~~~
In function ‘mbuf_to_desc’,
    inlined from ‘vhost_enqueue_async_packed’ at
../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1948:6,
    inlined from ‘virtio_dev_rx_async_packed’ at
../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1965:6,
    inlined from ‘virtio_dev_rx_async_submit_packed’ at
../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:2127:7:
../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1231:18: error:
‘buf_vec[0].buf_iova’ may be used uninitialized
[-Werror=maybe-uninitialized]
 1231 |         buf_iova = buf_vec[vec_idx].buf_iova;
      |         ~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~
../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c: In function
‘virtio_dev_rx_async_submit_packed’:
../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1963:27: note:
‘buf_vec’ declared here
 1963 |         struct buf_vector buf_vec[BUF_VECTOR_MAX];
      |                           ^~~~~~~
In function ‘mbuf_to_desc’,
    inlined from ‘vhost_enqueue_async_packed’ at
../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1948:6,
    inlined from ‘virtio_dev_rx_async_packed’ at
../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1965:6,
    inlined from ‘virtio_dev_rx_async_submit_packed’ at
../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:2127:7:
../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1232:35: error:
‘buf_vec[0].buf_len’ may be used uninitialized
[-Werror=maybe-uninitialized]
 1232 |         buf_len = buf_vec[vec_idx].buf_len;
      |                   ~~~~~~~~~~~~~~~~^~~~~~~~
../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c: In function
‘virtio_dev_rx_async_submit_packed’:
../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1963:27: note:
‘buf_vec’ declared here
 1963 |         struct buf_vector buf_vec[BUF_VECTOR_MAX];
      |                           ^~~~~~~
cc1: all warnings being treated as errors
ninja: build stopped: subcommand failed.

I had a look at the vhost library and I think the compiler thinks size may be 0.
Changing the loop on size with a do { } while (size > 0); resolves the warning.
I can post a change for this, as we hit a similar issue in the past
and the code does not make sense comparing size on the first iteration
of this loop.

However, I am a bit puzzled why the prefetch change makes the compiler
consider this loop differently.
We have the same constructs everywhere in this library and x86_64
builds are fine...
  
David Marchand Aug. 24, 2023, 12:25 p.m. UTC | #2
On Thu, Aug 24, 2023 at 2:06 PM David Marchand
<david.marchand@redhat.com> wrote:
> I had a look at the vhost library and I think the compiler thinks size may be 0.
> Changing the loop on size with a do { } while (size > 0); resolves the warning.
> I can post a change for this, as we hit a similar issue in the past
> and the code does not make sense comparing size on the first iteration
> of this loop.

This sounds like a commit 4226aa9caca9 ("vhost: fix build with GCC
12") we had on LoongArch.

And the 32bits build warning goes away with this:

diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index d7624d18c8..41328b7530 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -1906,7 +1906,7 @@ vhost_enqueue_async_packed(struct virtio_net *dev,
        uint16_t buf_id = 0;
        uint32_t len = 0;
        uint16_t desc_count = 0;
-       uint64_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);
+       uint64_t size = (uint64_t)pkt->pkt_len + sizeof(struct
virtio_net_hdr_mrg_rxbuf);
        uint32_t buffer_len[vq->size];
        uint16_t buffer_buf_id[vq->size];
        uint16_t buffer_desc_count[vq->size];


>
> However, I am a bit puzzled why the prefetch change makes the compiler
> consider this loop differently.
> We have the same constructs everywhere in this library and x86_64
> builds are fine...
  
Morten Brørup Aug. 24, 2023, 12:46 p.m. UTC | #3
> From: David Marchand [mailto:david.marchand@redhat.com]
> Sent: Thursday, 24 August 2023 14.06
> 
> On Fri, Aug 11, 2023 at 9:21 PM Tyler Retzlaff
> <roretzla@linux.microsoft.com> wrote:
> >
> > Inline assembly is not supported for MSVC x64 instead use _mm_prefetch
> > and _mm_cldemote intrinsics.
> >
> > Signed-off-by: Tyler Retzlaff <roretzla@linux.microsoft.com>
> > Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> > Acked-by: Morten Brørup <mb@smartsharesystems.com>
> > Acked-by: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
> > ---
> >  lib/eal/x86/include/rte_prefetch.h | 25 +++++++++++++++++++++----
> >  1 file changed, 21 insertions(+), 4 deletions(-)
> >
> > diff --git a/lib/eal/x86/include/rte_prefetch.h
> b/lib/eal/x86/include/rte_prefetch.h
> > index 7fd01c4..7a6988e 100644
> > --- a/lib/eal/x86/include/rte_prefetch.h
> > +++ b/lib/eal/x86/include/rte_prefetch.h
> > @@ -9,30 +9,38 @@
> >  extern "C" {
> >  #endif
> >
> > +#include <emmintrin.h>
> > +
> >  #include <rte_compat.h>
> >  #include <rte_common.h>
> >  #include "generic/rte_prefetch.h"
> >
> > +#pragma GCC diagnostic push
> > +#pragma GCC diagnostic ignored "-Wcast-qual"
> > +
> >  static inline void rte_prefetch0(const volatile void *p)
> >  {
> > -       asm volatile ("prefetcht0 %[p]" : : [p] "m" (*(const volatile char
> *)p));
> > +       _mm_prefetch((const void *)p, _MM_HINT_T0);
> >  }
> >
> 
> Quite surprisingly, this part (exactly) breaks 32bits build in the
> vhost library:
> 
> ccache cc -Ilib/librte_vhost.a.p -Ilib
> -I../../../git/pub/dpdk.org/main/lib -Ilib/vhost
> -I../../../git/pub/dpdk.org/main/lib/vhost -I.
> -I../../../git/pub/dpdk.org/main -Iconfig
> -I../../../git/pub/dpdk.org/main/config -Ilib/eal/include
> -I../../../git/pub/dpdk.org/main/lib/eal/include
> -Ilib/eal/linux/include
> -I../../../git/pub/dpdk.org/main/lib/eal/linux/include
> -Ilib/eal/x86/include
> -I../../../git/pub/dpdk.org/main/lib/eal/x86/include -Ilib/eal/common
> -I../../../git/pub/dpdk.org/main/lib/eal/common -Ilib/eal
> -I../../../git/pub/dpdk.org/main/lib/eal -Ilib/kvargs
> -I../../../git/pub/dpdk.org/main/lib/kvargs -Ilib/log
> -I../../../git/pub/dpdk.org/main/lib/log -Ilib/metrics
> -I../../../git/pub/dpdk.org/main/lib/metrics -Ilib/telemetry
> -I../../../git/pub/dpdk.org/main/lib/telemetry -Ilib/ethdev
> -I../../../git/pub/dpdk.org/main/lib/ethdev -Ilib/net
> -I../../../git/pub/dpdk.org/main/lib/net -Ilib/mbuf
> -I../../../git/pub/dpdk.org/main/lib/mbuf -Ilib/mempool
> -I../../../git/pub/dpdk.org/main/lib/mempool -Ilib/ring
> -I../../../git/pub/dpdk.org/main/lib/ring -Ilib/meter
> -I../../../git/pub/dpdk.org/main/lib/meter -Ilib/cryptodev
> -I../../../git/pub/dpdk.org/main/lib/cryptodev -Ilib/rcu
> -I../../../git/pub/dpdk.org/main/lib/rcu -Ilib/hash
> -I../../../git/pub/dpdk.org/main/lib/hash -Ilib/pci
> -I../../../git/pub/dpdk.org/main/lib/pci -Ilib/dmadev
> -I../../../git/pub/dpdk.org/main/lib/dmadev -fdiagnostics-color=always
> -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -Werror -std=c11
> -O2 -g -include rte_config.h -Wcast-qual -Wdeprecated -Wformat
> -Wformat-nonliteral -Wformat-security -Wmissing-declarations
> -Wmissing-prototypes -Wnested-externs -Wold-style-definition
> -Wpointer-arith -Wsign-compare -Wstrict-prototypes -Wundef
> -Wwrite-strings -Wno-address-of-packed-member -Wno-packed-not-aligned
> -Wno-missing-field-initializers -Wno-zero-length-bounds
> -Wno-pointer-to-int-cast -D_GNU_SOURCE -m32 -fPIC -march=native -mrtm
> -DALLOW_EXPERIMENTAL_API -DALLOW_INTERNAL_API -Wno-format-truncation
> -DVHOST_GCC_UNROLL_PRAGMA -fno-strict-aliasing -DVHOST_HAS_VDUSE
> -DRTE_LOG_DEFAULT_LOGTYPE=lib.vhost -MD -MQ
> lib/librte_vhost.a.p/vhost_virtio_net.c.o -MF
> lib/librte_vhost.a.p/vhost_virtio_net.c.o.d -o
> lib/librte_vhost.a.p/vhost_virtio_net.c.o -c
> ../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c
> In function ‘mbuf_to_desc’,
>     inlined from ‘vhost_enqueue_async_packed’ at
> ../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1948:6,
>     inlined from ‘virtio_dev_rx_async_packed’ at
> ../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1965:6,
>     inlined from ‘virtio_dev_rx_async_submit_packed’ at
> ../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:2127:7:
> ../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1230:18: error:
> ‘buf_vec[0].buf_addr’ may be used uninitialized
> [-Werror=maybe-uninitialized]
>  1230 |         buf_addr = buf_vec[vec_idx].buf_addr;
>       |         ~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~
> ../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c: In function
> ‘virtio_dev_rx_async_submit_packed’:
> ../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1963:27: note:
> ‘buf_vec’ declared here
>  1963 |         struct buf_vector buf_vec[BUF_VECTOR_MAX];
>       |                           ^~~~~~~
> In function ‘mbuf_to_desc’,
>     inlined from ‘vhost_enqueue_async_packed’ at
> ../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1948:6,
>     inlined from ‘virtio_dev_rx_async_packed’ at
> ../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1965:6,
>     inlined from ‘virtio_dev_rx_async_submit_packed’ at
> ../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:2127:7:
> ../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1231:18: error:
> ‘buf_vec[0].buf_iova’ may be used uninitialized
> [-Werror=maybe-uninitialized]
>  1231 |         buf_iova = buf_vec[vec_idx].buf_iova;
>       |         ~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~
> ../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c: In function
> ‘virtio_dev_rx_async_submit_packed’:
> ../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1963:27: note:
> ‘buf_vec’ declared here
>  1963 |         struct buf_vector buf_vec[BUF_VECTOR_MAX];
>       |                           ^~~~~~~
> In function ‘mbuf_to_desc’,
>     inlined from ‘vhost_enqueue_async_packed’ at
> ../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1948:6,
>     inlined from ‘virtio_dev_rx_async_packed’ at
> ../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1965:6,
>     inlined from ‘virtio_dev_rx_async_submit_packed’ at
> ../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:2127:7:
> ../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1232:35: error:
> ‘buf_vec[0].buf_len’ may be used uninitialized
> [-Werror=maybe-uninitialized]
>  1232 |         buf_len = buf_vec[vec_idx].buf_len;
>       |                   ~~~~~~~~~~~~~~~~^~~~~~~~
> ../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c: In function
> ‘virtio_dev_rx_async_submit_packed’:
> ../../../git/pub/dpdk.org/main/lib/vhost/virtio_net.c:1963:27: note:
> ‘buf_vec’ declared here
>  1963 |         struct buf_vector buf_vec[BUF_VECTOR_MAX];
>       |                           ^~~~~~~
> cc1: all warnings being treated as errors
> ninja: build stopped: subcommand failed.
> 
> I had a look at the vhost library and I think the compiler thinks size may be
> 0.

In 32 bit architecture, "size" could be zero, ref. line 1909:

uint64_t size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);

The values on both sides of the addition are 32 bit unsigned, so the addition will be performed as 32 bit unsigned, and the result could in theory be 0. Casting one of them to 64 bit, e.g. (uint64_t)sizeof(struct virtio_net_hdr_mrg_rxbuf), might fix the warning if the compiler is clever enough to know that adding a non-zero 64 bit value to any 32 bit value cannot give zero as the result.

Side note, off track: Why is "size" uint64_t, wouldn't uint32_t suffice?

> Changing the loop on size with a do { } while (size > 0); resolves the
> warning.
> I can post a change for this, as we hit a similar issue in the past
> and the code does not make sense comparing size on the first iteration
> of this loop.
> 
> However, I am a bit puzzled why the prefetch change makes the compiler
> consider this loop differently.
> We have the same constructs everywhere in this library and x86_64
> builds are fine...

That is indeed the relevant question here!

Perhaps the compiler somehow ignores the "const" part of the parameter given to the "asm" (in rte_prefetch0()) for 64 bit arch, but not for 32 bit arch?

> 
> 
> 
> --
> David Marchand
  
David Marchand Aug. 24, 2023, 2:18 p.m. UTC | #4
On Thu, Aug 24, 2023 at 2:47 PM Morten Brørup <mb@smartsharesystems.com> wrote:
> > From: David Marchand [mailto:david.marchand@redhat.com]
> > However, I am a bit puzzled why the prefetch change makes the compiler
> > consider this loop differently.
> > We have the same constructs everywhere in this library and x86_64
> > builds are fine...
>
> That is indeed the relevant question here!
>
> Perhaps the compiler somehow ignores the "const" part of the parameter given to the "asm" (in rte_prefetch0()) for 64 bit arch, but not for 32 bit arch?

It is possible to reproduce the issue with current DPDK tree (with
existing prefetch implementation in asm) and removing all
rte_prefetch0 calls from the async rx path:

diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index d7624d18c8..6f941cf27d 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -748,8 +748,6 @@ map_one_desc(struct virtio_net *dev, struct
vhost_virtqueue *vq,
                if (unlikely(!desc_addr))
                        return -1;

-               rte_prefetch0((void *)(uintptr_t)desc_addr);
-
                buf_vec[vec_id].buf_iova = desc_iova;
                buf_vec[vec_id].buf_addr = desc_addr;
                buf_vec[vec_id].buf_len  = desc_chunck_len;
@@ -1808,8 +1806,6 @@ virtio_dev_rx_async_submit_split(struct
virtio_net *dev, struct vhost_virtqueue
         */
        avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);

-       rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
-
        async_iter_reset(async);

        for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
@@ -1997,7 +1993,6 @@ virtio_dev_rx_async_packed_batch_enqueue(struct
virtio_net *dev,
        uint16_t i;

        vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
-               rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
                desc = vhost_iova_to_vva(dev, vq, desc_addrs[i],
&lens[i], VHOST_ACCESS_RW);
                hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc;
                lens[i] = pkts[i]->pkt_len +
@@ -2106,8 +2101,6 @@ virtio_dev_rx_async_submit_packed(struct
virtio_net *dev, struct vhost_virtqueue
        uint16_t i;

        do {
-               rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
-
                if (count - pkt_idx >= PACKED_BATCH_SIZE) {
                        if (!virtio_dev_rx_async_packed_batch(dev, vq,
&pkts[pkt_idx],
                                        dma_id, vchan_id)) {


If any prefetch is left, the warning disappears.
So the asm usage probably impacts/disables some compiler feature, for
this code path.
  
Morten Brørup Aug. 24, 2023, 2:43 p.m. UTC | #5
> From: David Marchand [mailto:david.marchand@redhat.com]
> Sent: Thursday, 24 August 2023 16.18
> 
> On Thu, Aug 24, 2023 at 2:47 PM Morten Brørup <mb@smartsharesystems.com>
> wrote:
> > > From: David Marchand [mailto:david.marchand@redhat.com]
> > > However, I am a bit puzzled why the prefetch change makes the compiler
> > > consider this loop differently.
> > > We have the same constructs everywhere in this library and x86_64
> > > builds are fine...
> >
> > That is indeed the relevant question here!
> >
> > Perhaps the compiler somehow ignores the "const" part of the parameter given
> to the "asm" (in rte_prefetch0()) for 64 bit arch, but not for 32 bit arch?
> 
> It is possible to reproduce the issue with current DPDK tree (with
> existing prefetch implementation in asm) and removing all
> rte_prefetch0 calls from the async rx path:
> 
> diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> index d7624d18c8..6f941cf27d 100644
> --- a/lib/vhost/virtio_net.c
> +++ b/lib/vhost/virtio_net.c
> @@ -748,8 +748,6 @@ map_one_desc(struct virtio_net *dev, struct
> vhost_virtqueue *vq,
>                 if (unlikely(!desc_addr))
>                         return -1;
> 
> -               rte_prefetch0((void *)(uintptr_t)desc_addr);
> -
>                 buf_vec[vec_id].buf_iova = desc_iova;
>                 buf_vec[vec_id].buf_addr = desc_addr;
>                 buf_vec[vec_id].buf_len  = desc_chunck_len;
> @@ -1808,8 +1806,6 @@ virtio_dev_rx_async_submit_split(struct
> virtio_net *dev, struct vhost_virtqueue
>          */
>         avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
> 
> -       rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
> -
>         async_iter_reset(async);
> 
>         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
> @@ -1997,7 +1993,6 @@ virtio_dev_rx_async_packed_batch_enqueue(struct
> virtio_net *dev,
>         uint16_t i;
> 
>         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> -               rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
>                 desc = vhost_iova_to_vva(dev, vq, desc_addrs[i],
> &lens[i], VHOST_ACCESS_RW);
>                 hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc;
>                 lens[i] = pkts[i]->pkt_len +
> @@ -2106,8 +2101,6 @@ virtio_dev_rx_async_submit_packed(struct
> virtio_net *dev, struct vhost_virtqueue
>         uint16_t i;
> 
>         do {
> -               rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
> -
>                 if (count - pkt_idx >= PACKED_BATCH_SIZE) {
>                         if (!virtio_dev_rx_async_packed_batch(dev, vq,
> &pkts[pkt_idx],
>                                         dma_id, vchan_id)) {
> 
> 
> If any prefetch is left, the warning disappears.
> So the asm usage probably impacts/disables some compiler feature, for
> this code path.

Perhaps any asm usage causes the compiler to assume that any memory might have been modified, which then causes the compiler to not warn about the use of potentially uninitialized variables.

If so, then compiling other code using asm might also not omit warnings about potentially uninitialized variables where it otherwise should have.
  
Tyler Retzlaff Aug. 24, 2023, 3:53 p.m. UTC | #6
On Thu, Aug 24, 2023 at 04:18:06PM +0200, David Marchand wrote:
> On Thu, Aug 24, 2023 at 2:47 PM Morten Brørup <mb@smartsharesystems.com> wrote:
> > > From: David Marchand [mailto:david.marchand@redhat.com]
> > > However, I am a bit puzzled why the prefetch change makes the compiler
> > > consider this loop differently.
> > > We have the same constructs everywhere in this library and x86_64
> > > builds are fine...
> >
> > That is indeed the relevant question here!
> >
> > Perhaps the compiler somehow ignores the "const" part of the parameter given to the "asm" (in rte_prefetch0()) for 64 bit arch, but not for 32 bit arch?
> 
> It is possible to reproduce the issue with current DPDK tree (with
> existing prefetch implementation in asm) and removing all
> rte_prefetch0 calls from the async rx path:
> 
> diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> index d7624d18c8..6f941cf27d 100644
> --- a/lib/vhost/virtio_net.c
> +++ b/lib/vhost/virtio_net.c
> @@ -748,8 +748,6 @@ map_one_desc(struct virtio_net *dev, struct
> vhost_virtqueue *vq,
>                 if (unlikely(!desc_addr))
>                         return -1;
> 
> -               rte_prefetch0((void *)(uintptr_t)desc_addr);
> -
>                 buf_vec[vec_id].buf_iova = desc_iova;
>                 buf_vec[vec_id].buf_addr = desc_addr;
>                 buf_vec[vec_id].buf_len  = desc_chunck_len;
> @@ -1808,8 +1806,6 @@ virtio_dev_rx_async_submit_split(struct
> virtio_net *dev, struct vhost_virtqueue
>          */
>         avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
> 
> -       rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
> -
>         async_iter_reset(async);
> 
>         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
> @@ -1997,7 +1993,6 @@ virtio_dev_rx_async_packed_batch_enqueue(struct
> virtio_net *dev,
>         uint16_t i;
> 
>         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> -               rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
>                 desc = vhost_iova_to_vva(dev, vq, desc_addrs[i],
> &lens[i], VHOST_ACCESS_RW);
>                 hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc;
>                 lens[i] = pkts[i]->pkt_len +
> @@ -2106,8 +2101,6 @@ virtio_dev_rx_async_submit_packed(struct
> virtio_net *dev, struct vhost_virtqueue
>         uint16_t i;
> 
>         do {
> -               rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
> -
>                 if (count - pkt_idx >= PACKED_BATCH_SIZE) {
>                         if (!virtio_dev_rx_async_packed_batch(dev, vq,
> &pkts[pkt_idx],
>                                         dma_id, vchan_id)) {
> 
> 
> If any prefetch is left, the warning disappears.
> So the asm usage probably impacts/disables some compiler feature, for
> this code path.

So as an aside one of the reasons often cited as to why intrinsics are
preferred over inline asm is that inline asm can't be considered when
the compiler performs optimization. It could be that when moving to use
the intrinsics the scope of what can be optimized around the prefetch0
calls can be used for optimization leading to the observations here.

Ty

> 
> 
> -- 
> David Marchand
  
David Marchand Aug. 25, 2023, 8:44 a.m. UTC | #7
On Thu, Aug 24, 2023 at 5:54 PM Tyler Retzlaff
<roretzla@linux.microsoft.com> wrote:
>
> On Thu, Aug 24, 2023 at 04:18:06PM +0200, David Marchand wrote:
> > On Thu, Aug 24, 2023 at 2:47 PM Morten Brørup <mb@smartsharesystems.com> wrote:
> > > > From: David Marchand [mailto:david.marchand@redhat.com]
> > > > However, I am a bit puzzled why the prefetch change makes the compiler
> > > > consider this loop differently.
> > > > We have the same constructs everywhere in this library and x86_64
> > > > builds are fine...
> > >
> > > That is indeed the relevant question here!
> > >
> > > Perhaps the compiler somehow ignores the "const" part of the parameter given to the "asm" (in rte_prefetch0()) for 64 bit arch, but not for 32 bit arch?
> >
> > It is possible to reproduce the issue with current DPDK tree (with
> > existing prefetch implementation in asm) and removing all
> > rte_prefetch0 calls from the async rx path:
> >
> > diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> > index d7624d18c8..6f941cf27d 100644
> > --- a/lib/vhost/virtio_net.c
> > +++ b/lib/vhost/virtio_net.c
> > @@ -748,8 +748,6 @@ map_one_desc(struct virtio_net *dev, struct
> > vhost_virtqueue *vq,
> >                 if (unlikely(!desc_addr))
> >                         return -1;
> >
> > -               rte_prefetch0((void *)(uintptr_t)desc_addr);
> > -
> >                 buf_vec[vec_id].buf_iova = desc_iova;
> >                 buf_vec[vec_id].buf_addr = desc_addr;
> >                 buf_vec[vec_id].buf_len  = desc_chunck_len;
> > @@ -1808,8 +1806,6 @@ virtio_dev_rx_async_submit_split(struct
> > virtio_net *dev, struct vhost_virtqueue
> >          */
> >         avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
> >
> > -       rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
> > -
> >         async_iter_reset(async);
> >
> >         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
> > @@ -1997,7 +1993,6 @@ virtio_dev_rx_async_packed_batch_enqueue(struct
> > virtio_net *dev,
> >         uint16_t i;
> >
> >         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> > -               rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
> >                 desc = vhost_iova_to_vva(dev, vq, desc_addrs[i],
> > &lens[i], VHOST_ACCESS_RW);
> >                 hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc;
> >                 lens[i] = pkts[i]->pkt_len +
> > @@ -2106,8 +2101,6 @@ virtio_dev_rx_async_submit_packed(struct
> > virtio_net *dev, struct vhost_virtqueue
> >         uint16_t i;
> >
> >         do {
> > -               rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
> > -
> >                 if (count - pkt_idx >= PACKED_BATCH_SIZE) {
> >                         if (!virtio_dev_rx_async_packed_batch(dev, vq,
> > &pkts[pkt_idx],
> >                                         dma_id, vchan_id)) {
> >
> >
> > If any prefetch is left, the warning disappears.
> > So the asm usage probably impacts/disables some compiler feature, for
> > this code path.
>
> So as an aside one of the reasons often cited as to why intrinsics are
> preferred over inline asm is that inline asm can't be considered when
> the compiler performs optimization. It could be that when moving to use
> the intrinsics the scope of what can be optimized around the prefetch0
> calls can be used for optimization leading to the observations here.

Agreed.
While we decide the next steps on the vhost library, and for making
progress on this series, I decided to keep the previous asm
implementation and put intrinsics under a #ifdef MSVC.
  
Morten Brørup Aug. 25, 2023, 9:23 a.m. UTC | #8
Regarding the vhost lib...

> From: David Marchand [mailto:david.marchand@redhat.com]
> Sent: Thursday, 24 August 2023 14.26
> 
> On Thu, Aug 24, 2023 at 2:06 PM David Marchand
> <david.marchand@redhat.com> wrote:
> > I had a look at the vhost library and I think the compiler thinks size may
> be 0.
> > Changing the loop on size with a do { } while (size > 0); resolves the
> warning.
> > I can post a change for this, as we hit a similar issue in the past
> > and the code does not make sense comparing size on the first iteration
> > of this loop.
> 
> This sounds like a commit 4226aa9caca9 ("vhost: fix build with GCC
> 12") we had on LoongArch.

If we know that size > 0 initially, then "do { } while (size > 0);" is more appropriate than "while (size > 0) { }", not only for the benefit of the compiler/optimizer, but also for the benefit of human reviewers.

> 
> And the 32bits build warning goes away with this:
> 
> diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> index d7624d18c8..41328b7530 100644
> --- a/lib/vhost/virtio_net.c
> +++ b/lib/vhost/virtio_net.c
> @@ -1906,7 +1906,7 @@ vhost_enqueue_async_packed(struct virtio_net *dev,
>         uint16_t buf_id = 0;
>         uint32_t len = 0;
>         uint16_t desc_count = 0;
> -       uint64_t size = pkt->pkt_len + sizeof(struct
> virtio_net_hdr_mrg_rxbuf);
> +       uint64_t size = (uint64_t)pkt->pkt_len + sizeof(struct
> virtio_net_hdr_mrg_rxbuf);

Yes, but performing a 64 bit addition instead of a 32 bit addition has a (very small) cost for 32 bit arch.

And regardless of this cost for 32 bit arch; if we know that size cannot exceed UINT32_MAX, it should be changed to uint32_t.

>         uint32_t buffer_len[vq->size];
>         uint16_t buffer_buf_id[vq->size];
>         uint16_t buffer_desc_count[vq->size];
  
Tyler Retzlaff Aug. 25, 2023, 3:46 p.m. UTC | #9
On Fri, Aug 25, 2023 at 10:44:01AM +0200, David Marchand wrote:
> On Thu, Aug 24, 2023 at 5:54 PM Tyler Retzlaff
> <roretzla@linux.microsoft.com> wrote:
> >
> > On Thu, Aug 24, 2023 at 04:18:06PM +0200, David Marchand wrote:
> > > On Thu, Aug 24, 2023 at 2:47 PM Morten Brørup <mb@smartsharesystems.com> wrote:
> > > > > From: David Marchand [mailto:david.marchand@redhat.com]
> > > > > However, I am a bit puzzled why the prefetch change makes the compiler
> > > > > consider this loop differently.
> > > > > We have the same constructs everywhere in this library and x86_64
> > > > > builds are fine...
> > > >
> > > > That is indeed the relevant question here!
> > > >
> > > > Perhaps the compiler somehow ignores the "const" part of the parameter given to the "asm" (in rte_prefetch0()) for 64 bit arch, but not for 32 bit arch?
> > >
> > > It is possible to reproduce the issue with current DPDK tree (with
> > > existing prefetch implementation in asm) and removing all
> > > rte_prefetch0 calls from the async rx path:
> > >
> > > diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> > > index d7624d18c8..6f941cf27d 100644
> > > --- a/lib/vhost/virtio_net.c
> > > +++ b/lib/vhost/virtio_net.c
> > > @@ -748,8 +748,6 @@ map_one_desc(struct virtio_net *dev, struct
> > > vhost_virtqueue *vq,
> > >                 if (unlikely(!desc_addr))
> > >                         return -1;
> > >
> > > -               rte_prefetch0((void *)(uintptr_t)desc_addr);
> > > -
> > >                 buf_vec[vec_id].buf_iova = desc_iova;
> > >                 buf_vec[vec_id].buf_addr = desc_addr;
> > >                 buf_vec[vec_id].buf_len  = desc_chunck_len;
> > > @@ -1808,8 +1806,6 @@ virtio_dev_rx_async_submit_split(struct
> > > virtio_net *dev, struct vhost_virtqueue
> > >          */
> > >         avail_head = __atomic_load_n(&vq->avail->idx, __ATOMIC_ACQUIRE);
> > >
> > > -       rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
> > > -
> > >         async_iter_reset(async);
> > >
> > >         for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
> > > @@ -1997,7 +1993,6 @@ virtio_dev_rx_async_packed_batch_enqueue(struct
> > > virtio_net *dev,
> > >         uint16_t i;
> > >
> > >         vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
> > > -               rte_prefetch0((void *)(uintptr_t)desc_addrs[i]);
> > >                 desc = vhost_iova_to_vva(dev, vq, desc_addrs[i],
> > > &lens[i], VHOST_ACCESS_RW);
> > >                 hdrs[i] = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc;
> > >                 lens[i] = pkts[i]->pkt_len +
> > > @@ -2106,8 +2101,6 @@ virtio_dev_rx_async_submit_packed(struct
> > > virtio_net *dev, struct vhost_virtqueue
> > >         uint16_t i;
> > >
> > >         do {
> > > -               rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
> > > -
> > >                 if (count - pkt_idx >= PACKED_BATCH_SIZE) {
> > >                         if (!virtio_dev_rx_async_packed_batch(dev, vq,
> > > &pkts[pkt_idx],
> > >                                         dma_id, vchan_id)) {
> > >
> > >
> > > If any prefetch is left, the warning disappears.
> > > So the asm usage probably impacts/disables some compiler feature, for
> > > this code path.
> >
> > So as an aside one of the reasons often cited as to why intrinsics are
> > preferred over inline asm is that inline asm can't be considered when
> > the compiler performs optimization. It could be that when moving to use
> > the intrinsics the scope of what can be optimized around the prefetch0
> > calls can be used for optimization leading to the observations here.
> 
> Agreed.
> While we decide the next steps on the vhost library, and for making
> progress on this series, I decided to keep the previous asm
> implementation and put intrinsics under a #ifdef MSVC.

I'm fine with this, we can micro-optimize things out of the MSVC only
blocks later in more targeted changes. These kinds of little things is
why earlier versions of the series only used intrinsics for MSVC bah!

Thanks

> 
> -- 
> David Marchand
  

Patch

diff --git a/lib/eal/x86/include/rte_prefetch.h b/lib/eal/x86/include/rte_prefetch.h
index 7fd01c4..7a6988e 100644
--- a/lib/eal/x86/include/rte_prefetch.h
+++ b/lib/eal/x86/include/rte_prefetch.h
@@ -9,30 +9,38 @@ 
 extern "C" {
 #endif
 
+#include <emmintrin.h>
+
 #include <rte_compat.h>
 #include <rte_common.h>
 #include "generic/rte_prefetch.h"
 
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+
 static inline void rte_prefetch0(const volatile void *p)
 {
-	asm volatile ("prefetcht0 %[p]" : : [p] "m" (*(const volatile char *)p));
+	_mm_prefetch((const void *)p, _MM_HINT_T0);
 }
 
 static inline void rte_prefetch1(const volatile void *p)
 {
-	asm volatile ("prefetcht1 %[p]" : : [p] "m" (*(const volatile char *)p));
+	_mm_prefetch((const void *)p, _MM_HINT_T1);
 }
 
 static inline void rte_prefetch2(const volatile void *p)
 {
-	asm volatile ("prefetcht2 %[p]" : : [p] "m" (*(const volatile char *)p));
+	_mm_prefetch((const void *)p, _MM_HINT_T2);
 }
 
 static inline void rte_prefetch_non_temporal(const volatile void *p)
 {
-	asm volatile ("prefetchnta %[p]" : : [p] "m" (*(const volatile char *)p));
+	_mm_prefetch((const void *)p, _MM_HINT_NTA);
 }
 
+#pragma GCC diagnostic pop
+
+#ifdef RTE_TOOLCHAIN_MSVC
 /*
  * We use raw byte codes for now as only the newest compiler
  * versions support this instruction natively.
@@ -41,8 +49,17 @@  static inline void rte_prefetch_non_temporal(const volatile void *p)
 static inline void
 rte_cldemote(const volatile void *p)
 {
+	_mm_cldemote(p);
+}
+#else
+__rte_experimental
+static inline void
+rte_cldemote(const volatile void *p)
+{
 	asm volatile(".byte 0x0f, 0x1c, 0x06" :: "S" (p));
 }
+#endif
+
 
 #ifdef __cplusplus
 }