[v3,18/18] lpm: choose vector path at runtime
Commit Message
When choosing the vector path, the max SIMD bitwidth is now checked to
ensure a vector path is allowed. To do this, rather than the vector
lookup functions being called directly from apps, a generic lookup
function is called, which calls the vector functions if suitable.
Signed-off-by: Ciara Power <ciara.power@intel.com>
---
lib/librte_lpm/rte_lpm.h | 57 ++++++++++++++++++++++++++------
lib/librte_lpm/rte_lpm_altivec.h | 2 +-
lib/librte_lpm/rte_lpm_neon.h | 2 +-
lib/librte_lpm/rte_lpm_sse.h | 2 +-
4 files changed, 50 insertions(+), 13 deletions(-)
Comments
Hi Ciara,
On 30/09/2020 14:04, Ciara Power wrote:
> When choosing the vector path, max SIMD bitwidth is now checked to
> ensure a vector path is allowable. To do this, rather than the vector
> lookup functions being called directly from apps, a generic lookup
> function is called which will call the vector functions if suitable.
>
> Signed-off-by: Ciara Power <ciara.power@intel.com>
> ---
> lib/librte_lpm/rte_lpm.h | 57 ++++++++++++++++++++++++++------
> lib/librte_lpm/rte_lpm_altivec.h | 2 +-
> lib/librte_lpm/rte_lpm_neon.h | 2 +-
> lib/librte_lpm/rte_lpm_sse.h | 2 +-
> 4 files changed, 50 insertions(+), 13 deletions(-)
>
> diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
> index 03da2d37e0..edba7cafd5 100644
> --- a/lib/librte_lpm/rte_lpm.h
> +++ b/lib/librte_lpm/rte_lpm.h
> @@ -397,8 +397,18 @@ rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t *ips,
> /* Mask four results. */
> #define RTE_LPM_MASKX4_RES UINT64_C(0x00ffffff00ffffff)
>
> +#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
> +#include "rte_lpm_neon.h"
> +#elif defined(RTE_ARCH_PPC_64)
> +#include "rte_lpm_altivec.h"
> +#else
> +#include "rte_lpm_sse.h"
> +#endif
> +
> /**
> - * Lookup four IP addresses in an LPM table.
> + * Lookup four IP addresses in an LPM table individually by calling the
> + * lookup function for each ip. This is used when lookupx4 is called but
> + * the vector path is not suitable.
> *
> * @param lpm
> * LPM object handle
> @@ -417,16 +427,43 @@ rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t *ips,
> * if lookup would fail.
> */
> static inline void
> -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> - uint32_t defv);
> +rte_lpm_lookupx4_scalar(struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> + uint32_t defv)
> +{
> + int i;
> + for (i = 0; i < 4; i++)
> + if (rte_lpm_lookup(lpm, ((uint32_t *) &ip)[i], &hop[i]) < 0)
> + hop[i] = defv; /* lookupx4 expected to set on failure */
> +}
>
> -#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
> -#include "rte_lpm_neon.h"
> -#elif defined(RTE_ARCH_PPC_64)
> -#include "rte_lpm_altivec.h"
> -#else
> -#include "rte_lpm_sse.h"
> -#endif
> +/**
> + * Lookup four IP addresses in an LPM table.
> + *
> + * @param lpm
> + * LPM object handle
> + * @param ip
> + * Four IPs to be looked up in the LPM table
> + * @param hop
> + * Next hop of the most specific rule found for IP (valid on lookup hit only).
> + * This is an 4 elements array of two byte values.
> + * If the lookup was successful for the given IP, then least significant byte
> + * of the corresponding element is the actual next hop and the most
> + * significant byte is zero.
> + * If the lookup for the given IP failed, then corresponding element would
> + * contain default value, see description of then next parameter.
> + * @param defv
> + * Default value to populate into corresponding element of hop[] array,
> + * if lookup would fail.
> + */
> +static inline void
> +rte_lpm_lookupx4(struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> + uint32_t defv)
> +{
> + if (rte_get_max_simd_bitwidth() >= RTE_MAX_128_SIMD)
> + rte_lpm_lookupx4_vec(lpm, ip, hop, defv);
> + else
> + rte_lpm_lookupx4_scalar(lpm, ip, hop, defv);
> +}
I'm afraid this will lead to a drop in performance. rte_lpm_lookupx4 is
used in the hot path, and a bulk size of four is too small to amortize
the cost of adding this extra logic.
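To illustrate the cost concern, the check could at least be hoisted out of
the per-x4 path and amortized over a whole burst. This is only a sketch of
a hypothetical app-side helper (burst_lookup is not a real API), assuming
the _vec/_scalar functions stay public and rte_get_max_simd_bitwidth()
lives in rte_eal.h as in this series:

#include <stdbool.h>

#include <rte_eal.h> /* rte_get_max_simd_bitwidth(), per this series */
#include <rte_lpm.h>

/* Hypothetical app-side helper: pay for the bitwidth query and the
 * branch once per burst instead of once per four addresses. */
static void
burst_lookup(struct rte_lpm *lpm, const xmm_t *ips, uint32_t *hops,
		uint32_t defv, unsigned int n_x4)
{
	const bool use_vec =
		rte_get_max_simd_bitwidth() >= RTE_MAX_128_SIMD;
	unsigned int i;

	for (i = 0; i < n_x4; i++) {
		if (use_vec)
			rte_lpm_lookupx4_vec(lpm, ips[i], &hops[4 * i], defv);
		else
			rte_lpm_lookupx4_scalar(lpm, ips[i], &hops[4 * i], defv);
	}
}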
>
> #ifdef __cplusplus
> }
> diff --git a/lib/librte_lpm/rte_lpm_altivec.h b/lib/librte_lpm/rte_lpm_altivec.h
> index 228c41b38e..82142d3351 100644
> --- a/lib/librte_lpm/rte_lpm_altivec.h
> +++ b/lib/librte_lpm/rte_lpm_altivec.h
> @@ -16,7 +16,7 @@ extern "C" {
> #endif
>
> static inline void
> -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> uint32_t defv)
> {
> vector signed int i24;
> diff --git a/lib/librte_lpm/rte_lpm_neon.h b/lib/librte_lpm/rte_lpm_neon.h
> index 6c131d3125..14b184515d 100644
> --- a/lib/librte_lpm/rte_lpm_neon.h
> +++ b/lib/librte_lpm/rte_lpm_neon.h
> @@ -16,7 +16,7 @@ extern "C" {
> #endif
>
> static inline void
> -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> uint32_t defv)
> {
> uint32x4_t i24;
> diff --git a/lib/librte_lpm/rte_lpm_sse.h b/lib/librte_lpm/rte_lpm_sse.h
> index 44770b6ff8..cb5477c6cf 100644
> --- a/lib/librte_lpm/rte_lpm_sse.h
> +++ b/lib/librte_lpm/rte_lpm_sse.h
> @@ -15,7 +15,7 @@ extern "C" {
> #endif
>
> static inline void
> -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> uint32_t defv)
> {
> __m128i i24;
>
>
> Hi Ciara,
>
>
> On 30/09/2020 14:04, Ciara Power wrote:
> > When choosing the vector path, max SIMD bitwidth is now checked to
> > ensure a vector path is allowable. To do this, rather than the vector
> > lookup functions being called directly from apps, a generic lookup
> > function is called which will call the vector functions if suitable.
> >
> > Signed-off-by: Ciara Power <ciara.power@intel.com>
> > ---
> > lib/librte_lpm/rte_lpm.h | 57 ++++++++++++++++++++++++++------
> > lib/librte_lpm/rte_lpm_altivec.h | 2 +-
> > lib/librte_lpm/rte_lpm_neon.h | 2 +-
> > lib/librte_lpm/rte_lpm_sse.h | 2 +-
> > 4 files changed, 50 insertions(+), 13 deletions(-)
> >
> > diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
> > index 03da2d37e0..edba7cafd5 100644
> > --- a/lib/librte_lpm/rte_lpm.h
> > +++ b/lib/librte_lpm/rte_lpm.h
> > @@ -397,8 +397,18 @@ rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t *ips,
> > /* Mask four results. */
> > #define RTE_LPM_MASKX4_RES UINT64_C(0x00ffffff00ffffff)
> >
> > +#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
> > +#include "rte_lpm_neon.h"
> > +#elif defined(RTE_ARCH_PPC_64)
> > +#include "rte_lpm_altivec.h"
> > +#else
> > +#include "rte_lpm_sse.h"
> > +#endif
> > +
> > /**
> > - * Lookup four IP addresses in an LPM table.
> > + * Lookup four IP addresses in an LPM table individually by calling the
> > + * lookup function for each ip. This is used when lookupx4 is called but
> > + * the vector path is not suitable.
> > *
> > * @param lpm
> > * LPM object handle
> > @@ -417,16 +427,43 @@ rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t *ips,
> > * if lookup would fail.
> > */
> > static inline void
> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> > - uint32_t defv);
> > +rte_lpm_lookupx4_scalar(struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> > + uint32_t defv)
> > +{
> > + int i;
> > + for (i = 0; i < 4; i++)
> > + if (rte_lpm_lookup(lpm, ((uint32_t *) &ip)[i], &hop[i]) < 0)
> > + hop[i] = defv; /* lookupx4 expected to set on failure */
> > +}
> >
> > -#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
> > -#include "rte_lpm_neon.h"
> > -#elif defined(RTE_ARCH_PPC_64)
> > -#include "rte_lpm_altivec.h"
> > -#else
> > -#include "rte_lpm_sse.h"
> > -#endif
> > +/**
> > + * Lookup four IP addresses in an LPM table.
> > + *
> > + * @param lpm
> > + * LPM object handle
> > + * @param ip
> > + * Four IPs to be looked up in the LPM table
> > + * @param hop
> > + * Next hop of the most specific rule found for IP (valid on lookup hit only).
> > + * This is an 4 elements array of two byte values.
> > + * If the lookup was successful for the given IP, then least significant byte
> > + * of the corresponding element is the actual next hop and the most
> > + * significant byte is zero.
> > + * If the lookup for the given IP failed, then corresponding element would
> > + * contain default value, see description of then next parameter.
> > + * @param defv
> > + * Default value to populate into corresponding element of hop[] array,
> > + * if lookup would fail.
> > + */
> > +static inline void
> > +rte_lpm_lookupx4(struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> > + uint32_t defv)
> > +{
> > + if (rte_get_max_simd_bitwidth() >= RTE_MAX_128_SIMD)
> > + rte_lpm_lookupx4_vec(lpm, ip, hop, defv);
> > + else
> > + rte_lpm_lookupx4_scalar(lpm, ip, hop, defv);
> > +}
>
> I'm afraid this will lead to a drop in performance. rte_lpm_lookupx4 is
> used in the hot path, and a bulk size is too small to amortize the cost
> of adding this extra logic.
I do share Vladimir's concern regarding performance here.
As I said in the other mail, there seems to be little point in inserting
these checks into an inline SSE-specific function, as SSE is enabled
by default for all x86 builds.
As a more generic thought, it might be better to avoid
these checks in other public SIMD-specific inline functions (if any).
If such a function gets called from some .c file, then that SIMD
ISA is already enabled for that file, and I think this check should be
left to the caller, as the example below shows.
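For example (hypothetical application code, names invented), the policy
check could be done once by the caller at init time, keeping the inline
lookup functions check-free:

#include <stdbool.h>

#include <rte_eal.h> /* rte_get_max_simd_bitwidth(), per this series */

/* Set once during application init; read in the forwarding loop. */
static bool lpm_vec_allowed;

static void
app_simd_policy_init(void)
{
	lpm_vec_allowed =
		(rte_get_max_simd_bitwidth() >= RTE_MAX_128_SIMD);
}

The forwarding code can then branch on lpm_vec_allowed, or pick a function
pointer, without any per-lookup EAL call.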
> >
> > #ifdef __cplusplus
> > }
> > diff --git a/lib/librte_lpm/rte_lpm_altivec.h b/lib/librte_lpm/rte_lpm_altivec.h
> > index 228c41b38e..82142d3351 100644
> > --- a/lib/librte_lpm/rte_lpm_altivec.h
> > +++ b/lib/librte_lpm/rte_lpm_altivec.h
> > @@ -16,7 +16,7 @@ extern "C" {
> > #endif
> >
> > static inline void
> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> > +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> > uint32_t defv)
> > {
> > vector signed int i24;
> > diff --git a/lib/librte_lpm/rte_lpm_neon.h b/lib/librte_lpm/rte_lpm_neon.h
> > index 6c131d3125..14b184515d 100644
> > --- a/lib/librte_lpm/rte_lpm_neon.h
> > +++ b/lib/librte_lpm/rte_lpm_neon.h
> > @@ -16,7 +16,7 @@ extern "C" {
> > #endif
> >
> > static inline void
> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> > +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> > uint32_t defv)
> > {
> > uint32x4_t i24;
> > diff --git a/lib/librte_lpm/rte_lpm_sse.h b/lib/librte_lpm/rte_lpm_sse.h
> > index 44770b6ff8..cb5477c6cf 100644
> > --- a/lib/librte_lpm/rte_lpm_sse.h
> > +++ b/lib/librte_lpm/rte_lpm_sse.h
> > @@ -15,7 +15,7 @@ extern "C" {
> > #endif
> >
> > static inline void
> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> > +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> > uint32_t defv)
> > {
> > __m128i i24;
> >
>
> --
> Regards,
> Vladimir
On Wed, Sep 30, 2020 at 3:14 PM Ciara Power <ciara.power@intel.com> wrote:
>
> When choosing the vector path, max SIMD bitwidth is now checked to
> ensure a vector path is allowable. To do this, rather than the vector
> lookup functions being called directly from apps, a generic lookup
> function is called which will call the vector functions if suitable.
>
> Signed-off-by: Ciara Power <ciara.power@intel.com>
Got a build error on this patch with ./devtools/test-meson-builds.sh
("gcc-shared" target):
[2/3] Compiling C object
'examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o'.
FAILED: examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o
ccache gcc -Iexamples/c590b3c@@dpdk-l3fwd-thread@exe -Iexamples
-I../../dpdk/examples -Iexamples/performance-thread/l3fwd-thread
-I../../dpdk/examples/performance-thread/l3fwd-thread
-I../../dpdk/examples/performance-thread/l3fwd-thread/../common
-I../../dpdk/examples/performance-thread/l3fwd-thread/../common/arch/x86
-I. -I../../dpdk/ -Iconfig -I../../dpdk/config
-Ilib/librte_eal/include -I../../dpdk/lib/librte_eal/include
-Ilib/librte_eal/linux/include
-I../../dpdk/lib/librte_eal/linux/include -Ilib/librte_eal/x86/include
-I../../dpdk/lib/librte_eal/x86/include -Ilib/librte_eal/common
-I../../dpdk/lib/librte_eal/common -Ilib/librte_eal
-I../../dpdk/lib/librte_eal -Ilib/librte_kvargs
-I../../dpdk/lib/librte_kvargs
-Ilib/librte_telemetry/../librte_metrics
-I../../dpdk/lib/librte_telemetry/../librte_metrics
-Ilib/librte_telemetry -I../../dpdk/lib/librte_telemetry
-Ilib/librte_mempool -I../../dpdk/lib/librte_mempool -Ilib/librte_ring
-I../../dpdk/lib/librte_ring -Ilib/librte_net
-I../../dpdk/lib/librte_net -Ilib/librte_mbuf
-I../../dpdk/lib/librte_mbuf -Ilib/librte_ethdev
-I../../dpdk/lib/librte_ethdev -Ilib/librte_meter
-I../../dpdk/lib/librte_meter -Ilib/librte_cmdline
-I../../dpdk/lib/librte_cmdline -Ilib/librte_timer
-I../../dpdk/lib/librte_timer -Ilib/librte_lpm
-I../../dpdk/lib/librte_lpm -Ilib/librte_hash
-I../../dpdk/lib/librte_hash -Ilib/librte_rcu
-I../../dpdk/lib/librte_rcu
-I/home/dmarchan/intel-ipsec-mb/install/include
-fdiagnostics-color=always -pipe -D_FILE_OFFSET_BITS=64 -Wall
-Winvalid-pch -Werror -O2 -g -include rte_config.h -Wextra -Wcast-qual
-Wdeprecated -Wformat-nonliteral -Wformat-security
-Wmissing-declarations -Wmissing-prototypes -Wnested-externs
-Wold-style-definition -Wpointer-arith -Wsign-compare
-Wstrict-prototypes -Wundef -Wwrite-strings
-Wno-address-of-packed-member -Wno-packed-not-aligned
-Wno-missing-field-initializers -D_GNU_SOURCE -march=native
-Wno-format-truncation -DALLOW_EXPERIMENTAL_API -MD -MQ
'examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o'
-MF 'examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o.d'
-o 'examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o'
-c ../../dpdk/examples/performance-thread/l3fwd-thread/main.c
#‘target_mem_ref’ not supported by expression#’In file included from
../../dpdk/examples/performance-thread/l3fwd-thread/main.c:133:
../../dpdk/examples/performance-thread/l3fwd-thread/main.c: In
function ‘process_burst’:
../../dpdk/lib/librte_lpm/rte_lpm.h:435:7: error: may be used
uninitialized in this function [-Werror=maybe-uninitialized]
435 | if (rte_lpm_lookup(lpm, ((uint32_t *) &ip)[i], &hop[i]) < 0)
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
cc1: all warnings being treated as errors
ninja: build stopped: subcommand failed.
gcc (GCC) 9.3.1 20200408 (Red Hat 9.3.1-2)
On Thu, Oct 8, 2020 at 5:19 PM David Marchand <david.marchand@redhat.com> wrote:
>
> On Wed, Sep 30, 2020 at 3:14 PM Ciara Power <ciara.power@intel.com> wrote:
> >
> > When choosing the vector path, max SIMD bitwidth is now checked to
> > ensure a vector path is allowable. To do this, rather than the vector
> > lookup functions being called directly from apps, a generic lookup
> > function is called which will call the vector functions if suitable.
> >
> > Signed-off-by: Ciara Power <ciara.power@intel.com>
>
> Got a build error on this patch with ./devtools/test-meson-builds.sh
> ("gcc-shared" target):
>
> [2/3] Compiling C object
> 'examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o'.
> FAILED: examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o
> ccache gcc -Iexamples/c590b3c@@dpdk-l3fwd-thread@exe -Iexamples
> -I../../dpdk/examples -Iexamples/performance-thread/l3fwd-thread
> -I../../dpdk/examples/performance-thread/l3fwd-thread
> -I../../dpdk/examples/performance-thread/l3fwd-thread/../common
> -I../../dpdk/examples/performance-thread/l3fwd-thread/../common/arch/x86
> -I. -I../../dpdk/ -Iconfig -I../../dpdk/config
> -Ilib/librte_eal/include -I../../dpdk/lib/librte_eal/include
> -Ilib/librte_eal/linux/include
> -I../../dpdk/lib/librte_eal/linux/include -Ilib/librte_eal/x86/include
> -I../../dpdk/lib/librte_eal/x86/include -Ilib/librte_eal/common
> -I../../dpdk/lib/librte_eal/common -Ilib/librte_eal
> -I../../dpdk/lib/librte_eal -Ilib/librte_kvargs
> -I../../dpdk/lib/librte_kvargs
> -Ilib/librte_telemetry/../librte_metrics
> -I../../dpdk/lib/librte_telemetry/../librte_metrics
> -Ilib/librte_telemetry -I../../dpdk/lib/librte_telemetry
> -Ilib/librte_mempool -I../../dpdk/lib/librte_mempool -Ilib/librte_ring
> -I../../dpdk/lib/librte_ring -Ilib/librte_net
> -I../../dpdk/lib/librte_net -Ilib/librte_mbuf
> -I../../dpdk/lib/librte_mbuf -Ilib/librte_ethdev
> -I../../dpdk/lib/librte_ethdev -Ilib/librte_meter
> -I../../dpdk/lib/librte_meter -Ilib/librte_cmdline
> -I../../dpdk/lib/librte_cmdline -Ilib/librte_timer
> -I../../dpdk/lib/librte_timer -Ilib/librte_lpm
> -I../../dpdk/lib/librte_lpm -Ilib/librte_hash
> -I../../dpdk/lib/librte_hash -Ilib/librte_rcu
> -I../../dpdk/lib/librte_rcu
> -I/home/dmarchan/intel-ipsec-mb/install/include
> -fdiagnostics-color=always -pipe -D_FILE_OFFSET_BITS=64 -Wall
> -Winvalid-pch -Werror -O2 -g -include rte_config.h -Wextra -Wcast-qual
> -Wdeprecated -Wformat-nonliteral -Wformat-security
> -Wmissing-declarations -Wmissing-prototypes -Wnested-externs
> -Wold-style-definition -Wpointer-arith -Wsign-compare
> -Wstrict-prototypes -Wundef -Wwrite-strings
> -Wno-address-of-packed-member -Wno-packed-not-aligned
> -Wno-missing-field-initializers -D_GNU_SOURCE -march=native
> -Wno-format-truncation -DALLOW_EXPERIMENTAL_API -MD -MQ
> 'examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o'
> -MF 'examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o.d'
> -o 'examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o'
> -c ../../dpdk/examples/performance-thread/l3fwd-thread/main.c
> #‘target_mem_ref’ not supported by expression#’In file included from
> ../../dpdk/examples/performance-thread/l3fwd-thread/main.c:133:
> ../../dpdk/examples/performance-thread/l3fwd-thread/main.c: In
> function ‘process_burst’:
> ../../dpdk/lib/librte_lpm/rte_lpm.h:435:7: error: may be used
> uninitialized in this function [-Werror=maybe-uninitialized]
> 435 | if (rte_lpm_lookup(lpm, ((uint32_t *) &ip)[i], &hop[i]) < 0)
> | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> cc1: all warnings being treated as errors
> ninja: build stopped: subcommand failed.
>
> gcc (GCC) 9.3.1 20200408 (Red Hat 9.3.1-2)
On the build issue, I guess you can use an rte_xmm_t as a pass-through;
copying the vector into the union replaces the (uint32_t *) cast on &ip
that the maybe-uninitialized warning pointed at.
diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
index edba7cafd5..43db784a76 100644
--- a/lib/librte_lpm/rte_lpm.h
+++ b/lib/librte_lpm/rte_lpm.h
@@ -430,10 +430,14 @@ static inline void
rte_lpm_lookupx4_scalar(struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
uint32_t defv)
{
- int i;
- for (i = 0; i < 4; i++)
- if (rte_lpm_lookup(lpm, ((uint32_t *) &ip)[i], &hop[i]) < 0)
+ unsigned int i;
+ rte_xmm_t _ip;
+
+ _ip.x = ip;
+ for (i = 0; i < RTE_DIM(_ip.u32); i++) {
+ if (rte_lpm_lookup(lpm, _ip.u32[i], &hop[i]) < 0)
hop[i] = defv; /* lookupx4 expected to set on failure */
+ }
}
/**
Hi Konstantin,
>-----Original Message-----
>From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
>Sent: Thursday 8 October 2020 15:40
>To: Medvedkin, Vladimir <vladimir.medvedkin@intel.com>; Power, Ciara
><ciara.power@intel.com>; dev@dpdk.org
>Cc: Richardson, Bruce <bruce.richardson@intel.com>; Jerin Jacob
><jerinj@marvell.com>; Ruifeng Wang <ruifeng.wang@arm.com>
>Subject: RE: [dpdk-dev] [PATCH v3 18/18] lpm: choose vector path at runtime
>
>>
>> Hi Ciara,
>>
>>
>> On 30/09/2020 14:04, Ciara Power wrote:
>> > When choosing the vector path, max SIMD bitwidth is now checked to
>> > ensure a vector path is allowable. To do this, rather than the
>> > vector lookup functions being called directly from apps, a generic
>> > lookup function is called which will call the vector functions if suitable.
>> >
>> > Signed-off-by: Ciara Power <ciara.power@intel.com>
>> > ---
>> > lib/librte_lpm/rte_lpm.h | 57 ++++++++++++++++++++++++++------
>> > lib/librte_lpm/rte_lpm_altivec.h | 2 +-
>> > lib/librte_lpm/rte_lpm_neon.h | 2 +-
>> > lib/librte_lpm/rte_lpm_sse.h | 2 +-
>> > 4 files changed, 50 insertions(+), 13 deletions(-)
>> >
>> > diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
>> > index 03da2d37e0..edba7cafd5 100644
>> > --- a/lib/librte_lpm/rte_lpm.h
>> > +++ b/lib/librte_lpm/rte_lpm.h
>> > @@ -397,8 +397,18 @@ rte_lpm_lookup_bulk_func(const struct rte_lpm
>*lpm, const uint32_t *ips,
>> > /* Mask four results. */
>> > #define RTE_LPM_MASKX4_RES UINT64_C(0x00ffffff00ffffff)
>> >
>> > +#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) #include
>> > +"rte_lpm_neon.h"
>> > +#elif defined(RTE_ARCH_PPC_64)
>> > +#include "rte_lpm_altivec.h"
>> > +#else
>> > +#include "rte_lpm_sse.h"
>> > +#endif
>> > +
>> > /**
>> > - * Lookup four IP addresses in an LPM table.
>> > + * Lookup four IP addresses in an LPM table individually by calling
>> > + the
>> > + * lookup function for each ip. This is used when lookupx4 is
>> > + called but
>> > + * the vector path is not suitable.
>> > *
>> > * @param lpm
>> > * LPM object handle
>> > @@ -417,16 +427,43 @@ rte_lpm_lookup_bulk_func(const struct
>rte_lpm *lpm, const uint32_t *ips,
>> > * if lookup would fail.
>> > */
>> > static inline void
>> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
>> > - uint32_t defv);
>> > +rte_lpm_lookupx4_scalar(struct rte_lpm *lpm, xmm_t ip, uint32_t
>hop[4],
>> > + uint32_t defv)
>> > +{
>> > + int i;
>> > + for (i = 0; i < 4; i++)
>> > + if (rte_lpm_lookup(lpm, ((uint32_t *) &ip)[i], &hop[i]) < 0)
>> > + hop[i] = defv; /* lookupx4 expected to set on failure
>*/ }
>> >
>> > -#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) -#include
>> > "rte_lpm_neon.h"
>> > -#elif defined(RTE_ARCH_PPC_64)
>> > -#include "rte_lpm_altivec.h"
>> > -#else
>> > -#include "rte_lpm_sse.h"
>> > -#endif
>> > +/**
>> > + * Lookup four IP addresses in an LPM table.
>> > + *
>> > + * @param lpm
>> > + * LPM object handle
>> > + * @param ip
>> > + * Four IPs to be looked up in the LPM table
>> > + * @param hop
>> > + * Next hop of the most specific rule found for IP (valid on lookup hit
>only).
>> > + * This is an 4 elements array of two byte values.
>> > + * If the lookup was successful for the given IP, then least significant
>byte
>> > + * of the corresponding element is the actual next hop and the most
>> > + * significant byte is zero.
>> > + * If the lookup for the given IP failed, then corresponding element
>would
>> > + * contain default value, see description of then next parameter.
>> > + * @param defv
>> > + * Default value to populate into corresponding element of hop[] array,
>> > + * if lookup would fail.
>> > + */
>> > +static inline void
>> > +rte_lpm_lookupx4(struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
>> > + uint32_t defv)
>> > +{
>> > + if (rte_get_max_simd_bitwidth() >= RTE_MAX_128_SIMD)
>> > + rte_lpm_lookupx4_vec(lpm, ip, hop, defv);
>> > + else
>> > + rte_lpm_lookupx4_scalar(lpm, ip, hop, defv); }
>>
>> I'm afraid this will lead to a drop in performance. rte_lpm_lookupx4
>> is used in the hot path, and a bulk size is too small to amortize the
>> cost of adding this extra logic.
>
>I do share Vladimir's concern regarding performance here.
>As I said in other mail - it seems not much point to insert these checks into
>inline SSE specific function, as SSE is enabled by default for all x86 builds.
>
The performance impact is quite small; thanks to Vladimir for providing these results:
before patches:
LPM LookupX4: 25.1 cycles (fails = 12.5%)
LPM LookupX4: 25.2 cycles (fails = 12.5%)
LPM LookupX4: 25.2 cycles (fails = 12.5%)
v3:
LPM LookupX4: 26.2 cycles (fails = 12.5%)
LPM LookupX4: 26.2 cycles (fails = 12.5%)
LPM LookupX4: 26.2 cycles (fails = 12.5%)
v4:
Note: I haven't sent this publicly yet; it modifies v3 slightly to check the bitwidth
in LPM create and set a flag that lookupx4 then uses to choose either the vector
or the scalar function (a rough sketch follows the numbers below).
LPM LookupX4: 25.5 cycles (fails = 12.5%)
LPM LookupX4: 25.5 cycles (fails = 12.5%)
LPM LookupX4: 25.5 cycles (fails = 12.5%)
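For reference, roughly what that v4 flag approach looks like. This is only
a sketch with made-up names (the lookupx4_vec field and the helper do not
exist anywhere yet, since v4 is unsent):

/* Sketch: assumes struct rte_lpm gained a hypothetical bool field
 * 'lookupx4_vec', filled in once at table-creation time. */
static void
lpm_set_lookupx4_flag(struct rte_lpm *lpm)
{
	/* Would be called from rte_lpm_create(). */
	lpm->lookupx4_vec =
		(rte_get_max_simd_bitwidth() >= RTE_MAX_128_SIMD);
}

static inline void
rte_lpm_lookupx4(struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
		uint32_t defv)
{
	/* Hot path tests a cached per-table field: no EAL call per
	 * lookup, which is presumably where the cycles are recovered. */
	if (lpm->lookupx4_vec)
		rte_lpm_lookupx4_vec(lpm, ip, hop, defv);
	else
		rte_lpm_lookupx4_scalar(lpm, ip, hop, defv);
}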
Thanks,
Ciara
>As another more generic thought - might be better to avoid these checks in
>other public SIMD-specific inline functions (if any).
>If such function get called from some .c, then at least such SIMD ISA is
>already enabled for that .c file and I think this check should be
>left for caller to do.
>
>> >
>> > #ifdef __cplusplus
>> > }
>> > diff --git a/lib/librte_lpm/rte_lpm_altivec.h
>> > b/lib/librte_lpm/rte_lpm_altivec.h
>> > index 228c41b38e..82142d3351 100644
>> > --- a/lib/librte_lpm/rte_lpm_altivec.h
>> > +++ b/lib/librte_lpm/rte_lpm_altivec.h
>> > @@ -16,7 +16,7 @@ extern "C" {
>> > #endif
>> >
>> > static inline void
>> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t
>> > hop[4],
>> > +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t
>> > +hop[4],
>> > uint32_t defv)
>> > {
>> > vector signed int i24;
>> > diff --git a/lib/librte_lpm/rte_lpm_neon.h
>> > b/lib/librte_lpm/rte_lpm_neon.h index 6c131d3125..14b184515d 100644
>> > --- a/lib/librte_lpm/rte_lpm_neon.h
>> > +++ b/lib/librte_lpm/rte_lpm_neon.h
>> > @@ -16,7 +16,7 @@ extern "C" {
>> > #endif
>> >
>> > static inline void
>> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t
>> > hop[4],
>> > +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t
>> > +hop[4],
>> > uint32_t defv)
>> > {
>> > uint32x4_t i24;
>> > diff --git a/lib/librte_lpm/rte_lpm_sse.h
>> > b/lib/librte_lpm/rte_lpm_sse.h index 44770b6ff8..cb5477c6cf 100644
>> > --- a/lib/librte_lpm/rte_lpm_sse.h
>> > +++ b/lib/librte_lpm/rte_lpm_sse.h
>> > @@ -15,7 +15,7 @@ extern "C" {
>> > #endif
>> >
>> > static inline void
>> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t
>> > hop[4],
>> > +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t
>> > +hop[4],
>> > uint32_t defv)
>> > {
>> > __m128i i24;
>> >
>>
>> --
>> Regards,
>> Vladimir
Hi Ciara,
> Hi Konstantin,
>
>
> >-----Original Message-----
> >From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> >Sent: Thursday 8 October 2020 15:40
> >To: Medvedkin, Vladimir <vladimir.medvedkin@intel.com>; Power, Ciara
> ><ciara.power@intel.com>; dev@dpdk.org
> >Cc: Richardson, Bruce <bruce.richardson@intel.com>; Jerin Jacob
> ><jerinj@marvell.com>; Ruifeng Wang <ruifeng.wang@arm.com>
> >Subject: RE: [dpdk-dev] [PATCH v3 18/18] lpm: choose vector path at runtime
> >
> >>
> >> Hi Ciara,
> >>
> >>
> >> On 30/09/2020 14:04, Ciara Power wrote:
> >> > When choosing the vector path, max SIMD bitwidth is now checked to
> >> > ensure a vector path is allowable. To do this, rather than the
> >> > vector lookup functions being called directly from apps, a generic
> >> > lookup function is called which will call the vector functions if suitable.
> >> >
> >> > Signed-off-by: Ciara Power <ciara.power@intel.com>
> >> > ---
> >> > lib/librte_lpm/rte_lpm.h | 57 ++++++++++++++++++++++++++------
> >> > lib/librte_lpm/rte_lpm_altivec.h | 2 +-
> >> > lib/librte_lpm/rte_lpm_neon.h | 2 +-
> >> > lib/librte_lpm/rte_lpm_sse.h | 2 +-
> >> > 4 files changed, 50 insertions(+), 13 deletions(-)
> >> >
> >> > diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
> >> > index 03da2d37e0..edba7cafd5 100644
> >> > --- a/lib/librte_lpm/rte_lpm.h
> >> > +++ b/lib/librte_lpm/rte_lpm.h
> >> > @@ -397,8 +397,18 @@ rte_lpm_lookup_bulk_func(const struct rte_lpm
> >*lpm, const uint32_t *ips,
> >> > /* Mask four results. */
> >> > #define RTE_LPM_MASKX4_RES UINT64_C(0x00ffffff00ffffff)
> >> >
> >> > +#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) #include
> >> > +"rte_lpm_neon.h"
> >> > +#elif defined(RTE_ARCH_PPC_64)
> >> > +#include "rte_lpm_altivec.h"
> >> > +#else
> >> > +#include "rte_lpm_sse.h"
> >> > +#endif
> >> > +
> >> > /**
> >> > - * Lookup four IP addresses in an LPM table.
> >> > + * Lookup four IP addresses in an LPM table individually by calling
> >> > + the
> >> > + * lookup function for each ip. This is used when lookupx4 is
> >> > + called but
> >> > + * the vector path is not suitable.
> >> > *
> >> > * @param lpm
> >> > * LPM object handle
> >> > @@ -417,16 +427,43 @@ rte_lpm_lookup_bulk_func(const struct
> >rte_lpm *lpm, const uint32_t *ips,
> >> > * if lookup would fail.
> >> > */
> >> > static inline void
> >> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> >> > - uint32_t defv);
> >> > +rte_lpm_lookupx4_scalar(struct rte_lpm *lpm, xmm_t ip, uint32_t
> >hop[4],
> >> > + uint32_t defv)
> >> > +{
> >> > + int i;
> >> > + for (i = 0; i < 4; i++)
> >> > + if (rte_lpm_lookup(lpm, ((uint32_t *) &ip)[i], &hop[i]) < 0)
> >> > + hop[i] = defv; /* lookupx4 expected to set on failure
> >*/ }
> >> >
> >> > -#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) -#include
> >> > "rte_lpm_neon.h"
> >> > -#elif defined(RTE_ARCH_PPC_64)
> >> > -#include "rte_lpm_altivec.h"
> >> > -#else
> >> > -#include "rte_lpm_sse.h"
> >> > -#endif
> >> > +/**
> >> > + * Lookup four IP addresses in an LPM table.
> >> > + *
> >> > + * @param lpm
> >> > + * LPM object handle
> >> > + * @param ip
> >> > + * Four IPs to be looked up in the LPM table
> >> > + * @param hop
> >> > + * Next hop of the most specific rule found for IP (valid on lookup hit
> >only).
> >> > + * This is an 4 elements array of two byte values.
> >> > + * If the lookup was successful for the given IP, then least significant
> >byte
> >> > + * of the corresponding element is the actual next hop and the most
> >> > + * significant byte is zero.
> >> > + * If the lookup for the given IP failed, then corresponding element
> >would
> >> > + * contain default value, see description of then next parameter.
> >> > + * @param defv
> >> > + * Default value to populate into corresponding element of hop[] array,
> >> > + * if lookup would fail.
> >> > + */
> >> > +static inline void
> >> > +rte_lpm_lookupx4(struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> >> > + uint32_t defv)
> >> > +{
> >> > + if (rte_get_max_simd_bitwidth() >= RTE_MAX_128_SIMD)
> >> > + rte_lpm_lookupx4_vec(lpm, ip, hop, defv);
> >> > + else
> >> > + rte_lpm_lookupx4_scalar(lpm, ip, hop, defv); }
> >>
> >> I'm afraid this will lead to a drop in performance. rte_lpm_lookupx4
> >> is used in the hot path, and a bulk size is too small to amortize the
> >> cost of adding this extra logic.
> >
> >I do share Vladimir's concern regarding performance here.
> >As I said in other mail - it seems not much point to insert these checks into
> >inline SSE specific function, as SSE is enabled by default for all x86 builds.
> >
>
> The performance impact is quite small, thanks Vladimir for providing these results:
>
> before patches:
> LPM LookupX4: 25.1 cycles (fails = 12.5%)
> LPM LookupX4: 25.2 cycles (fails = 12.5%)
> LPM LookupX4: 25.2 cycles (fails = 12.5%)
>
> v3:
> LPM LookupX4: 26.2 cycles (fails = 12.5%)
> LPM LookupX4: 26.2 cycles (fails = 12.5%)
> LPM LookupX4: 26.2 cycles (fails = 12.5%)
Yes, the perf difference is surprisingly small...
I wonder what tests you used for that?
I'd expect that on l3fwd it would be more noticeable,
especially on machines with low-end CPUs.
> v4:
> Note: I haven't sent this publicly yet, modified v3 slightly to check the bitwidth
> in LPM create and set a flag that is used in lookupx4 to choose either vector or scalar function.
Yes, avoiding the function call will definitely help here.
Though I am still not convinced we have to make such checks in that function
(or in other inline functions) at all.
Inline functions are compiled within user code, and their behaviour should be
controlled together with the rest of the user code.
Let's say in l3fwd for IA, rte_lpm_lookupx4 is called from l3fwd_lpm_sse.h,
which, as the name implies, is supposed to be built and used with SSE enabled.
If we'd like l3fwd to obey the 'max-simd-width' parameter, then it needs to be done
somewhere at startup, where the behaviour is selected, not inside every inline
function that uses SSE intrinsics.
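To make that concrete, a sketch of such startup-time selection; the
process_burst_* names are invented here, not actual l3fwd code:

#include <rte_eal.h> /* rte_get_max_simd_bitwidth(), per this series */
#include <rte_lpm.h>
#include <rte_mbuf.h>

/* Hypothetical per-path burst handlers, built in separate objects. */
void process_burst_sse(struct rte_lpm *lpm, struct rte_mbuf **pkts,
		uint16_t n);    /* internally uses rte_lpm_lookupx4 */
void process_burst_scalar(struct rte_lpm *lpm, struct rte_mbuf **pkts,
		uint16_t n);    /* internally uses rte_lpm_lookup */

typedef void (*burst_fn_t)(struct rte_lpm *, struct rte_mbuf **, uint16_t);

static burst_fn_t process_burst;

static void
select_burst_path(void)
{
	/* Behaviour selected once at startup: the SSE-built code, and
	 * therefore rte_lpm_lookupx4, is only entered when the EAL
	 * limit allows 128-bit SIMD. */
	if (rte_get_max_simd_bitwidth() >= RTE_MAX_128_SIMD)
		process_burst = process_burst_sse;
	else
		process_burst = process_burst_scalar;
}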
> LPM LookupX4: 25.5 cycles (fails = 12.5%)
> LPM LookupX4: 25.5 cycles (fails = 12.5%)
> LPM LookupX4: 25.5 cycles (fails = 12.5%)
>
>
> Thanks,
> Ciara
>
> >As another more generic thought - might be better to avoid these checks in
> >other public SIMD-specific inline functions (if any).
> >If such function get called from some .c, then at least such SIMD ISA is
> >already enabled for that .c file and I think this check should be
> >left for caller to do.
> >
> >> >
> >> > #ifdef __cplusplus
> >> > }
> >> > diff --git a/lib/librte_lpm/rte_lpm_altivec.h
> >> > b/lib/librte_lpm/rte_lpm_altivec.h
> >> > index 228c41b38e..82142d3351 100644
> >> > --- a/lib/librte_lpm/rte_lpm_altivec.h
> >> > +++ b/lib/librte_lpm/rte_lpm_altivec.h
> >> > @@ -16,7 +16,7 @@ extern "C" {
> >> > #endif
> >> >
> >> > static inline void
> >> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t
> >> > hop[4],
> >> > +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t
> >> > +hop[4],
> >> > uint32_t defv)
> >> > {
> >> > vector signed int i24;
> >> > diff --git a/lib/librte_lpm/rte_lpm_neon.h
> >> > b/lib/librte_lpm/rte_lpm_neon.h index 6c131d3125..14b184515d 100644
> >> > --- a/lib/librte_lpm/rte_lpm_neon.h
> >> > +++ b/lib/librte_lpm/rte_lpm_neon.h
> >> > @@ -16,7 +16,7 @@ extern "C" {
> >> > #endif
> >> >
> >> > static inline void
> >> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t
> >> > hop[4],
> >> > +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t
> >> > +hop[4],
> >> > uint32_t defv)
> >> > {
> >> > uint32x4_t i24;
> >> > diff --git a/lib/librte_lpm/rte_lpm_sse.h
> >> > b/lib/librte_lpm/rte_lpm_sse.h index 44770b6ff8..cb5477c6cf 100644
> >> > --- a/lib/librte_lpm/rte_lpm_sse.h
> >> > +++ b/lib/librte_lpm/rte_lpm_sse.h
> >> > @@ -15,7 +15,7 @@ extern "C" {
> >> > #endif
> >> >
> >> > static inline void
> >> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t
> >> > hop[4],
> >> > +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t
> >> > +hop[4],
> >> > uint32_t defv)
> >> > {
> >> > __m128i i24;
> >> >
> >>
> >> --
> >> Regards,
> >> Vladimir
diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
index 03da2d37e0..edba7cafd5 100644
--- a/lib/librte_lpm/rte_lpm.h
+++ b/lib/librte_lpm/rte_lpm.h
@@ -397,8 +397,18 @@ rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t *ips,
/* Mask four results. */
#define RTE_LPM_MASKX4_RES UINT64_C(0x00ffffff00ffffff)
+#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
+#include "rte_lpm_neon.h"
+#elif defined(RTE_ARCH_PPC_64)
+#include "rte_lpm_altivec.h"
+#else
+#include "rte_lpm_sse.h"
+#endif
+
/**
- * Lookup four IP addresses in an LPM table.
+ * Lookup four IP addresses in an LPM table individually by calling the
+ * lookup function for each ip. This is used when lookupx4 is called but
+ * the vector path is not suitable.
*
* @param lpm
* LPM object handle
@@ -417,16 +427,43 @@ rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t *ips,
* if lookup would fail.
*/
static inline void
-rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
- uint32_t defv);
+rte_lpm_lookupx4_scalar(struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
+ uint32_t defv)
+{
+ int i;
+ for (i = 0; i < 4; i++)
+ if (rte_lpm_lookup(lpm, ((uint32_t *) &ip)[i], &hop[i]) < 0)
+ hop[i] = defv; /* lookupx4 expected to set on failure */
+}
-#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
-#include "rte_lpm_neon.h"
-#elif defined(RTE_ARCH_PPC_64)
-#include "rte_lpm_altivec.h"
-#else
-#include "rte_lpm_sse.h"
-#endif
+/**
+ * Lookup four IP addresses in an LPM table.
+ *
+ * @param lpm
+ * LPM object handle
+ * @param ip
+ * Four IPs to be looked up in the LPM table
+ * @param hop
+ * Next hop of the most specific rule found for IP (valid on lookup hit only).
+ * This is an 4 elements array of two byte values.
+ * If the lookup was successful for the given IP, then least significant byte
+ * of the corresponding element is the actual next hop and the most
+ * significant byte is zero.
+ * If the lookup for the given IP failed, then corresponding element would
+ * contain default value, see description of then next parameter.
+ * @param defv
+ * Default value to populate into corresponding element of hop[] array,
+ * if lookup would fail.
+ */
+static inline void
+rte_lpm_lookupx4(struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
+ uint32_t defv)
+{
+ if (rte_get_max_simd_bitwidth() >= RTE_MAX_128_SIMD)
+ rte_lpm_lookupx4_vec(lpm, ip, hop, defv);
+ else
+ rte_lpm_lookupx4_scalar(lpm, ip, hop, defv);
+}
#ifdef __cplusplus
}
@@ -16,7 +16,7 @@ extern "C" {
#endif
static inline void
-rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
+rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
uint32_t defv)
{
vector signed int i24;
@@ -16,7 +16,7 @@ extern "C" {
#endif
static inline void
-rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
+rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
uint32_t defv)
{
uint32x4_t i24;
@@ -15,7 +15,7 @@ extern "C" {
#endif
static inline void
-rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
+rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
uint32_t defv)
{
__m128i i24;