[dpdk-dev,v2,15/16] lpm/arm: implement rte_lpm_lookupx4 using rte_lpm_lookup_bulk on non-x86

Message ID 1445877458-31052-16-git-send-email-viktorin@rehivetech.com (mailing list archive)
State Superseded, archived
Headers

Commit Message

Jan Viktorin Oct. 26, 2015, 4:37 p.m. UTC
  From: Vlastimil Kosar <kosar@rehivetech.com>

LPM function rte_lpm_lookupx4() uses i686/x86_64 SIMD intrinsics. Therefore,
the function is reimplemented using non-vector operations for non-x86
architectures. In the future, each architecture should have vectorized code.
This patch includes rudimentary emulation of intrinsic functions _mm_set_epi32(),
_mm_loadu_si128() and _mm_load_si128() for easy portability of existing
applications.

With this change, LPM now builds on ARM.

FIXME: to be reworked

Signed-off-by: Vlastimil Kosar <kosar@rehivetech.com>
Signed-off-by: Jan Viktorin <viktorin@rehivetech.com>
---
 config/defconfig_arm-armv7-a-linuxapp-gcc |  1 -
 lib/librte_lpm/rte_lpm.h                  | 71 +++++++++++++++++++++++++++++++
 2 files changed, 71 insertions(+), 1 deletion(-)
  

Comments

Ananyev, Konstantin Oct. 27, 2015, 3:31 p.m. UTC | #1
Hi Jan,

> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jan Viktorin
> Sent: Monday, October 26, 2015 4:38 PM
> To: Thomas Monjalon; Hunt, David; dev@dpdk.org
> Cc: Vlastimil Kosar
> Subject: [dpdk-dev] [PATCH v2 15/16] lpm/arm: implement rte_lpm_lookupx4 using rte_lpm_lookup_bulk on for-x86
> 
> From: Vlastimil Kosar <kosar@rehivetech.com>
> 
> LPM function rte_lpm_lookupx4() uses i686/x86_64 SIMD intrinsics. Therefore,
> the function is reimplemented using non-vector operations for non-x86
> architectures. In the future, each architecture should have vectorized code.
> This patch includes rudimentary emulation of intrinsic functions _mm_set_epi32(),
> _mm_loadu_si128() and _mm_load_si128() for easy portability of existing
> applications.
> 
> LPM builds now when on ARM.
> 
> FIXME: to be reworked
> 
> Signed-off-by: Vlastimil Kosar <kosar@rehivetech.com>
> Signed-off-by: Jan Viktorin <viktorin@rehivetech.com>
> ---
>  config/defconfig_arm-armv7-a-linuxapp-gcc |  1 -
>  lib/librte_lpm/rte_lpm.h                  | 71 +++++++++++++++++++++++++++++++
>  2 files changed, 71 insertions(+), 1 deletion(-)
> 
> diff --git a/config/defconfig_arm-armv7-a-linuxapp-gcc b/config/defconfig_arm-armv7-a-linuxapp-gcc
> index 5b582a8..33afb33 100644
> --- a/config/defconfig_arm-armv7-a-linuxapp-gcc
> +++ b/config/defconfig_arm-armv7-a-linuxapp-gcc
> @@ -58,7 +58,6 @@ CONFIG_XMM_SIZE=16
> 
>  # fails to compile on ARM
>  CONFIG_RTE_LIBRTE_ACL=n
> -CONFIG_RTE_LIBRTE_LPM=n
> 
>  # cannot use those on ARM
>  CONFIG_RTE_KNI_KMOD=n
> diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
> index c299ce2..4619992 100644
> --- a/lib/librte_lpm/rte_lpm.h
> +++ b/lib/librte_lpm/rte_lpm.h
> @@ -47,7 +47,9 @@
>  #include <rte_byteorder.h>
>  #include <rte_memory.h>
>  #include <rte_common.h>
> +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_I686)
>  #include <rte_vect.h>
> +#endif
> 
>  #ifdef __cplusplus
>  extern "C" {
> @@ -358,6 +360,7 @@ rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t * ips,
>  	return 0;
>  }
> 
> +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_I686)
>  /* Mask four results. */
>  #define	 RTE_LPM_MASKX4_RES	UINT64_C(0x00ff00ff00ff00ff)
> 
> @@ -472,6 +475,74 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, __m128i ip, uint16_t hop[4],
>  	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[2] : defv;
>  	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[3] : defv;
>  }
> +#else

Probably better to create a lib/librte_eal/common/include/arch/arm/rte_vect.h
and move all of this x86 vector-support emulation there?
Konstantin

> +// TODO: this code should be reworked.
> +
> +typedef struct {
> +	union uint128 {
> +		uint8_t uint8[16];
> +		uint32_t uint32[4];
> +	} val;
> +} __m128i;
> +
> +static inline __m128i
> +_mm_set_epi32(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3)
> +{
> +	__m128i res;
> +	res.val.uint32[0] = v0;
> +	res.val.uint32[1] = v1;
> +	res.val.uint32[2] = v2;
> +	res.val.uint32[3] = v3;
> +	return res;
> +}
> +
> +static inline __m128i
> +_mm_loadu_si128(__m128i * v)
> +{
> +	__m128i res;
> +	res = *v;
> +	return res;
> +}
> +
> +static inline __m128i
> +_mm_load_si128(__m128i * v)
> +{
> +	__m128i res;
> +	res = *v;
> +	return res;
> +}
> +
> +/**
> + * Lookup four IP addresses in an LPM table.
> + *
> + * @param lpm
> + *   LPM object handle
> + * @param ip
> + *   Four IPs to be looked up in the LPM table
> + * @param hop
> + *   Next hop of the most specific rule found for IP (valid on lookup hit only).
> + *   This is an 4 elements array of two byte values.
> + *   If the lookup was succesfull for the given IP, then least significant byte
> + *   of the corresponding element is the  actual next hop and the most
> + *   significant byte is zero.
> + *   If the lookup for the given IP failed, then corresponding element would
> + *   contain default value, see description of then next parameter.
> + * @param defv
> + *   Default value to populate into corresponding element of hop[] array,
> + *   if lookup would fail.
> + */
> +static inline void
> +rte_lpm_lookupx4(const struct rte_lpm *lpm, __m128i ip, uint16_t hop[4],
> +	uint16_t defv)
> +{
> +	rte_lpm_lookup_bulk(lpm, ip.val.uint32, hop, 4);
> +
> +	hop[0] = (hop[0] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)hop[0] : defv;
> +	hop[1] = (hop[1] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)hop[1] : defv;
> +	hop[2] = (hop[2] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)hop[2] : defv;
> +	hop[3] = (hop[3] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)hop[3] : defv;
> +}
> +#endif
> 
>  #ifdef __cplusplus
>  }
> --
> 2.6.1
  
Jan Viktorin Oct. 27, 2015, 3:38 p.m. UTC | #2
Hi Konstantin,

On Tue, 27 Oct 2015 15:31:44 +0000
"Ananyev, Konstantin" <konstantin.ananyev@intel.com> wrote:

> Hi Jan,
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Jan Viktorin
> > Sent: Monday, October 26, 2015 4:38 PM
> > To: Thomas Monjalon; Hunt, David; dev@dpdk.org
> > Cc: Vlastimil Kosar
> > Subject: [dpdk-dev] [PATCH v2 15/16] lpm/arm: implement rte_lpm_lookupx4 using rte_lpm_lookup_bulk on for-x86
> > 
> > From: Vlastimil Kosar <kosar@rehivetech.com>
> > 
> > LPM function rte_lpm_lookupx4() uses i686/x86_64 SIMD intrinsics. Therefore,
> > the function is reimplemented using non-vector operations for non-x86
> > architectures. In the future, each architecture should have vectorized code.
> > This patch includes rudimentary emulation of intrinsic functions _mm_set_epi32(),
> > _mm_loadu_si128() and _mm_load_si128() for easy portability of existing
> > applications.
> > 
> > LPM builds now when on ARM.
> > 
> > FIXME: to be reworked
> > 
> > Signed-off-by: Vlastimil Kosar <kosar@rehivetech.com>
> > Signed-off-by: Jan Viktorin <viktorin@rehivetech.com>
> > ---
> >  config/defconfig_arm-armv7-a-linuxapp-gcc |  1 -
> >  lib/librte_lpm/rte_lpm.h                  | 71 +++++++++++++++++++++++++++++++
> >  2 files changed, 71 insertions(+), 1 deletion(-)
> > 
> > diff --git a/config/defconfig_arm-armv7-a-linuxapp-gcc b/config/defconfig_arm-armv7-a-linuxapp-gcc
> > index 5b582a8..33afb33 100644
> > --- a/config/defconfig_arm-armv7-a-linuxapp-gcc
> > +++ b/config/defconfig_arm-armv7-a-linuxapp-gcc
> > @@ -58,7 +58,6 @@ CONFIG_XMM_SIZE=16
> > 
> >  # fails to compile on ARM
> >  CONFIG_RTE_LIBRTE_ACL=n
> > -CONFIG_RTE_LIBRTE_LPM=n
> > 
> >  # cannot use those on ARM
> >  CONFIG_RTE_KNI_KMOD=n
> > diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
> > index c299ce2..4619992 100644
> > --- a/lib/librte_lpm/rte_lpm.h
> > +++ b/lib/librte_lpm/rte_lpm.h
> > @@ -47,7 +47,9 @@
> >  #include <rte_byteorder.h>
> >  #include <rte_memory.h>
> >  #include <rte_common.h>
> > +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_I686)
> >  #include <rte_vect.h>
> > +#endif
> > 
> >  #ifdef __cplusplus
> >  extern "C" {
> > @@ -358,6 +360,7 @@ rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t * ips,
> >  	return 0;
> >  }
> > 
> > +#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_I686)
> >  /* Mask four results. */
> >  #define	 RTE_LPM_MASKX4_RES	UINT64_C(0x00ff00ff00ff00ff)
> > 
> > @@ -472,6 +475,74 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, __m128i ip, uint16_t hop[4],
> >  	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[2] : defv;
> >  	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[3] : defv;
> >  }
> > +#else  
> 
> Probably better to create an lib/librte_eal/common/include/arch/arm/rte_vect.h,
> and move all these x86 vector support emulation there?
> Konstantin

Sure. This patch is terribly wrong and it's not to be merged. It is a
question whether to make it this way (with the refactoring as you
suggested) or to make some general abstraction of the SSE calls in DPDK.

Jan

> 
> > +// TODO: this code should be reworked.
> > +
> > +typedef struct {
> > +	union uint128 {
> > +		uint8_t uint8[16];
> > +		uint32_t uint32[4];
> > +	} val;
> > +} __m128i;
> > +
> > +static inline __m128i
> > +_mm_set_epi32(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3)
> > +{
> > +	__m128i res;
> > +	res.val.uint32[0] = v0;
> > +	res.val.uint32[1] = v1;
> > +	res.val.uint32[2] = v2;
> > +	res.val.uint32[3] = v3;
> > +	return res;
> > +}
> > +
> > +static inline __m128i
> > +_mm_loadu_si128(__m128i * v)
> > +{
> > +	__m128i res;
> > +	res = *v;
> > +	return res;
> > +}
> > +
> > +static inline __m128i
> > +_mm_load_si128(__m128i * v)
> > +{
> > +	__m128i res;
> > +	res = *v;
> > +	return res;
> > +}
> > +
> > +/**
> > + * Lookup four IP addresses in an LPM table.
> > + *
> > + * @param lpm
> > + *   LPM object handle
> > + * @param ip
> > + *   Four IPs to be looked up in the LPM table
> > + * @param hop
> > + *   Next hop of the most specific rule found for IP (valid on lookup hit only).
> > + *   This is an 4 elements array of two byte values.
> > + *   If the lookup was succesfull for the given IP, then least significant byte
> > + *   of the corresponding element is the  actual next hop and the most
> > + *   significant byte is zero.
> > + *   If the lookup for the given IP failed, then corresponding element would
> > + *   contain default value, see description of then next parameter.
> > + * @param defv
> > + *   Default value to populate into corresponding element of hop[] array,
> > + *   if lookup would fail.
> > + */
> > +static inline void
> > +rte_lpm_lookupx4(const struct rte_lpm *lpm, __m128i ip, uint16_t hop[4],
> > +	uint16_t defv)
> > +{
> > +	rte_lpm_lookup_bulk(lpm, ip.val.uint32, hop, 4);
> > +
> > +	hop[0] = (hop[0] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)hop[0] : defv;
> > +	hop[1] = (hop[1] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)hop[1] : defv;
> > +	hop[2] = (hop[2] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)hop[2] : defv;
> > +	hop[3] = (hop[3] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)hop[3] : defv;
> > +}
> > +#endif
> > 
> >  #ifdef __cplusplus
> >  }
> > --
> > 2.6.1  
>
  

Patch

diff --git a/config/defconfig_arm-armv7-a-linuxapp-gcc b/config/defconfig_arm-armv7-a-linuxapp-gcc
index 5b582a8..33afb33 100644
--- a/config/defconfig_arm-armv7-a-linuxapp-gcc
+++ b/config/defconfig_arm-armv7-a-linuxapp-gcc
@@ -58,7 +58,6 @@  CONFIG_XMM_SIZE=16
 
 # fails to compile on ARM
 CONFIG_RTE_LIBRTE_ACL=n
-CONFIG_RTE_LIBRTE_LPM=n
 
 # cannot use those on ARM
 CONFIG_RTE_KNI_KMOD=n
diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
index c299ce2..4619992 100644
--- a/lib/librte_lpm/rte_lpm.h
+++ b/lib/librte_lpm/rte_lpm.h
@@ -47,7 +47,9 @@ 
 #include <rte_byteorder.h>
 #include <rte_memory.h>
 #include <rte_common.h>
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_I686)
 #include <rte_vect.h>
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -358,6 +360,7 @@  rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t * ips,
 	return 0;
 }
 
+#if defined(RTE_ARCH_X86_64) || defined(RTE_ARCH_I686)
 /* Mask four results. */
 #define	 RTE_LPM_MASKX4_RES	UINT64_C(0x00ff00ff00ff00ff)
 
@@ -472,6 +475,74 @@  rte_lpm_lookupx4(const struct rte_lpm *lpm, __m128i ip, uint16_t hop[4],
 	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[2] : defv;
 	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[3] : defv;
 }
+#else
+// TODO: this code should be reworked.
+
+/*
+ * Stand-in for the x86 SSE __m128i type, used only when the build is not
+ * RTE_ARCH_X86_64 / RTE_ARCH_I686. It is 16 bytes of plain storage that can
+ * be viewed either as bytes or as four 32-bit lanes.
+ *
+ * NOTE(review): __m128i is an implementation-reserved identifier; this
+ * definition presumably clashes with any header that already provides the
+ * type - confirm before including such headers alongside this one.
+ */
+typedef struct {
+	union uint128 {
+		/* Byte-wise view of the 128-bit value. */
+		uint8_t uint8[16];
+		/* 32-bit lane view; rte_lpm_lookupx4() reads the IPs from here. */
+		uint32_t uint32[4];
+	} val;
+} __m128i;
+
+/**
+ * Scalar emulation of the SSE2 intrinsic _mm_set_epi32().
+ *
+ * As with the x86 intrinsic, the arguments are listed from the highest
+ * 32-bit lane down to the lowest, so the LAST argument ends up in lane 0.
+ * Code written for x86, e.g. _mm_set_epi32(ip3, ip2, ip1, ip0), therefore
+ * produces the same lane layout here and rte_lpm_lookupx4() returns the
+ * results in the same hop[] order as on x86.
+ *
+ * @param v0
+ *   Value stored in lane 3 (highest lane).
+ * @param v1
+ *   Value stored in lane 2.
+ * @param v2
+ *   Value stored in lane 1.
+ * @param v3
+ *   Value stored in lane 0 (lowest lane).
+ * @return
+ *   Emulated 128-bit vector holding the four values.
+ */
+static inline __m128i
+_mm_set_epi32(uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3)
+{
+	__m128i res;
+
+	/* First argument is the highest lane, matching the x86 intrinsic. */
+	res.val.uint32[3] = v0;
+	res.val.uint32[2] = v1;
+	res.val.uint32[1] = v2;
+	res.val.uint32[0] = v3;
+	return res;
+}
+
+/**
+ * Scalar emulation of the SSE2 intrinsic _mm_loadu_si128() (unaligned load).
+ *
+ * The source is copied byte by byte, so the pointer does not have to satisfy
+ * the alignment of __m128i. Dereferencing it directly would be undefined
+ * behaviour for misaligned addresses and may fault on some ARM cores.
+ *
+ * @param v
+ *   Address of 16 readable bytes; no alignment requirement.
+ * @return
+ *   Emulated 128-bit vector holding a copy of those 16 bytes.
+ */
+static inline __m128i
+_mm_loadu_si128(const __m128i *v)
+{
+	const uint8_t *src = (const uint8_t *)v;
+	__m128i res;
+	unsigned int i;
+
+	/* Byte access is always alignment-safe. */
+	for (i = 0; i < sizeof(res.val.uint8); i++)
+		res.val.uint8[i] = src[i];
+	return res;
+}
+
+/**
+ * Scalar emulation of the SSE2 intrinsic _mm_load_si128() (aligned load).
+ *
+ * The pointer is const-qualified like the x86 intrinsic's, so callers may
+ * pass read-only data without a cast; non-const pointers are still accepted.
+ *
+ * @param v
+ *   Address of the vector to load; must be suitably aligned for __m128i,
+ *   because it is dereferenced directly.
+ * @return
+ *   Copy of the vector stored at v.
+ */
+static inline __m128i
+_mm_load_si128(const __m128i *v)
+{
+	return *v;
+}
+
+/**
+ * Look up four IP addresses in an LPM table (non-vectorized fallback built
+ * on rte_lpm_lookup_bulk()).
+ *
+ * @param lpm
+ *   LPM object handle.
+ * @param ip
+ *   The four IP addresses to look up, one per 32-bit lane.
+ * @param hop
+ *   Array of four two-byte results, one per looked-up IP.
+ *   If the lookup succeeded for a given IP, the least significant byte of
+ *   the corresponding element is the actual next hop and the most
+ *   significant byte is zero.
+ *   If the lookup failed for a given IP, the corresponding element holds the
+ *   default value; see the description of the next parameter.
+ * @param defv
+ *   Default value written to the corresponding element of hop[] when the
+ *   lookup fails.
+ */
+static inline void
+rte_lpm_lookupx4(const struct rte_lpm *lpm, __m128i ip, uint16_t hop[4],
+	uint16_t defv)
+{
+	unsigned int lane;
+
+	/* The bulk lookup writes its raw per-IP results straight into hop[]. */
+	rte_lpm_lookup_bulk(lpm, ip.val.uint32, hop, 4);
+
+	/* Keep only the next-hop byte on a hit, substitute defv on a miss. */
+	for (lane = 0; lane < 4; lane++) {
+		if (hop[lane] & RTE_LPM_LOOKUP_SUCCESS)
+			hop[lane] = (uint8_t)hop[lane];
+		else
+			hop[lane] = defv;
+	}
+}
+#endif
 
 #ifdef __cplusplus
 }