[dpdk-dev,v4,2/3] lpm: add support for NEON

Message ID 1455280123-9311-3-git-send-email-jerin.jacob@caviumnetworks.com (mailing list archive)
State Changes Requested, archived
Delegated to: Thomas Monjalon
Headers

Commit Message

Jerin Jacob Feb. 12, 2016, 12:28 p.m. UTC
Enabled CONFIG_RTE_LIBRTE_LPM, CONFIG_RTE_LIBRTE_TABLE,
CONFIG_RTE_LIBRTE_PIPELINE libraries for arm and arm64

TABLE, PIPELINE libraries were disabled due to LPM library dependency.

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 app/test/test_xmmt_ops.h                   |  20 ++++
 config/defconfig_arm-armv7a-linuxapp-gcc   |   3 -
 config/defconfig_arm64-armv8a-linuxapp-gcc |   3 -
 lib/librte_lpm/Makefile                    |   4 +
 lib/librte_lpm/rte_lpm.h                   |   4 +
 lib/librte_lpm/rte_lpm_neon.h              | 148 +++++++++++++++++++++++++++++
 6 files changed, 176 insertions(+), 6 deletions(-)
 create mode 100644 lib/librte_lpm/rte_lpm_neon.h
  

Comments

Thomas Monjalon March 1, 2016, 5:46 p.m. UTC | #1
2016-02-12 17:58, Jerin Jacob:
>  # fails to compile on ARM
> -CONFIG_RTE_LIBRTE_LPM=n
> -CONFIG_RTE_LIBRTE_TABLE=n
> -CONFIG_RTE_LIBRTE_PIPELINE=n

The associated examples cannot compile.
Maybe it's too early to enable them.
What about updating the comment to state that only examples fail?

> --- a/lib/librte_lpm/Makefile
> +++ b/lib/librte_lpm/Makefile
> +ifneq ($(filter y,$(CONFIG_RTE_ARCH_ARM) $(CONFIG_RTE_ARCH_ARM64)),)
> +SYMLINK-$(CONFIG_RTE_LIBRTE_LPM)-include += rte_lpm_neon.h

Simpler:
ifneq ($(CONFIG_RTE_ARCH_ARM)$(CONFIG_RTE_ARCH_ARM64),nn)
  
Jerin Jacob March 2, 2016, 6:45 a.m. UTC | #2
On Tue, Mar 01, 2016 at 06:46:04PM +0100, Thomas Monjalon wrote:
> 2016-02-12 17:58, Jerin Jacob:
> >  # fails to compile on ARM
> > -CONFIG_RTE_LIBRTE_LPM=n
> > -CONFIG_RTE_LIBRTE_TABLE=n
> > -CONFIG_RTE_LIBRTE_PIPELINE=n
> 
> The associated examples cannot compile.
> Maybe it's too early to enable them.
> What about updating the comment to state that only examples fail?

Not sure where to comment it though.
The only l3fwd build is failing on arm64 due to insane use of SSE
intrinsics with out proper abstraction in recent l3fwd rework.
l3fwd was building earlier with a minor change; Now it looks like
it needs reasonable cycles to fix it properly.

> 
> > --- a/lib/librte_lpm/Makefile
> > +++ b/lib/librte_lpm/Makefile
> > +ifneq ($(filter y,$(CONFIG_RTE_ARCH_ARM) $(CONFIG_RTE_ARCH_ARM64)),)
> > +SYMLINK-$(CONFIG_RTE_LIBRTE_LPM)-include += rte_lpm_neon.h
> 
> Simpler:
> ifneq ($(CONFIG_RTE_ARCH_ARM)$(CONFIG_RTE_ARCH_ARM64),nn)

I will change it in next version
  

Patch

diff --git a/app/test/test_xmmt_ops.h b/app/test/test_xmmt_ops.h
index c055912..de9c16f 100644
--- a/app/test/test_xmmt_ops.h
+++ b/app/test/test_xmmt_ops.h
@@ -36,6 +36,24 @@ 
 
 #include <rte_vect.h>
 
+#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
+
+/* vect_* abstraction implementation using NEON */
+
+/* loads the xmm_t value from address p(does not need to be 16-byte aligned)*/
+#define vect_loadu_sil128(p) vld1q_s32((const int32_t *)p)
+
+/* sets the 4 signed 32-bit integer values and returns the xmm_t variable */
+static inline xmm_t  __attribute__((always_inline))
+vect_set_epi32(int i3, int i2, int i1, int i0)
+{
+	int32_t data[4] = {i0, i1, i2, i3};
+
+	return vld1q_s32(data);
+}
+
+#elif defined(RTE_ARCH_X86)
+
 /* vect_* abstraction implementation using SSE */
 
 /* loads the xmm_t value from address p(does not need to be 16-byte aligned)*/
@@ -44,4 +62,6 @@ 
 /* sets the 4 signed 32-bit integer values and returns the xmm_t variable */
 #define vect_set_epi32(i3, i2, i1, i0) _mm_set_epi32(i3, i2, i1, i0)
 
+#endif
+
 #endif /* _TEST_XMMT_OPS_H_ */
diff --git a/config/defconfig_arm-armv7a-linuxapp-gcc b/config/defconfig_arm-armv7a-linuxapp-gcc
index cbebd64..efffa1f 100644
--- a/config/defconfig_arm-armv7a-linuxapp-gcc
+++ b/config/defconfig_arm-armv7a-linuxapp-gcc
@@ -53,9 +53,6 @@  CONFIG_RTE_LIBRTE_KNI=n
 CONFIG_RTE_EAL_IGB_UIO=n
 
 # fails to compile on ARM
-CONFIG_RTE_LIBRTE_LPM=n
-CONFIG_RTE_LIBRTE_TABLE=n
-CONFIG_RTE_LIBRTE_PIPELINE=n
 CONFIG_RTE_SCHED_VECTOR=n
 
 # cannot use those on ARM
diff --git a/config/defconfig_arm64-armv8a-linuxapp-gcc b/config/defconfig_arm64-armv8a-linuxapp-gcc
index eacd01c..52e0c97 100644
--- a/config/defconfig_arm64-armv8a-linuxapp-gcc
+++ b/config/defconfig_arm64-armv8a-linuxapp-gcc
@@ -49,7 +49,4 @@  CONFIG_RTE_LIBRTE_IVSHMEM=n
 CONFIG_RTE_LIBRTE_FM10K_PMD=n
 CONFIG_RTE_LIBRTE_I40E_PMD=n
 
-CONFIG_RTE_LIBRTE_LPM=n
-CONFIG_RTE_LIBRTE_TABLE=n
-CONFIG_RTE_LIBRTE_PIPELINE=n
 CONFIG_RTE_SCHED_VECTOR=n
diff --git a/lib/librte_lpm/Makefile b/lib/librte_lpm/Makefile
index ce3a1d1..656ade2 100644
--- a/lib/librte_lpm/Makefile
+++ b/lib/librte_lpm/Makefile
@@ -47,7 +47,11 @@  SRCS-$(CONFIG_RTE_LIBRTE_LPM) := rte_lpm.c rte_lpm6.c
 # install this header file
 SYMLINK-$(CONFIG_RTE_LIBRTE_LPM)-include := rte_lpm.h rte_lpm6.h
 
+ifneq ($(filter y,$(CONFIG_RTE_ARCH_ARM) $(CONFIG_RTE_ARCH_ARM64)),)
+SYMLINK-$(CONFIG_RTE_LIBRTE_LPM)-include += rte_lpm_neon.h
+else ifeq ($(CONFIG_RTE_ARCH_X86),y)
 SYMLINK-$(CONFIG_RTE_LIBRTE_LPM)-include += rte_lpm_sse.h
+endif
 
 # this lib needs eal
 DEPDIRS-$(CONFIG_RTE_LIBRTE_LPM) += lib/librte_eal
diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
index dfe1378..2c34a25 100644
--- a/lib/librte_lpm/rte_lpm.h
+++ b/lib/librte_lpm/rte_lpm.h
@@ -384,7 +384,11 @@  static inline void
 rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint16_t hop[4],
 		 uint16_t defv);
 
+#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
+#include "rte_lpm_neon.h"
+#elif defined(RTE_ARCH_X86)
 #include "rte_lpm_sse.h"
+#endif
 
 #ifdef __cplusplus
 }
diff --git a/lib/librte_lpm/rte_lpm_neon.h b/lib/librte_lpm/rte_lpm_neon.h
new file mode 100644
index 0000000..fcd2a8a
--- /dev/null
+++ b/lib/librte_lpm/rte_lpm_neon.h
@@ -0,0 +1,148 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2015 Cavium Networks. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Derived rte_lpm_lookupx4 implementation from lib/librte_lpm/rte_lpm_sse.h
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium Networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_LPM_NEON_H_
+#define _RTE_LPM_NEON_H_
+
+#include <rte_branch_prediction.h>
+#include <rte_byteorder.h>
+#include <rte_memory.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static inline void
+rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint16_t hop[4],
+		 uint16_t defv)
+{
+	uint32x4_t i24;
+	rte_xmm_t i8;
+	uint16_t tbl[4];
+	uint64_t idx, pt;
+
+	const uint32_t mask = UINT8_MAX;
+	const int32x4_t mask8 = vdupq_n_s32(mask);
+
+	/*
+	 * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 4 LPM entries
+	 * as one 64-bit value (0x0300030003000300).
+	 */
+	const uint64_t mask_xv =
+		((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
+		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 16 |
+		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32 |
+		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 48);
+
+	/*
+	 * RTE_LPM_LOOKUP_SUCCESS for 4 LPM entries
+	 * as one 64-bit value (0x0100010001000100).
+	 */
+	const uint64_t mask_v =
+		((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
+		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 16 |
+		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32 |
+		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 48);
+
+	/* get 4 indexes for tbl24[]. */
+	i24 = vshrq_n_u32((uint32x4_t)ip, CHAR_BIT);
+
+	/* extract values from tbl24[] */
+	idx = vgetq_lane_u64((uint64x2_t)i24, 0);
+
+	tbl[0] = *(const uint16_t *)&lpm->tbl24[(uint32_t)idx];
+	tbl[1] = *(const uint16_t *)&lpm->tbl24[idx >> 32];
+
+	idx = vgetq_lane_u64((uint64x2_t)i24, 1);
+
+	tbl[2] = *(const uint16_t *)&lpm->tbl24[(uint32_t)idx];
+	tbl[3] = *(const uint16_t *)&lpm->tbl24[idx >> 32];
+
+	/* get 4 indexes for tbl8[]. */
+	i8.x = vandq_s32(ip, mask8);
+
+	pt = (uint64_t)tbl[0] |
+		(uint64_t)tbl[1] << 16 |
+		(uint64_t)tbl[2] << 32 |
+		(uint64_t)tbl[3] << 48;
+
+	/* search successfully finished for all 4 IP addresses. */
+	if (likely((pt & mask_xv) == mask_v)) {
+		uintptr_t ph = (uintptr_t)hop;
+		*(uint64_t *)ph = pt & RTE_LPM_MASKX4_RES;
+		return;
+	}
+
+	if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[0] = i8.u32[0] +
+			(uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		tbl[0] = *(const uint16_t *)&lpm->tbl8[i8.u32[0]];
+	}
+	if (unlikely((pt >> 16 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[1] = i8.u32[1] +
+			(uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		tbl[1] = *(const uint16_t *)&lpm->tbl8[i8.u32[1]];
+	}
+	if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[2] = i8.u32[2] +
+			(uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		tbl[2] = *(const uint16_t *)&lpm->tbl8[i8.u32[2]];
+	}
+	if (unlikely((pt >> 48 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[3] = i8.u32[3] +
+			(uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		tbl[3] = *(const uint16_t *)&lpm->tbl8[i8.u32[3]];
+	}
+
+	hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[0] : defv;
+	hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[1] : defv;
+	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[2] : defv;
+	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[3] : defv;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_LPM_NEON_H_ */