[dpdk-dev,v2,1/3] lpm: make rte_lpm_lookupx4 API definition architecture agnostic

Message ID 1449242086-19051-2-git-send-email-jerin.jacob@caviumnetworks.com (mailing list archive)
State Superseded, archived
Headers

Commit Message

Jerin Jacob Dec. 4, 2015, 3:14 p.m. UTC
-Used architecture agnostic xmm_t to represent 128 bit SIMD variable

-Introduced vect_* API abstraction in app/test to test rte_lpm_lookupx4
API in  architecture agnostic way

-Moved rte_lpm_lookupx4 SSE implementation to architecture specific
rte_lpm_sse.h file to accommodate new rte_lpm_lookupx4 implementation
for a different architecture.

Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 app/test/test_lpm.c          |  21 ++++---
 app/test/test_xmmt_ops.h     |  47 ++++++++++++++
 lib/librte_lpm/Makefile      |   2 +
 lib/librte_lpm/rte_lpm.h     |  93 +---------------------------
 lib/librte_lpm/rte_lpm_sse.h | 143 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 206 insertions(+), 100 deletions(-)
 create mode 100644 app/test/test_xmmt_ops.h
 create mode 100644 lib/librte_lpm/rte_lpm_sse.h
  

Comments

Jianbo Liu Dec. 7, 2015, 6:15 a.m. UTC | #1
On 4 December 2015 at 23:14, Jerin Jacob <jerin.jacob@caviumnetworks.com> wrote:
> -Used architecture agnostic xmm_t to represent 128 bit SIMD variable
>
> -Introduced vect_* API abstraction in app/test to test rte_lpm_lookupx4
> API in  architecture agnostic way
>
> -Moved rte_lpm_lookupx4 SSE implementation to architecture specific
> rte_lpm_sse.h file to accommodate new rte_lpm_lookupx4 implementation
> for a different architecture.
>
> Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> ---
>  app/test/test_lpm.c          |  21 ++++---
>  app/test/test_xmmt_ops.h     |  47 ++++++++++++++
>  lib/librte_lpm/Makefile      |   2 +
>  lib/librte_lpm/rte_lpm.h     |  93 +---------------------------
>  lib/librte_lpm/rte_lpm_sse.h | 143 +++++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 206 insertions(+), 100 deletions(-)
>  create mode 100644 app/test/test_xmmt_ops.h
>  create mode 100644 lib/librte_lpm/rte_lpm_sse.h
>
> diff --git a/app/test/test_lpm.c b/app/test/test_lpm.c
> index 8b4ded9..59674f1 100644
> --- a/app/test/test_lpm.c
> +++ b/app/test/test_lpm.c
> @@ -49,6 +49,7 @@
>
>  #include "rte_lpm.h"
>  #include "test_lpm_routes.h"
> +#include "test_xmmt_ops.h"
>
>  #define TEST_LPM_ASSERT(cond) do {                                            \
>         if (!(cond)) {                                                        \
> @@ -308,7 +309,7 @@ test6(void)
>  int32_t
>  test7(void)
>  {
> -       __m128i ipx4;
> +       xmm_t ipx4;
>         uint16_t hop[4];
>         struct rte_lpm *lpm = NULL;
>         uint32_t ip = IPv4(0, 0, 0, 0);
> @@ -324,7 +325,7 @@ test7(void)
>         status = rte_lpm_lookup(lpm, ip, &next_hop_return);
>         TEST_LPM_ASSERT((status == 0) && (next_hop_return == next_hop_add));
>
> -       ipx4 = _mm_set_epi32(ip, ip + 0x100, ip - 0x100, ip);
> +       ipx4 = vect_set_epi32(ip, ip + 0x100, ip - 0x100, ip);
>         rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX);
>         TEST_LPM_ASSERT(hop[0] == next_hop_add);
>         TEST_LPM_ASSERT(hop[1] == UINT16_MAX);
> @@ -354,7 +355,7 @@ test7(void)
>  int32_t
>  test8(void)
>  {
> -       __m128i ipx4;
> +       xmm_t ipx4;
>         uint16_t hop[4];
>         struct rte_lpm *lpm = NULL;
>         uint32_t ip1 = IPv4(127, 255, 255, 255), ip2 = IPv4(128, 0, 0, 0);
> @@ -380,7 +381,7 @@ test8(void)
>                 TEST_LPM_ASSERT((status == 0) &&
>                         (next_hop_return == next_hop_add));
>
> -               ipx4 = _mm_set_epi32(ip2, ip1, ip2, ip1);
> +               ipx4 = vect_set_epi32(ip2, ip1, ip2, ip1);
>                 rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX);
>                 TEST_LPM_ASSERT(hop[0] == UINT16_MAX);
>                 TEST_LPM_ASSERT(hop[1] == next_hop_add);
> @@ -408,7 +409,7 @@ test8(void)
>                 status = rte_lpm_lookup(lpm, ip1, &next_hop_return);
>                 TEST_LPM_ASSERT(status == -ENOENT);
>
> -               ipx4 = _mm_set_epi32(ip1, ip1, ip2, ip2);
> +               ipx4 = vect_set_epi32(ip1, ip1, ip2, ip2);
>                 rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX);
>                 if (depth != 1) {
>                         TEST_LPM_ASSERT(hop[0] == next_hop_add);
> @@ -850,7 +851,7 @@ test11(void)
>  int32_t
>  test12(void)
>  {
> -       __m128i ipx4;
> +       xmm_t ipx4;
>         uint16_t hop[4];
>         struct rte_lpm *lpm = NULL;
>         uint32_t ip, i;
> @@ -872,7 +873,7 @@ test12(void)
>                 TEST_LPM_ASSERT((status == 0) &&
>                                 (next_hop_return == next_hop_add));
>
> -               ipx4 = _mm_set_epi32(ip, ip + 1, ip, ip - 1);
> +               ipx4 = vect_set_epi32(ip, ip + 1, ip, ip - 1);
>                 rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX);
>                 TEST_LPM_ASSERT(hop[0] == UINT16_MAX);
>                 TEST_LPM_ASSERT(hop[1] == next_hop_add);
> @@ -1289,10 +1290,10 @@ perf_test(void)
>                 begin = rte_rdtsc();
>                 for (j = 0; j < BATCH_SIZE; j += RTE_DIM(next_hops)) {
>                         unsigned k;
> -                       __m128i ipx4;
> +                       xmm_t ipx4;
>
> -                       ipx4 = _mm_loadu_si128((__m128i *)(ip_batch + j));
> -                       ipx4 = *(__m128i *)(ip_batch + j);
> +                       ipx4 = vect_loadu_sil128((xmm_t *)(ip_batch + j));
> +                       ipx4 = *(xmm_t *)(ip_batch + j);
>                         rte_lpm_lookupx4(lpm, ipx4, next_hops, UINT16_MAX);
>                         for (k = 0; k < RTE_DIM(next_hops); k++)
>                                 if (unlikely(next_hops[k] == UINT16_MAX))
> diff --git a/app/test/test_xmmt_ops.h b/app/test/test_xmmt_ops.h
> new file mode 100644
> index 0000000..c055912
> --- /dev/null
> +++ b/app/test/test_xmmt_ops.h
Why add this new file under app/test, which is only for test app?
Should vect_loadu_sil128/vect_set_epi32 be in each ARCH's rte_vect.h?

> @@ -0,0 +1,47 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2015 Cavium Networks. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Cavium Networks nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _TEST_XMMT_OPS_H_
> +#define _TEST_XMMT_OPS_H_
> +
> +#include <rte_vect.h>
> +
> +/* vect_* abstraction implementation using SSE */
> +
> +/* loads the xmm_t value from address p(does not need to be 16-byte aligned)*/
> +#define vect_loadu_sil128(p) _mm_loadu_si128(p)
> +
> +/* sets the 4 signed 32-bit integer values and returns the xmm_t variable */
> +#define vect_set_epi32(i3, i2, i1, i0) _mm_set_epi32(i3, i2, i1, i0)
> +
> +#endif /* _TEST_XMMT_OPS_H_ */
> diff --git a/lib/librte_lpm/Makefile b/lib/librte_lpm/Makefile
> index 688cfc9..ce3a1d1 100644
> --- a/lib/librte_lpm/Makefile
> +++ b/lib/librte_lpm/Makefile
> @@ -47,6 +47,8 @@ SRCS-$(CONFIG_RTE_LIBRTE_LPM) := rte_lpm.c rte_lpm6.c
>  # install this header file
>  SYMLINK-$(CONFIG_RTE_LIBRTE_LPM)-include := rte_lpm.h rte_lpm6.h
>
> +SYMLINK-$(CONFIG_RTE_LIBRTE_LPM)-include += rte_lpm_sse.h
> +
>  # this lib needs eal
>  DEPDIRS-$(CONFIG_RTE_LIBRTE_LPM) += lib/librte_eal
>
> diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
> index c299ce2..dfe1378 100644
> --- a/lib/librte_lpm/rte_lpm.h
> +++ b/lib/librte_lpm/rte_lpm.h
> @@ -381,97 +381,10 @@ rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t * ips,
>   *   if lookup would fail.
>   */
>  static inline void
> -rte_lpm_lookupx4(const struct rte_lpm *lpm, __m128i ip, uint16_t hop[4],
> -       uint16_t defv)
> -{
> -       __m128i i24;
> -       rte_xmm_t i8;
> -       uint16_t tbl[4];
> -       uint64_t idx, pt;
> +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint16_t hop[4],
> +                uint16_t defv);
>
> -       const __m128i mask8 =
> -               _mm_set_epi32(UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX);
> -
> -       /*
> -        * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 4 LPM entries
> -        * as one 64-bit value (0x0300030003000300).
> -        */
> -       const uint64_t mask_xv =
> -               ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
> -               (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 16 |
> -               (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32 |
> -               (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 48);
> -
> -       /*
> -        * RTE_LPM_LOOKUP_SUCCESS for 4 LPM entries
> -        * as one 64-bit value (0x0100010001000100).
> -        */
> -       const uint64_t mask_v =
> -               ((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
> -               (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 16 |
> -               (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32 |
> -               (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 48);
> -
> -       /* get 4 indexes for tbl24[]. */
> -       i24 = _mm_srli_epi32(ip, CHAR_BIT);
> -
> -       /* extract values from tbl24[] */
> -       idx = _mm_cvtsi128_si64(i24);
> -       i24 = _mm_srli_si128(i24, sizeof(uint64_t));
> -
> -       tbl[0] = *(const uint16_t *)&lpm->tbl24[(uint32_t)idx];
> -       tbl[1] = *(const uint16_t *)&lpm->tbl24[idx >> 32];
> -
> -       idx = _mm_cvtsi128_si64(i24);
> -
> -       tbl[2] = *(const uint16_t *)&lpm->tbl24[(uint32_t)idx];
> -       tbl[3] = *(const uint16_t *)&lpm->tbl24[idx >> 32];
> -
> -       /* get 4 indexes for tbl8[]. */
> -       i8.x = _mm_and_si128(ip, mask8);
> -
> -       pt = (uint64_t)tbl[0] |
> -               (uint64_t)tbl[1] << 16 |
> -               (uint64_t)tbl[2] << 32 |
> -               (uint64_t)tbl[3] << 48;
> -
> -       /* search successfully finished for all 4 IP addresses. */
> -       if (likely((pt & mask_xv) == mask_v)) {
> -               uintptr_t ph = (uintptr_t)hop;
> -               *(uint64_t *)ph = pt & RTE_LPM_MASKX4_RES;
> -               return;
> -       }
> -
> -       if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> -                       RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> -               i8.u32[0] = i8.u32[0] +
> -                       (uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> -               tbl[0] = *(const uint16_t *)&lpm->tbl8[i8.u32[0]];
> -       }
> -       if (unlikely((pt >> 16 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> -                       RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> -               i8.u32[1] = i8.u32[1] +
> -                       (uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> -               tbl[1] = *(const uint16_t *)&lpm->tbl8[i8.u32[1]];
> -       }
> -       if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> -                       RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> -               i8.u32[2] = i8.u32[2] +
> -                       (uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> -               tbl[2] = *(const uint16_t *)&lpm->tbl8[i8.u32[2]];
> -       }
> -       if (unlikely((pt >> 48 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> -                       RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> -               i8.u32[3] = i8.u32[3] +
> -                       (uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> -               tbl[3] = *(const uint16_t *)&lpm->tbl8[i8.u32[3]];
> -       }
> -
> -       hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[0] : defv;
> -       hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[1] : defv;
> -       hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[2] : defv;
> -       hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[3] : defv;
> -}
> +#include "rte_lpm_sse.h"
>
>  #ifdef __cplusplus
>  }
> diff --git a/lib/librte_lpm/rte_lpm_sse.h b/lib/librte_lpm/rte_lpm_sse.h
> new file mode 100644
> index 0000000..2b7eeec
> --- /dev/null
> +++ b/lib/librte_lpm/rte_lpm_sse.h
> @@ -0,0 +1,143 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _RTE_LPM_SSE_H_
> +#define _RTE_LPM_SSE_H_
> +
> +#include <rte_branch_prediction.h>
> +#include <rte_byteorder.h>
> +#include <rte_common.h>
> +#include <rte_vect.h>
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +static inline void
> +rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint16_t hop[4],
> +                uint16_t defv)
> +{
> +       __m128i i24;
> +       rte_xmm_t i8;
> +       uint16_t tbl[4];
> +       uint64_t idx, pt;
> +
> +       const __m128i mask8 =
> +               _mm_set_epi32(UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX);
> +
> +       /*
> +        * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 4 LPM entries
> +        * as one 64-bit value (0x0300030003000300).
> +        */
> +       const uint64_t mask_xv =
> +               ((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
> +               (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 16 |
> +               (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32 |
> +               (uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 48);
> +
> +       /*
> +        * RTE_LPM_LOOKUP_SUCCESS for 4 LPM entries
> +        * as one 64-bit value (0x0100010001000100).
> +        */
> +       const uint64_t mask_v =
> +               ((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
> +               (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 16 |
> +               (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32 |
> +               (uint64_t)RTE_LPM_LOOKUP_SUCCESS << 48);
> +
> +       /* get 4 indexes for tbl24[]. */
> +       i24 = _mm_srli_epi32(ip, CHAR_BIT);
> +
> +       /* extract values from tbl24[] */
> +       idx = _mm_cvtsi128_si64(i24);
> +       i24 = _mm_srli_si128(i24, sizeof(uint64_t));
> +
> +       tbl[0] = *(const uint16_t *)&lpm->tbl24[(uint32_t)idx];
> +       tbl[1] = *(const uint16_t *)&lpm->tbl24[idx >> 32];
> +
> +       idx = _mm_cvtsi128_si64(i24);
> +
> +       tbl[2] = *(const uint16_t *)&lpm->tbl24[(uint32_t)idx];
> +       tbl[3] = *(const uint16_t *)&lpm->tbl24[idx >> 32];
> +
> +       /* get 4 indexes for tbl8[]. */
> +       i8.x = _mm_and_si128(ip, mask8);
> +
> +       pt = (uint64_t)tbl[0] |
> +               (uint64_t)tbl[1] << 16 |
> +               (uint64_t)tbl[2] << 32 |
> +               (uint64_t)tbl[3] << 48;
> +
> +       /* search successfully finished for all 4 IP addresses. */
> +       if (likely((pt & mask_xv) == mask_v)) {
> +               uintptr_t ph = (uintptr_t)hop;
> +               *(uint64_t *)ph = pt & RTE_LPM_MASKX4_RES;
> +               return;
> +       }
> +
> +       if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> +                       RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> +               i8.u32[0] = i8.u32[0] +
> +                       (uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> +               tbl[0] = *(const uint16_t *)&lpm->tbl8[i8.u32[0]];
> +       }
> +       if (unlikely((pt >> 16 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> +                       RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> +               i8.u32[1] = i8.u32[1] +
> +                       (uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> +               tbl[1] = *(const uint16_t *)&lpm->tbl8[i8.u32[1]];
> +       }
> +       if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> +                       RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> +               i8.u32[2] = i8.u32[2] +
> +                       (uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> +               tbl[2] = *(const uint16_t *)&lpm->tbl8[i8.u32[2]];
> +       }
> +       if (unlikely((pt >> 48 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> +                       RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> +               i8.u32[3] = i8.u32[3] +
> +                       (uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> +               tbl[3] = *(const uint16_t *)&lpm->tbl8[i8.u32[3]];
> +       }
> +
> +       hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[0] : defv;
> +       hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[1] : defv;
> +       hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[2] : defv;
> +       hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[3] : defv;
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_LPM_SSE_H_ */
> --
> 2.1.0
>
  
Jerin Jacob Dec. 7, 2015, 6:57 a.m. UTC | #2
On Mon, Dec 07, 2015 at 02:15:28PM +0800, Jianbo Liu wrote:
> On 4 December 2015 at 23:14, Jerin Jacob <jerin.jacob@caviumnetworks.com> wrote:
> > -Used architecture agnostic xmm_t to represent 128 bit SIMD variable
> >
> > -Introduced vect_* API abstraction in app/test to test rte_lpm_lookupx4
> > API in  architecture agnostic way
> >
> > -Moved rte_lpm_lookupx4 SSE implementation to architecture specific
> > rte_lpm_sse.h file to accommodate new rte_lpm_lookupx4 implementation
> > for a different architecture.
> >
> > Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> > ---
> >  app/test/test_lpm.c          |  21 ++++---
> >  app/test/test_xmmt_ops.h     |  47 ++++++++++++++
> >  lib/librte_lpm/Makefile      |   2 +
> >  lib/librte_lpm/rte_lpm.h     |  93 +---------------------------
> >  lib/librte_lpm/rte_lpm_sse.h | 143 +++++++++++++++++++++++++++++++++++++++++++
> >  5 files changed, 206 insertions(+), 100 deletions(-)
> >  create mode 100644 app/test/test_xmmt_ops.h
> >  create mode 100644 lib/librte_lpm/rte_lpm_sse.h
> >
> > diff --git a/app/test/test_lpm.c b/app/test/test_lpm.c
> > index 8b4ded9..59674f1 100644
> > --- a/app/test/test_lpm.c
> > +++ b/app/test/test_lpm.c
> > @@ -49,6 +49,7 @@
> >
> >  #include "rte_lpm.h"
> >  #include "test_lpm_routes.h"
> > +#include "test_xmmt_ops.h"
> >
> >  #define TEST_LPM_ASSERT(cond) do {                                            \
> >         if (!(cond)) {                                                        \
> > @@ -308,7 +309,7 @@ test6(void)
> >  int32_t
> >  test7(void)
> >  {
> > -       __m128i ipx4;
> > +       xmm_t ipx4;
> >         uint16_t hop[4];
> >         struct rte_lpm *lpm = NULL;
> >         uint32_t ip = IPv4(0, 0, 0, 0);
> > @@ -324,7 +325,7 @@ test7(void)
> >         status = rte_lpm_lookup(lpm, ip, &next_hop_return);
> >         TEST_LPM_ASSERT((status == 0) && (next_hop_return == next_hop_add));
> >
> > -       ipx4 = _mm_set_epi32(ip, ip + 0x100, ip - 0x100, ip);
> > +       ipx4 = vect_set_epi32(ip, ip + 0x100, ip - 0x100, ip);
> >         rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX);
> >         TEST_LPM_ASSERT(hop[0] == next_hop_add);
> >         TEST_LPM_ASSERT(hop[1] == UINT16_MAX);
> > @@ -354,7 +355,7 @@ test7(void)
> >  int32_t
> >  test8(void)
> >  {
> > -       __m128i ipx4;
> > +       xmm_t ipx4;
> >         uint16_t hop[4];
> >         struct rte_lpm *lpm = NULL;
> >         uint32_t ip1 = IPv4(127, 255, 255, 255), ip2 = IPv4(128, 0, 0, 0);
> > @@ -380,7 +381,7 @@ test8(void)
> >                 TEST_LPM_ASSERT((status == 0) &&
> >                         (next_hop_return == next_hop_add));
> >
> > -               ipx4 = _mm_set_epi32(ip2, ip1, ip2, ip1);
> > +               ipx4 = vect_set_epi32(ip2, ip1, ip2, ip1);
> >                 rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX);
> >                 TEST_LPM_ASSERT(hop[0] == UINT16_MAX);
> >                 TEST_LPM_ASSERT(hop[1] == next_hop_add);
> > @@ -408,7 +409,7 @@ test8(void)
> >                 status = rte_lpm_lookup(lpm, ip1, &next_hop_return);
> >                 TEST_LPM_ASSERT(status == -ENOENT);
> >
> > -               ipx4 = _mm_set_epi32(ip1, ip1, ip2, ip2);
> > +               ipx4 = vect_set_epi32(ip1, ip1, ip2, ip2);
> >                 rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX);
> >                 if (depth != 1) {
> >                         TEST_LPM_ASSERT(hop[0] == next_hop_add);
> > @@ -850,7 +851,7 @@ test11(void)
> >  int32_t
> >  test12(void)
> >  {
> > -       __m128i ipx4;
> > +       xmm_t ipx4;
> >         uint16_t hop[4];
> >         struct rte_lpm *lpm = NULL;
> >         uint32_t ip, i;
> > @@ -872,7 +873,7 @@ test12(void)
> >                 TEST_LPM_ASSERT((status == 0) &&
> >                                 (next_hop_return == next_hop_add));
> >
> > -               ipx4 = _mm_set_epi32(ip, ip + 1, ip, ip - 1);
> > +               ipx4 = vect_set_epi32(ip, ip + 1, ip, ip - 1);
> >                 rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX);
> >                 TEST_LPM_ASSERT(hop[0] == UINT16_MAX);
> >                 TEST_LPM_ASSERT(hop[1] == next_hop_add);
> > @@ -1289,10 +1290,10 @@ perf_test(void)
> >                 begin = rte_rdtsc();
> >                 for (j = 0; j < BATCH_SIZE; j += RTE_DIM(next_hops)) {
> >                         unsigned k;
> > -                       __m128i ipx4;
> > +                       xmm_t ipx4;
> >
> > -                       ipx4 = _mm_loadu_si128((__m128i *)(ip_batch + j));
> > -                       ipx4 = *(__m128i *)(ip_batch + j);
> > +                       ipx4 = vect_loadu_sil128((xmm_t *)(ip_batch + j));
> > +                       ipx4 = *(xmm_t *)(ip_batch + j);
> >                         rte_lpm_lookupx4(lpm, ipx4, next_hops, UINT16_MAX);
> >                         for (k = 0; k < RTE_DIM(next_hops); k++)
> >                                 if (unlikely(next_hops[k] == UINT16_MAX))
> > diff --git a/app/test/test_xmmt_ops.h b/app/test/test_xmmt_ops.h
> > new file mode 100644
> > index 0000000..c055912
> > --- /dev/null
> > +++ b/app/test/test_xmmt_ops.h
> Why add this new file under app/test, which is only for test app?
> Should vect_loadu_sil128/vect_set_epi32 be in each ARCH's rte_vect.h?
>

V1 was like that. I thought of moving the file under app/test because:

1) Not every ARCH can provide an implementation of the vector primitives
(architectures such as ppc64 and tile do not support them), so moving
them to EAL may not be a good idea.
2) The scope of the vector abstraction is only for using the API (i.e. the
test app), NOT for implementing the library. So it boils down to
load/store/set and should not go beyond that.
I am also afraid that opening up an EAL abstraction would change that scope,
and that using it to emulate the logic in the library would have performance
implications.
3) It has been discussed, and there was no disagreement on this:
http://dpdk.org/ml/archives/dev/2015-December/029404.html

Thanks,
Jerin

[snip]
  
Ananyev, Konstantin Dec. 7, 2015, 2:06 p.m. UTC | #3
> From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> Sent: Friday, December 04, 2015 3:15 PM
> To: dev@dpdk.org
> Cc: thomas.monjalon@6wind.com; Ananyev, Konstantin; viktorin@rehivetech.com; jianbo.liu@linaro.org; Jerin Jacob
> Subject: [dpdk-dev] [PATCH v2 1/3] lpm: make rte_lpm_lookupx4 API definition architecture agnostic
> 
> -Used architecture agnostic xmm_t to represent 128 bit SIMD variable
> 
> -Introduced vect_* API abstraction in app/test to test rte_lpm_lookupx4
> API in  architecture agnostic way
> 
> -Moved rte_lpm_lookupx4 SSE implementation to architecture specific
> rte_lpm_sse.h file to accommodate new rte_lpm_lookupx4 implementation
> for a different architecture.
> 
> Signed-off-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> ---

Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
  

Patch

diff --git a/app/test/test_lpm.c b/app/test/test_lpm.c
index 8b4ded9..59674f1 100644
--- a/app/test/test_lpm.c
+++ b/app/test/test_lpm.c
@@ -49,6 +49,7 @@ 
 
 #include "rte_lpm.h"
 #include "test_lpm_routes.h"
+#include "test_xmmt_ops.h"
 
 #define TEST_LPM_ASSERT(cond) do {                                            \
 	if (!(cond)) {                                                        \
@@ -308,7 +309,7 @@  test6(void)
 int32_t
 test7(void)
 {
-	__m128i ipx4;
+	xmm_t ipx4;
 	uint16_t hop[4];
 	struct rte_lpm *lpm = NULL;
 	uint32_t ip = IPv4(0, 0, 0, 0);
@@ -324,7 +325,7 @@  test7(void)
 	status = rte_lpm_lookup(lpm, ip, &next_hop_return);
 	TEST_LPM_ASSERT((status == 0) && (next_hop_return == next_hop_add));
 
-	ipx4 = _mm_set_epi32(ip, ip + 0x100, ip - 0x100, ip);
+	ipx4 = vect_set_epi32(ip, ip + 0x100, ip - 0x100, ip);
 	rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX);
 	TEST_LPM_ASSERT(hop[0] == next_hop_add);
 	TEST_LPM_ASSERT(hop[1] == UINT16_MAX);
@@ -354,7 +355,7 @@  test7(void)
 int32_t
 test8(void)
 {
-	__m128i ipx4;
+	xmm_t ipx4;
 	uint16_t hop[4];
 	struct rte_lpm *lpm = NULL;
 	uint32_t ip1 = IPv4(127, 255, 255, 255), ip2 = IPv4(128, 0, 0, 0);
@@ -380,7 +381,7 @@  test8(void)
 		TEST_LPM_ASSERT((status == 0) &&
 			(next_hop_return == next_hop_add));
 
-		ipx4 = _mm_set_epi32(ip2, ip1, ip2, ip1);
+		ipx4 = vect_set_epi32(ip2, ip1, ip2, ip1);
 		rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX);
 		TEST_LPM_ASSERT(hop[0] == UINT16_MAX);
 		TEST_LPM_ASSERT(hop[1] == next_hop_add);
@@ -408,7 +409,7 @@  test8(void)
 		status = rte_lpm_lookup(lpm, ip1, &next_hop_return);
 		TEST_LPM_ASSERT(status == -ENOENT);
 
-		ipx4 = _mm_set_epi32(ip1, ip1, ip2, ip2);
+		ipx4 = vect_set_epi32(ip1, ip1, ip2, ip2);
 		rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX);
 		if (depth != 1) {
 			TEST_LPM_ASSERT(hop[0] == next_hop_add);
@@ -850,7 +851,7 @@  test11(void)
 int32_t
 test12(void)
 {
-	__m128i ipx4;
+	xmm_t ipx4;
 	uint16_t hop[4];
 	struct rte_lpm *lpm = NULL;
 	uint32_t ip, i;
@@ -872,7 +873,7 @@  test12(void)
 		TEST_LPM_ASSERT((status == 0) &&
 				(next_hop_return == next_hop_add));
 
-		ipx4 = _mm_set_epi32(ip, ip + 1, ip, ip - 1);
+		ipx4 = vect_set_epi32(ip, ip + 1, ip, ip - 1);
 		rte_lpm_lookupx4(lpm, ipx4, hop, UINT16_MAX);
 		TEST_LPM_ASSERT(hop[0] == UINT16_MAX);
 		TEST_LPM_ASSERT(hop[1] == next_hop_add);
@@ -1289,10 +1290,10 @@  perf_test(void)
 		begin = rte_rdtsc();
 		for (j = 0; j < BATCH_SIZE; j += RTE_DIM(next_hops)) {
 			unsigned k;
-			__m128i ipx4;
+			xmm_t ipx4;
 
-			ipx4 = _mm_loadu_si128((__m128i *)(ip_batch + j));
-			ipx4 = *(__m128i *)(ip_batch + j);
+			ipx4 = vect_loadu_sil128((xmm_t *)(ip_batch + j));
+			ipx4 = *(xmm_t *)(ip_batch + j);
 			rte_lpm_lookupx4(lpm, ipx4, next_hops, UINT16_MAX);
 			for (k = 0; k < RTE_DIM(next_hops); k++)
 				if (unlikely(next_hops[k] == UINT16_MAX))
diff --git a/app/test/test_xmmt_ops.h b/app/test/test_xmmt_ops.h
new file mode 100644
index 0000000..c055912
--- /dev/null
+++ b/app/test/test_xmmt_ops.h
@@ -0,0 +1,47 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2015 Cavium Networks. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Cavium Networks nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TEST_XMMT_OPS_H_
+#define _TEST_XMMT_OPS_H_
+
+#include <rte_vect.h>
+
+/* vect_* abstraction implementation using SSE */
+
+/* loads the xmm_t value from address p(does not need to be 16-byte aligned)*/
+#define vect_loadu_sil128(p) _mm_loadu_si128(p)
+
+/* sets the 4 signed 32-bit integer values and returns the xmm_t variable */
+#define vect_set_epi32(i3, i2, i1, i0) _mm_set_epi32(i3, i2, i1, i0)
+
+#endif /* _TEST_XMMT_OPS_H_ */
diff --git a/lib/librte_lpm/Makefile b/lib/librte_lpm/Makefile
index 688cfc9..ce3a1d1 100644
--- a/lib/librte_lpm/Makefile
+++ b/lib/librte_lpm/Makefile
@@ -47,6 +47,8 @@  SRCS-$(CONFIG_RTE_LIBRTE_LPM) := rte_lpm.c rte_lpm6.c
 # install this header file
 SYMLINK-$(CONFIG_RTE_LIBRTE_LPM)-include := rte_lpm.h rte_lpm6.h
 
+SYMLINK-$(CONFIG_RTE_LIBRTE_LPM)-include += rte_lpm_sse.h
+
 # this lib needs eal
 DEPDIRS-$(CONFIG_RTE_LIBRTE_LPM) += lib/librte_eal
 
diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
index c299ce2..dfe1378 100644
--- a/lib/librte_lpm/rte_lpm.h
+++ b/lib/librte_lpm/rte_lpm.h
@@ -381,97 +381,10 @@  rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t * ips,
  *   if lookup would fail.
  */
 static inline void
-rte_lpm_lookupx4(const struct rte_lpm *lpm, __m128i ip, uint16_t hop[4],
-	uint16_t defv)
-{
-	__m128i i24;
-	rte_xmm_t i8;
-	uint16_t tbl[4];
-	uint64_t idx, pt;
+rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint16_t hop[4],
+		 uint16_t defv);
 
-	const __m128i mask8 =
-		_mm_set_epi32(UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX);
-
-	/*
-	 * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 4 LPM entries
-	 * as one 64-bit value (0x0300030003000300).
-	 */
-	const uint64_t mask_xv =
-		((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
-		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 16 |
-		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32 |
-		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 48);
-
-	/*
-	 * RTE_LPM_LOOKUP_SUCCESS for 4 LPM entries
-	 * as one 64-bit value (0x0100010001000100).
-	 */
-	const uint64_t mask_v =
-		((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
-		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 16 |
-		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32 |
-		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 48);
-
-	/* get 4 indexes for tbl24[]. */
-	i24 = _mm_srli_epi32(ip, CHAR_BIT);
-
-	/* extract values from tbl24[] */
-	idx = _mm_cvtsi128_si64(i24);
-	i24 = _mm_srli_si128(i24, sizeof(uint64_t));
-
-	tbl[0] = *(const uint16_t *)&lpm->tbl24[(uint32_t)idx];
-	tbl[1] = *(const uint16_t *)&lpm->tbl24[idx >> 32];
-
-	idx = _mm_cvtsi128_si64(i24);
-
-	tbl[2] = *(const uint16_t *)&lpm->tbl24[(uint32_t)idx];
-	tbl[3] = *(const uint16_t *)&lpm->tbl24[idx >> 32];
-
-	/* get 4 indexes for tbl8[]. */
-	i8.x = _mm_and_si128(ip, mask8);
-
-	pt = (uint64_t)tbl[0] |
-		(uint64_t)tbl[1] << 16 |
-		(uint64_t)tbl[2] << 32 |
-		(uint64_t)tbl[3] << 48;
-
-	/* search successfully finished for all 4 IP addresses. */
-	if (likely((pt & mask_xv) == mask_v)) {
-		uintptr_t ph = (uintptr_t)hop;
-		*(uint64_t *)ph = pt & RTE_LPM_MASKX4_RES;
-		return;
-	}
-
-	if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
-			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
-		i8.u32[0] = i8.u32[0] +
-			(uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
-		tbl[0] = *(const uint16_t *)&lpm->tbl8[i8.u32[0]];
-	}
-	if (unlikely((pt >> 16 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
-			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
-		i8.u32[1] = i8.u32[1] +
-			(uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
-		tbl[1] = *(const uint16_t *)&lpm->tbl8[i8.u32[1]];
-	}
-	if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
-			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
-		i8.u32[2] = i8.u32[2] +
-			(uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
-		tbl[2] = *(const uint16_t *)&lpm->tbl8[i8.u32[2]];
-	}
-	if (unlikely((pt >> 48 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
-			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
-		i8.u32[3] = i8.u32[3] +
-			(uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
-		tbl[3] = *(const uint16_t *)&lpm->tbl8[i8.u32[3]];
-	}
-
-	hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[0] : defv;
-	hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[1] : defv;
-	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[2] : defv;
-	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[3] : defv;
-}
+#include "rte_lpm_sse.h"
 
 #ifdef __cplusplus
 }
diff --git a/lib/librte_lpm/rte_lpm_sse.h b/lib/librte_lpm/rte_lpm_sse.h
new file mode 100644
index 0000000..2b7eeec
--- /dev/null
+++ b/lib/librte_lpm/rte_lpm_sse.h
@@ -0,0 +1,143 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_LPM_SSE_H_
+#define _RTE_LPM_SSE_H_
+
+#include <rte_branch_prediction.h>
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_vect.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static inline void
+rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint16_t hop[4],
+		 uint16_t defv)
+{
+	__m128i i24;		/* tbl24[] indexes, one per 32-bit lane */
+	rte_xmm_t i8;		/* tbl8[] indexes, one per 32-bit lane */
+	uint16_t tbl[4];	/* table entry fetched for each address */
+	uint64_t idx, pt;	/* idx: 2 packed indexes; pt: tbl[0..3] packed */
+	/* SSE lookup of the 4 IP addresses in ip; hop[n] answers lane n. */
+	const __m128i mask8 =	/* 0xff in every 32-bit lane */
+		_mm_set_epi32(UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX);
+
+	/*
+	 * RTE_LPM_VALID_EXT_ENTRY_BITMASK replicated into each 16-bit slot
+	 * of one 64-bit value (0x0300030003000300), matching pt's layout.
+	 */
+	const uint64_t mask_xv =
+		((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
+		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 16 |
+		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32 |
+		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 48);
+
+	/*
+	 * RTE_LPM_LOOKUP_SUCCESS replicated into each 16-bit slot
+	 * of one 64-bit value (0x0100010001000100), matching pt's layout.
+	 */
+	const uint64_t mask_v =
+		((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
+		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 16 |
+		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32 |
+		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 48);
+
+	/* get 4 indexes for tbl24[]: drop the low CHAR_BIT bits per lane. */
+	i24 = _mm_srli_epi32(ip, CHAR_BIT);
+
+	/* extract values from tbl24[], taking two lanes per 64-bit move. */
+	idx = _mm_cvtsi128_si64(i24);	/* lanes 0 and 1 */
+	i24 = _mm_srli_si128(i24, sizeof(uint64_t)); /* lanes 2,3 -> low half */
+
+	tbl[0] = *(const uint16_t *)&lpm->tbl24[(uint32_t)idx];
+	tbl[1] = *(const uint16_t *)&lpm->tbl24[idx >> 32];
+
+	idx = _mm_cvtsi128_si64(i24);	/* lanes 2 and 3 */
+
+	tbl[2] = *(const uint16_t *)&lpm->tbl24[(uint32_t)idx];
+	tbl[3] = *(const uint16_t *)&lpm->tbl24[idx >> 32];
+
+	/* get 4 indexes for tbl8[]: keep only the low byte of each lane. */
+	i8.x = _mm_and_si128(ip, mask8);
+
+	pt = (uint64_t)tbl[0] |		/* tbl[0] sits in the lowest 16 bits */
+		(uint64_t)tbl[1] << 16 |
+		(uint64_t)tbl[2] << 32 |
+		(uint64_t)tbl[3] << 48;
+
+	/* fast path: all 4 lookups succeeded without needing tbl8[]. */
+	if (likely((pt & mask_xv) == mask_v)) {
+		uintptr_t ph = (uintptr_t)hop; /* write hop[0..3] in one go */
+		*(uint64_t *)ph = pt & RTE_LPM_MASKX4_RES;
+		return;	/* NOTE(review): slot n == hop[n] only if little-endian */
+	}
+	/* otherwise redo, via tbl8[], each lane that has both bits set. */
+	if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {	/* lane 0 */
+		i8.u32[0] = i8.u32[0] +	/* ip low byte + group offset */
+			(uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		tbl[0] = *(const uint16_t *)&lpm->tbl8[i8.u32[0]];
+	}
+	if (unlikely((pt >> 16 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {	/* lane 1 */
+		i8.u32[1] = i8.u32[1] +
+			(uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		tbl[1] = *(const uint16_t *)&lpm->tbl8[i8.u32[1]];
+	}
+	if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {	/* lane 2 */
+		i8.u32[2] = i8.u32[2] +
+			(uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		tbl[2] = *(const uint16_t *)&lpm->tbl8[i8.u32[2]];
+	}
+	if (unlikely((pt >> 48 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {	/* lane 3 */
+		i8.u32[3] = i8.u32[3] +
+			(uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		tbl[3] = *(const uint16_t *)&lpm->tbl8[i8.u32[3]];
+	}
+	/* low byte of each entry when its success bit is set, else defv. */
+	hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[0] : defv;
+	hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[1] : defv;
+	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[2] : defv;
+	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? (uint8_t)tbl[3] : defv;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_LPM_SSE_H_ */