lpm: add lookup x4 with x4 default values

Message ID 20200111160827.10021-1-pbhagavatula@marvell.com (mailing list archive)
State Rejected, archived
Delegated to: Thomas Monjalon
Headers
Series lpm: add lookup x4 with x4 default values |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/iol-intel-Performance fail Performance Testing issues
ci/iol-testing success Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-nxp-Performance success Performance Testing PASS
ci/travis-robot success Travis build: passed
ci/Intel-compilation success Compilation OK

Commit Message

Pavan Nikhilesh Bhagavatula Jan. 11, 2020, 4:08 p.m. UTC
  From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Add lookup x4 with x4 default values.
This can be used in use cases where we have to process a burst of packets
from different ports.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 app/test/test_lpm_perf.c         |  31 +++++++++
 lib/librte_lpm/rte_lpm.h         |  23 +++++++
 lib/librte_lpm/rte_lpm_altivec.h | 109 +++++++++++++++++++++++++++++++
 lib/librte_lpm/rte_lpm_neon.h    | 102 +++++++++++++++++++++++++++++
 lib/librte_lpm/rte_lpm_sse.h     | 104 +++++++++++++++++++++++++++++
 5 files changed, 369 insertions(+)
  

Comments

Vladimir Medvedkin Jan. 13, 2020, 11:06 a.m. UTC | #1
Hi Pavan,

I don't think it is a good idea to add an extra function because:

1) it is just a copy of the existing rte_lpm_lookupx4() except for the last 4 
ternary ops

2) What is a real-world use case for that? Usually the returned value is 
used as an index into an array of next_hop structs.

3) You can have the same result by using a special unused defv and 
pcmpeqd/vpblendd on hop[4] after the lookup

On 11/01/2020 16:08, pbhagavatula@marvell.com wrote:
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Add lookup x4 with x4 default values.
> This can be used in usecases where we have to process  burst of packets
> from different ports.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
>   app/test/test_lpm_perf.c         |  31 +++++++++
>   lib/librte_lpm/rte_lpm.h         |  23 +++++++
>   lib/librte_lpm/rte_lpm_altivec.h | 109 +++++++++++++++++++++++++++++++
>   lib/librte_lpm/rte_lpm_neon.h    | 102 +++++++++++++++++++++++++++++
>   lib/librte_lpm/rte_lpm_sse.h     | 104 +++++++++++++++++++++++++++++
>   5 files changed, 369 insertions(+)
>
> diff --git a/app/test/test_lpm_perf.c b/app/test/test_lpm_perf.c
> index a2578fe90..8e9d4c7eb 100644
> --- a/app/test/test_lpm_perf.c
> +++ b/app/test/test_lpm_perf.c
> @@ -460,6 +460,37 @@ test_lpm_perf(void)
>   			(double)total_time / ((double)ITERATIONS * BATCH_SIZE),
>   			(count * 100.0) / (double)(ITERATIONS * BATCH_SIZE));
>   
> +	/* Measure LookupX4 DefaultX4 */
> +	total_time = 0;
> +	count = 0;
> +	uint32_t def[4] = {UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX};
> +	for (i = 0; i < ITERATIONS; i++) {
> +		static uint32_t ip_batch[BATCH_SIZE];
> +		uint32_t next_hops[4];
> +
> +		/* Create array of random IP addresses */
> +		for (j = 0; j < BATCH_SIZE; j++)
> +			ip_batch[j] = rte_rand();
> +
> +		/* Lookup per batch */
> +		begin = rte_rdtsc();
> +		for (j = 0; j < BATCH_SIZE; j += RTE_DIM(next_hops)) {
> +			unsigned int k;
> +			xmm_t ipx4;
> +
> +			ipx4 = vect_loadu_sil128((xmm_t *)(ip_batch + j));
> +			ipx4 = *(xmm_t *)(ip_batch + j);
> +			rte_lpm_lookupx4_defx4(lpm, ipx4, next_hops, def);
> +			for (k = 0; k < RTE_DIM(next_hops); k++)
> +				if (unlikely(next_hops[k] == UINT32_MAX))
> +					count++;
> +		}
> +
> +		total_time += rte_rdtsc() - begin;
> +	}
> +	printf("LPM LookupX4 Defx4: %.1f cycles (fails = %.1f%%)\n",
> +			(double)total_time / ((double)ITERATIONS * BATCH_SIZE),
> +			(count * 100.0) / (double)(ITERATIONS * BATCH_SIZE));
>   	/* Measure Delete */
>   	status = 0;
>   	begin = rte_rdtsc();
> diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
> index b9d49ac87..e66b43e06 100644
> --- a/lib/librte_lpm/rte_lpm.h
> +++ b/lib/librte_lpm/rte_lpm.h
> @@ -370,6 +370,29 @@ static inline void
>   rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
>   	uint32_t defv);
>   
> +/**
> + * Lookup four IP addresses in an LPM table.
> + *
> + * @param lpm
> + *   LPM object handle
> + * @param ip
> + *   Four IPs to be looked up in the LPM table
> + * @param hop
> + *   Next hop of the most specific rule found for IP (valid on lookup hit only).
> + *   This is an 4 elements array of two byte values.
> + *   If the lookup was successful for the given IP, then least significant byte
> + *   of the corresponding element is the  actual next hop and the most
> + *   significant byte is zero.
> + *   If the lookup for the given IP failed, then corresponding element would
> + *   contain default value, see description of then next parameter.
> + * @param defv
> + *   Default value[] to populate into corresponding element of hop[] array,
> + *   if lookup would fail.
> + */
> +static inline void
> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> +	uint32_t defv[4]);
> +
>   #if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
>   #include "rte_lpm_neon.h"
>   #elif defined(RTE_ARCH_PPC_64)
> diff --git a/lib/librte_lpm/rte_lpm_altivec.h b/lib/librte_lpm/rte_lpm_altivec.h
> index 228c41b38..1afc7bd74 100644
> --- a/lib/librte_lpm/rte_lpm_altivec.h
> +++ b/lib/librte_lpm/rte_lpm_altivec.h
> @@ -120,6 +120,115 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
>   	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv;
>   }
>   
> +static inline void
> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> +	uint32_t defv[4])
> +{
> +	vector signed int i24;
> +	rte_xmm_t i8;
> +	uint32_t tbl[4];
> +	uint64_t idx, pt, pt2;
> +	const uint32_t *ptbl;
> +
> +	const uint32_t mask = UINT8_MAX;
> +	const vector signed int mask8 = (xmm_t){mask, mask, mask, mask};
> +
> +	/*
> +	 * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries
> +	 * as one 64-bit value (0x0300000003000000).
> +	 */
> +	const uint64_t mask_xv =
> +		((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
> +		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32);
> +
> +	/*
> +	 * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries
> +	 * as one 64-bit value (0x0100000001000000).
> +	 */
> +	const uint64_t mask_v =
> +		((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
> +		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32);
> +
> +	/* get 4 indexes for tbl24[]. */
> +	i24 = vec_sr((xmm_t) ip,
> +		(vector unsigned int){CHAR_BIT, CHAR_BIT, CHAR_BIT, CHAR_BIT});
> +
> +	/* extract values from tbl24[] */
> +	idx = (uint32_t)i24[0];
> +	idx = idx < (1<<24) ? idx : (1<<24)-1;
> +	ptbl = (const uint32_t *)&lpm->tbl24[idx];
> +	tbl[0] = *ptbl;
> +
> +	idx = (uint32_t) i24[1];
> +	idx = idx < (1<<24) ? idx : (1<<24)-1;
> +	ptbl = (const uint32_t *)&lpm->tbl24[idx];
> +	tbl[1] = *ptbl;
> +
> +	idx = (uint32_t) i24[2];
> +	idx = idx < (1<<24) ? idx : (1<<24)-1;
> +	ptbl = (const uint32_t *)&lpm->tbl24[idx];
> +	tbl[2] = *ptbl;
> +
> +	idx = (uint32_t) i24[3];
> +	idx = idx < (1<<24) ? idx : (1<<24)-1;
> +	ptbl = (const uint32_t *)&lpm->tbl24[idx];
> +	tbl[3] = *ptbl;
> +
> +	/* get 4 indexes for tbl8[]. */
> +	i8.x = vec_and(ip, mask8);
> +
> +	pt = (uint64_t)tbl[0] |
> +		(uint64_t)tbl[1] << 32;
> +	pt2 = (uint64_t)tbl[2] |
> +		(uint64_t)tbl[3] << 32;
> +
> +	/* search successfully finished for all 4 IP addresses. */
> +	if (likely((pt & mask_xv) == mask_v) &&
> +			likely((pt2 & mask_xv) == mask_v)) {
> +		*(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES;
> +		*(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES;
> +		return;
> +	}
> +
> +	if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> +		i8.u32[0] = i8.u32[0] +
> +			(uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]];
> +		tbl[0] = *ptbl;
> +	}
> +	if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> +		i8.u32[1] = i8.u32[1] +
> +			(uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]];
> +		tbl[1] = *ptbl;
> +	}
> +	if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> +		i8.u32[2] = i8.u32[2] +
> +			(uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]];
> +		tbl[2] = *ptbl;
> +	}
> +	if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> +		i8.u32[3] = i8.u32[3] +
> +			(uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]];
> +		tbl[3] = *ptbl;
> +	}
> +
> +	hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF :
> +									defv[0];
> +	hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF :
> +									defv[1];
> +	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF :
> +									defv[2];
> +	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF :
> +									defv[3];
> +}
> +
>   #ifdef __cplusplus
>   }
>   #endif
> diff --git a/lib/librte_lpm/rte_lpm_neon.h b/lib/librte_lpm/rte_lpm_neon.h
> index 6c131d312..6ef635b18 100644
> --- a/lib/librte_lpm/rte_lpm_neon.h
> +++ b/lib/librte_lpm/rte_lpm_neon.h
> @@ -113,6 +113,108 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
>   	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv;
>   }
>   
> +static inline void
> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> +	uint32_t defv[4])
> +{
> +	uint32x4_t i24;
> +	rte_xmm_t i8;
> +	uint32_t tbl[4];
> +	uint64_t idx, pt, pt2;
> +	const uint32_t *ptbl;
> +
> +	const uint32_t mask = UINT8_MAX;
> +	const int32x4_t mask8 = vdupq_n_s32(mask);
> +
> +	/*
> +	 * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries
> +	 * as one 64-bit value (0x0300000003000000).
> +	 */
> +	const uint64_t mask_xv =
> +		((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
> +		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32);
> +
> +	/*
> +	 * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries
> +	 * as one 64-bit value (0x0100000001000000).
> +	 */
> +	const uint64_t mask_v =
> +		((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
> +		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32);
> +
> +	/* get 4 indexes for tbl24[]. */
> +	i24 = vshrq_n_u32((uint32x4_t)ip, CHAR_BIT);
> +
> +	/* extract values from tbl24[] */
> +	idx = vgetq_lane_u64((uint64x2_t)i24, 0);
> +
> +	ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx];
> +	tbl[0] = *ptbl;
> +	ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32];
> +	tbl[1] = *ptbl;
> +
> +	idx = vgetq_lane_u64((uint64x2_t)i24, 1);
> +
> +	ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx];
> +	tbl[2] = *ptbl;
> +	ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32];
> +	tbl[3] = *ptbl;
> +
> +	/* get 4 indexes for tbl8[]. */
> +	i8.x = vandq_s32(ip, mask8);
> +
> +	pt = (uint64_t)tbl[0] |
> +		(uint64_t)tbl[1] << 32;
> +	pt2 = (uint64_t)tbl[2] |
> +		(uint64_t)tbl[3] << 32;
> +
> +	/* search successfully finished for all 4 IP addresses. */
> +	if (likely((pt & mask_xv) == mask_v) &&
> +			likely((pt2 & mask_xv) == mask_v)) {
> +		*(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES;
> +		*(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES;
> +		return;
> +	}
> +
> +	if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> +		i8.u32[0] = i8.u32[0] +
> +			(uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]];
> +		tbl[0] = *ptbl;
> +	}
> +	if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> +		i8.u32[1] = i8.u32[1] +
> +			(uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]];
> +		tbl[1] = *ptbl;
> +	}
> +	if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> +		i8.u32[2] = i8.u32[2] +
> +			(uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]];
> +		tbl[2] = *ptbl;
> +	}
> +	if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> +		i8.u32[3] = i8.u32[3] +
> +			(uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]];
> +		tbl[3] = *ptbl;
> +	}
> +
> +	hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF :
> +									defv[0];
> +	hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF :
> +									defv[1];
> +	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF :
> +									defv[2];
> +	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF :
> +									defv[3];
> +}
> +
>   #ifdef __cplusplus
>   }
>   #endif
> diff --git a/lib/librte_lpm/rte_lpm_sse.h b/lib/librte_lpm/rte_lpm_sse.h
> index 44770b6ff..6ef15816c 100644
> --- a/lib/librte_lpm/rte_lpm_sse.h
> +++ b/lib/librte_lpm/rte_lpm_sse.h
> @@ -114,6 +114,110 @@ rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
>   	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv;
>   }
>   
> +static inline void
> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> +	uint32_t defv[4])
> +{
> +	__m128i i24;
> +	rte_xmm_t i8;
> +	uint32_t tbl[4];
> +	uint64_t idx, pt, pt2;
> +	const uint32_t *ptbl;
> +
> +	const __m128i mask8 =
> +		_mm_set_epi32(UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX);
> +
> +	/*
> +	 * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries
> +	 * as one 64-bit value (0x0300000003000000).
> +	 */
> +	const uint64_t mask_xv =
> +		((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
> +		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32);
> +
> +	/*
> +	 * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries
> +	 * as one 64-bit value (0x0100000001000000).
> +	 */
> +	const uint64_t mask_v =
> +		((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
> +		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32);
> +
> +	/* get 4 indexes for tbl24[]. */
> +	i24 = _mm_srli_epi32(ip, CHAR_BIT);
> +
> +	/* extract values from tbl24[] */
> +	idx = _mm_cvtsi128_si64(i24);
> +	/* With -O0 option, gcc 4.8 - 5.4 fails to fold sizeof() into a constant */
> +	i24 = _mm_srli_si128(i24, /* sizeof(uint64_t) */ 8);
> +
> +	ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx];
> +	tbl[0] = *ptbl;
> +	ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32];
> +	tbl[1] = *ptbl;
> +
> +	idx = _mm_cvtsi128_si64(i24);
> +
> +	ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx];
> +	tbl[2] = *ptbl;
> +	ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32];
> +	tbl[3] = *ptbl;
> +
> +	/* get 4 indexes for tbl8[]. */
> +	i8.x = _mm_and_si128(ip, mask8);
> +
> +	pt = (uint64_t)tbl[0] |
> +		(uint64_t)tbl[1] << 32;
> +	pt2 = (uint64_t)tbl[2] |
> +		(uint64_t)tbl[3] << 32;
> +
> +	/* search successfully finished for all 4 IP addresses. */
> +	if (likely((pt & mask_xv) == mask_v) &&
> +			likely((pt2 & mask_xv) == mask_v)) {
> +		*(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES;
> +		*(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES;
> +		return;
> +	}
> +
> +	if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> +		i8.u32[0] = i8.u32[0] +
> +			(uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]];
> +		tbl[0] = *ptbl;
> +	}
> +	if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> +		i8.u32[1] = i8.u32[1] +
> +			(uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]];
> +		tbl[1] = *ptbl;
> +	}
> +	if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> +		i8.u32[2] = i8.u32[2] +
> +			(uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]];
> +		tbl[2] = *ptbl;
> +	}
> +	if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
> +		i8.u32[3] = i8.u32[3] +
> +			(uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]];
> +		tbl[3] = *ptbl;
> +	}
> +
> +	hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF :
> +									defv[0];
> +	hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF :
> +									defv[1];
> +	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF :
> +									defv[2];
> +	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF :
> +									defv[3];
> +}
> +
>   #ifdef __cplusplus
>   }
>   #endif
  
Pavan Nikhilesh Bhagavatula Jan. 13, 2020, 12:34 p.m. UTC | #2
>-----Original Message-----
>From: dev <dev-bounces@dpdk.org> On Behalf Of Medvedkin,
>Vladimir
>Sent: Monday, January 13, 2020 4:37 PM
>To: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>; Jerin
>Jacob Kollanukkaran <jerinj@marvell.com>; Bruce Richardson
><bruce.richardson@intel.com>; Gavin Hu <gavin.hu@arm.com>
>Cc: dev@dpdk.org
>Subject: Re: [dpdk-dev] [PATCH] lmp: add lookup x4 with x4 default
>values
>
>Hi Pavan,
>

Hi Medvedkin,

>I don't think it is a good idea to add extra function because:
>
>1) it is just a copy of an existing rte_lpm_lookupx4() except the last 4
>ternary ops

Yes, but I had no other option, as modifying the current function would break the ABI ☹.

>
>2) What is a real world use case for that? Usually returned value is
>used as an index in an array of next_hop structs.

If we take l3fwd as an example, the next hop holds the fwd port_id, whereas the default value 
passed holds mbuf->port. This allows Tx without having a branch. 

Event devices can aggregate packets from multiple Ethernet ports and schedule them on 
a core. The current API requires us to pass a BAD_PORT and compare the result for every 
packet, but if we are allowed to pass 4 different default values we could seamlessly send 
them for Tx.

>
>3) You can have the same result by using special unused defv and
>pcmpeqd/vpblendd on a hop[4] after lookup

Yes, but sadly that would be architecture dependent.

>
>On 11/01/2020 16:08, pbhagavatula@marvell.com wrote:
>> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>
>> Add lookup x4 with x4 default values.
>> This can be used in usecases where we have to process  burst of
>packets
>> from different ports.
>>
>> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
>> ---
>>   app/test/test_lpm_perf.c         |  31 +++++++++
>>   lib/librte_lpm/rte_lpm.h         |  23 +++++++
>>   lib/librte_lpm/rte_lpm_altivec.h | 109
>+++++++++++++++++++++++++++++++
>>   lib/librte_lpm/rte_lpm_neon.h    | 102
>+++++++++++++++++++++++++++++
>>   lib/librte_lpm/rte_lpm_sse.h     | 104
>+++++++++++++++++++++++++++++
>>   5 files changed, 369 insertions(+)
>>
>> diff --git a/app/test/test_lpm_perf.c b/app/test/test_lpm_perf.c
>> index a2578fe90..8e9d4c7eb 100644
>> --- a/app/test/test_lpm_perf.c
>> +++ b/app/test/test_lpm_perf.c
>> @@ -460,6 +460,37 @@ test_lpm_perf(void)
>>   			(double)total_time / ((double)ITERATIONS *
>BATCH_SIZE),
>>   			(count * 100.0) / (double)(ITERATIONS *
>BATCH_SIZE));
>>
>> +	/* Measure LookupX4 DefaultX4 */
>> +	total_time = 0;
>> +	count = 0;
>> +	uint32_t def[4] = {UINT32_MAX, UINT32_MAX, UINT32_MAX,
>UINT32_MAX};
>> +	for (i = 0; i < ITERATIONS; i++) {
>> +		static uint32_t ip_batch[BATCH_SIZE];
>> +		uint32_t next_hops[4];
>> +
>> +		/* Create array of random IP addresses */
>> +		for (j = 0; j < BATCH_SIZE; j++)
>> +			ip_batch[j] = rte_rand();
>> +
>> +		/* Lookup per batch */
>> +		begin = rte_rdtsc();
>> +		for (j = 0; j < BATCH_SIZE; j += RTE_DIM(next_hops)) {
>> +			unsigned int k;
>> +			xmm_t ipx4;
>> +
>> +			ipx4 = vect_loadu_sil128((xmm_t *)(ip_batch +
>j));
>> +			ipx4 = *(xmm_t *)(ip_batch + j);
>> +			rte_lpm_lookupx4_defx4(lpm, ipx4, next_hops,
>def);
>> +			for (k = 0; k < RTE_DIM(next_hops); k++)
>> +				if (unlikely(next_hops[k] ==
>UINT32_MAX))
>> +					count++;
>> +		}
>> +
>> +		total_time += rte_rdtsc() - begin;
>> +	}
>> +	printf("LPM LookupX4 Defx4: %.1f cycles (fails = %.1f%%)\n",
>> +			(double)total_time / ((double)ITERATIONS *
>BATCH_SIZE),
>> +			(count * 100.0) / (double)(ITERATIONS *
>BATCH_SIZE));
>>   	/* Measure Delete */
>>   	status = 0;
>>   	begin = rte_rdtsc();
>> diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
>> index b9d49ac87..e66b43e06 100644
>> --- a/lib/librte_lpm/rte_lpm.h
>> +++ b/lib/librte_lpm/rte_lpm.h
>> @@ -370,6 +370,29 @@ static inline void
>>   rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t
>hop[4],
>>   	uint32_t defv);
>>
>> +/**
>> + * Lookup four IP addresses in an LPM table.
>> + *
>> + * @param lpm
>> + *   LPM object handle
>> + * @param ip
>> + *   Four IPs to be looked up in the LPM table
>> + * @param hop
>> + *   Next hop of the most specific rule found for IP (valid on lookup
>hit only).
>> + *   This is an 4 elements array of two byte values.
>> + *   If the lookup was successful for the given IP, then least significant
>byte
>> + *   of the corresponding element is the  actual next hop and the
>most
>> + *   significant byte is zero.
>> + *   If the lookup for the given IP failed, then corresponding element
>would
>> + *   contain default value, see description of then next parameter.
>> + * @param defv
>> + *   Default value[] to populate into corresponding element of hop[]
>array,
>> + *   if lookup would fail.
>> + */
>> +static inline void
>> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip,
>uint32_t hop[4],
>> +	uint32_t defv[4]);
>> +
>>   #if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
>>   #include "rte_lpm_neon.h"
>>   #elif defined(RTE_ARCH_PPC_64)
>> diff --git a/lib/librte_lpm/rte_lpm_altivec.h
>b/lib/librte_lpm/rte_lpm_altivec.h
>> index 228c41b38..1afc7bd74 100644
>> --- a/lib/librte_lpm/rte_lpm_altivec.h
>> +++ b/lib/librte_lpm/rte_lpm_altivec.h
>> @@ -120,6 +120,115 @@ rte_lpm_lookupx4(const struct rte_lpm
>*lpm, xmm_t ip, uint32_t hop[4],
>>   	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] &
>0x00FFFFFF : defv;
>>   }
>>
>> +static inline void
>> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip,
>uint32_t hop[4],
>> +	uint32_t defv[4])
>> +{
>> +	vector signed int i24;
>> +	rte_xmm_t i8;
>> +	uint32_t tbl[4];
>> +	uint64_t idx, pt, pt2;
>> +	const uint32_t *ptbl;
>> +
>> +	const uint32_t mask = UINT8_MAX;
>> +	const vector signed int mask8 = (xmm_t){mask, mask, mask,
>mask};
>> +
>> +	/*
>> +	 * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries
>> +	 * as one 64-bit value (0x0300000003000000).
>> +	 */
>> +	const uint64_t mask_xv =
>> +		((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
>> +		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK <<
>32);
>> +
>> +	/*
>> +	 * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries
>> +	 * as one 64-bit value (0x0100000001000000).
>> +	 */
>> +	const uint64_t mask_v =
>> +		((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
>> +		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32);
>> +
>> +	/* get 4 indexes for tbl24[]. */
>> +	i24 = vec_sr((xmm_t) ip,
>> +		(vector unsigned int){CHAR_BIT, CHAR_BIT, CHAR_BIT,
>CHAR_BIT});
>> +
>> +	/* extract values from tbl24[] */
>> +	idx = (uint32_t)i24[0];
>> +	idx = idx < (1<<24) ? idx : (1<<24)-1;
>> +	ptbl = (const uint32_t *)&lpm->tbl24[idx];
>> +	tbl[0] = *ptbl;
>> +
>> +	idx = (uint32_t) i24[1];
>> +	idx = idx < (1<<24) ? idx : (1<<24)-1;
>> +	ptbl = (const uint32_t *)&lpm->tbl24[idx];
>> +	tbl[1] = *ptbl;
>> +
>> +	idx = (uint32_t) i24[2];
>> +	idx = idx < (1<<24) ? idx : (1<<24)-1;
>> +	ptbl = (const uint32_t *)&lpm->tbl24[idx];
>> +	tbl[2] = *ptbl;
>> +
>> +	idx = (uint32_t) i24[3];
>> +	idx = idx < (1<<24) ? idx : (1<<24)-1;
>> +	ptbl = (const uint32_t *)&lpm->tbl24[idx];
>> +	tbl[3] = *ptbl;
>> +
>> +	/* get 4 indexes for tbl8[]. */
>> +	i8.x = vec_and(ip, mask8);
>> +
>> +	pt = (uint64_t)tbl[0] |
>> +		(uint64_t)tbl[1] << 32;
>> +	pt2 = (uint64_t)tbl[2] |
>> +		(uint64_t)tbl[3] << 32;
>> +
>> +	/* search successfully finished for all 4 IP addresses. */
>> +	if (likely((pt & mask_xv) == mask_v) &&
>> +			likely((pt2 & mask_xv) == mask_v)) {
>> +		*(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES;
>> +		*(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES;
>> +		return;
>> +	}
>> +
>> +	if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>> +		i8.u32[0] = i8.u32[0] +
>> +			(uint8_t)tbl[0] *
>RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]];
>> +		tbl[0] = *ptbl;
>> +	}
>> +	if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK)
>==
>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>> +		i8.u32[1] = i8.u32[1] +
>> +			(uint8_t)tbl[1] *
>RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]];
>> +		tbl[1] = *ptbl;
>> +	}
>> +	if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>> +		i8.u32[2] = i8.u32[2] +
>> +			(uint8_t)tbl[2] *
>RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]];
>> +		tbl[2] = *ptbl;
>> +	}
>> +	if (unlikely((pt2 >> 32 &
>RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>> +		i8.u32[3] = i8.u32[3] +
>> +			(uint8_t)tbl[3] *
>RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]];
>> +		tbl[3] = *ptbl;
>> +	}
>> +
>> +	hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] &
>0x00FFFFFF :
>> +
>	defv[0];
>> +	hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] &
>0x00FFFFFF :
>> +
>	defv[1];
>> +	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] &
>0x00FFFFFF :
>> +
>	defv[2];
>> +	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] &
>0x00FFFFFF :
>> +
>	defv[3];
>> +}
>> +
>>   #ifdef __cplusplus
>>   }
>>   #endif
>> diff --git a/lib/librte_lpm/rte_lpm_neon.h
>b/lib/librte_lpm/rte_lpm_neon.h
>> index 6c131d312..6ef635b18 100644
>> --- a/lib/librte_lpm/rte_lpm_neon.h
>> +++ b/lib/librte_lpm/rte_lpm_neon.h
>> @@ -113,6 +113,108 @@ rte_lpm_lookupx4(const struct rte_lpm
>*lpm, xmm_t ip, uint32_t hop[4],
>>   	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] &
>0x00FFFFFF : defv;
>>   }
>>
>> +static inline void
>> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip,
>uint32_t hop[4],
>> +	uint32_t defv[4])
>> +{
>> +	uint32x4_t i24;
>> +	rte_xmm_t i8;
>> +	uint32_t tbl[4];
>> +	uint64_t idx, pt, pt2;
>> +	const uint32_t *ptbl;
>> +
>> +	const uint32_t mask = UINT8_MAX;
>> +	const int32x4_t mask8 = vdupq_n_s32(mask);
>> +
>> +	/*
>> +	 * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries
>> +	 * as one 64-bit value (0x0300000003000000).
>> +	 */
>> +	const uint64_t mask_xv =
>> +		((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
>> +		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK <<
>32);
>> +
>> +	/*
>> +	 * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries
>> +	 * as one 64-bit value (0x0100000001000000).
>> +	 */
>> +	const uint64_t mask_v =
>> +		((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
>> +		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32);
>> +
>> +	/* get 4 indexes for tbl24[]. */
>> +	i24 = vshrq_n_u32((uint32x4_t)ip, CHAR_BIT);
>> +
>> +	/* extract values from tbl24[] */
>> +	idx = vgetq_lane_u64((uint64x2_t)i24, 0);
>> +
>> +	ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx];
>> +	tbl[0] = *ptbl;
>> +	ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32];
>> +	tbl[1] = *ptbl;
>> +
>> +	idx = vgetq_lane_u64((uint64x2_t)i24, 1);
>> +
>> +	ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx];
>> +	tbl[2] = *ptbl;
>> +	ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32];
>> +	tbl[3] = *ptbl;
>> +
>> +	/* get 4 indexes for tbl8[]. */
>> +	i8.x = vandq_s32(ip, mask8);
>> +
>> +	pt = (uint64_t)tbl[0] |
>> +		(uint64_t)tbl[1] << 32;
>> +	pt2 = (uint64_t)tbl[2] |
>> +		(uint64_t)tbl[3] << 32;
>> +
>> +	/* search successfully finished for all 4 IP addresses. */
>> +	if (likely((pt & mask_xv) == mask_v) &&
>> +			likely((pt2 & mask_xv) == mask_v)) {
>> +		*(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES;
>> +		*(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES;
>> +		return;
>> +	}
>> +
>> +	if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>> +		i8.u32[0] = i8.u32[0] +
>> +			(uint8_t)tbl[0] *
>RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]];
>> +		tbl[0] = *ptbl;
>> +	}
>> +	if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK)
>==
>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>> +		i8.u32[1] = i8.u32[1] +
>> +			(uint8_t)tbl[1] *
>RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]];
>> +		tbl[1] = *ptbl;
>> +	}
>> +	if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>> +		i8.u32[2] = i8.u32[2] +
>> +			(uint8_t)tbl[2] *
>RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]];
>> +		tbl[2] = *ptbl;
>> +	}
>> +	if (unlikely((pt2 >> 32 &
>RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>> +		i8.u32[3] = i8.u32[3] +
>> +			(uint8_t)tbl[3] *
>RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]];
>> +		tbl[3] = *ptbl;
>> +	}
>> +
>> +	hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] &
>0x00FFFFFF :
>> +
>	defv[0];
>> +	hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] &
>0x00FFFFFF :
>> +
>	defv[1];
>> +	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] &
>0x00FFFFFF :
>> +
>	defv[2];
>> +	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] &
>0x00FFFFFF :
>> +
>	defv[3];
>> +}
>> +
>>   #ifdef __cplusplus
>>   }
>>   #endif
>> diff --git a/lib/librte_lpm/rte_lpm_sse.h
>b/lib/librte_lpm/rte_lpm_sse.h
>> index 44770b6ff..6ef15816c 100644
>> --- a/lib/librte_lpm/rte_lpm_sse.h
>> +++ b/lib/librte_lpm/rte_lpm_sse.h
>> @@ -114,6 +114,110 @@ rte_lpm_lookupx4(const struct rte_lpm
>*lpm, xmm_t ip, uint32_t hop[4],
>>   	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] &
>0x00FFFFFF : defv;
>>   }
>>
>> +static inline void
>> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip,
>uint32_t hop[4],
>> +	uint32_t defv[4])
>> +{
>> +	__m128i i24;
>> +	rte_xmm_t i8;
>> +	uint32_t tbl[4];
>> +	uint64_t idx, pt, pt2;
>> +	const uint32_t *ptbl;
>> +
>> +	const __m128i mask8 =
>> +		_mm_set_epi32(UINT8_MAX, UINT8_MAX,
>UINT8_MAX, UINT8_MAX);
>> +
>> +	/*
>> +	 * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries
>> +	 * as one 64-bit value (0x0300000003000000).
>> +	 */
>> +	const uint64_t mask_xv =
>> +		((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
>> +		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK <<
>32);
>> +
>> +	/*
>> +	 * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries
>> +	 * as one 64-bit value (0x0100000001000000).
>> +	 */
>> +	const uint64_t mask_v =
>> +		((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
>> +		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32);
>> +
>> +	/* get 4 indexes for tbl24[]. */
>> +	i24 = _mm_srli_epi32(ip, CHAR_BIT);
>> +
>> +	/* extract values from tbl24[] */
>> +	idx = _mm_cvtsi128_si64(i24);
>> +	/* With -O0 option, gcc 4.8 - 5.4 fails to fold sizeof() into a
>constant */
>> +	i24 = _mm_srli_si128(i24, /* sizeof(uint64_t) */ 8);
>> +
>> +	ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx];
>> +	tbl[0] = *ptbl;
>> +	ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32];
>> +	tbl[1] = *ptbl;
>> +
>> +	idx = _mm_cvtsi128_si64(i24);
>> +
>> +	ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx];
>> +	tbl[2] = *ptbl;
>> +	ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32];
>> +	tbl[3] = *ptbl;
>> +
>> +	/* get 4 indexes for tbl8[]. */
>> +	i8.x = _mm_and_si128(ip, mask8);
>> +
>> +	pt = (uint64_t)tbl[0] |
>> +		(uint64_t)tbl[1] << 32;
>> +	pt2 = (uint64_t)tbl[2] |
>> +		(uint64_t)tbl[3] << 32;
>> +
>> +	/* search successfully finished for all 4 IP addresses. */
>> +	if (likely((pt & mask_xv) == mask_v) &&
>> +			likely((pt2 & mask_xv) == mask_v)) {
>> +		*(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES;
>> +		*(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES;
>> +		return;
>> +	}
>> +
>> +	if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>> +		i8.u32[0] = i8.u32[0] +
>> +			(uint8_t)tbl[0] *
>RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]];
>> +		tbl[0] = *ptbl;
>> +	}
>> +	if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK)
>==
>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>> +		i8.u32[1] = i8.u32[1] +
>> +			(uint8_t)tbl[1] *
>RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]];
>> +		tbl[1] = *ptbl;
>> +	}
>> +	if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>> +		i8.u32[2] = i8.u32[2] +
>> +			(uint8_t)tbl[2] *
>RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]];
>> +		tbl[2] = *ptbl;
>> +	}
>> +	if (unlikely((pt2 >> 32 &
>RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>> +		i8.u32[3] = i8.u32[3] +
>> +			(uint8_t)tbl[3] *
>RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]];
>> +		tbl[3] = *ptbl;
>> +	}
>> +
>> +	hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] &
>0x00FFFFFF :
>> +
>	defv[0];
>> +	hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] &
>0x00FFFFFF :
>> +
>	defv[1];
>> +	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] &
>0x00FFFFFF :
>> +
>	defv[2];
>> +	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] &
>0x00FFFFFF :
>> +
>	defv[3];
>> +}
>> +
>>   #ifdef __cplusplus
>>   }
>>   #endif
>
>--
>Regards,
>Vladimir
  
Vladimir Medvedkin Jan. 13, 2020, 5:48 p.m. UTC | #3
Hi,

On 13/01/2020 12:34, Pavan Nikhilesh Bhagavatula wrote:
>> -----Original Message-----
>> From: dev <dev-bounces@dpdk.org> On Behalf Of Medvedkin,
>> Vladimir
>> Sent: Monday, January 13, 2020 4:37 PM
>> To: Pavan Nikhilesh Bhagavatula <pbhagavatula@marvell.com>; Jerin
>> Jacob Kollanukkaran <jerinj@marvell.com>; Bruce Richardson
>> <bruce.richardson@intel.com>; Gavin Hu <gavin.hu@arm.com>
>> Cc: dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH] lmp: add lookup x4 with x4 default
>> values
>>
>> Hi Pavan,
>>
> Hi Medvedkin,
>
>> I don't think it is a good idea to add extra function because:
>>
>> 1) it is just a copy of an existing rte_lpm_lookupx4() except the last 4
>> ternary ops
> Yes, but I had no other option as modifying the current function will break ABI ☹.
>
>> 2) What is a real world use case for that? Usually returned value is
>> used as an index in an array of next_hop structs.
> If we take l3fwd as an example, the next hop holds the fwd port_id whereas the default value
> passed holds mbuf->port. This allows Tx without having a branch.
>
> Event devices can aggregate packets from multiple ethernet ports and schedule them on
> a core. The current API requires us to pass a BAD_PORT and compare the result for every
> packet but if we are allowed to pass 4 different default values we could seamlessly send
> them for Tx.
>
>> 3) You can have the same result by using special unused defv and
>> pcmpeqd/vpblendd on a hop[4] after lookup
> Yes, but sadly that would be architecture dependent.

But rte_lpm_lookupx4() itself is architecture dependent. My suggestion 
here would be: implement rte_lpm_lookupx4_defx4() in arch specific .c 
files as a wrapper around rte_lpm_lookupx4() and do the pcmpeqd/vpblendd 
stuff afterwards. In this case you won't need to copy all of the already 
implemented code.

>
>> On 11/01/2020 16:08, pbhagavatula@marvell.com wrote:
>>> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>>
>>> Add lookup x4 with x4 default values.
>>> This can be used in usecases where we have to process  burst of
>> packets
>>> from different ports.
>>>
>>> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
>>> ---
>>>    app/test/test_lpm_perf.c         |  31 +++++++++
>>>    lib/librte_lpm/rte_lpm.h         |  23 +++++++
>>>    lib/librte_lpm/rte_lpm_altivec.h | 109
>> +++++++++++++++++++++++++++++++
>>>    lib/librte_lpm/rte_lpm_neon.h    | 102
>> +++++++++++++++++++++++++++++
>>>    lib/librte_lpm/rte_lpm_sse.h     | 104
>> +++++++++++++++++++++++++++++
>>>    5 files changed, 369 insertions(+)
>>>
>>> diff --git a/app/test/test_lpm_perf.c b/app/test/test_lpm_perf.c
>>> index a2578fe90..8e9d4c7eb 100644
>>> --- a/app/test/test_lpm_perf.c
>>> +++ b/app/test/test_lpm_perf.c
>>> @@ -460,6 +460,37 @@ test_lpm_perf(void)
>>>    			(double)total_time / ((double)ITERATIONS *
>> BATCH_SIZE),
>>>    			(count * 100.0) / (double)(ITERATIONS *
>> BATCH_SIZE));
>>> +	/* Measure LookupX4 DefaultX4 */
>>> +	total_time = 0;
>>> +	count = 0;
>>> +	uint32_t def[4] = {UINT32_MAX, UINT32_MAX, UINT32_MAX,
>> UINT32_MAX};
>>> +	for (i = 0; i < ITERATIONS; i++) {
>>> +		static uint32_t ip_batch[BATCH_SIZE];
>>> +		uint32_t next_hops[4];
>>> +
>>> +		/* Create array of random IP addresses */
>>> +		for (j = 0; j < BATCH_SIZE; j++)
>>> +			ip_batch[j] = rte_rand();
>>> +
>>> +		/* Lookup per batch */
>>> +		begin = rte_rdtsc();
>>> +		for (j = 0; j < BATCH_SIZE; j += RTE_DIM(next_hops)) {
>>> +			unsigned int k;
>>> +			xmm_t ipx4;
>>> +
>>> +			ipx4 = vect_loadu_sil128((xmm_t *)(ip_batch +
>> j));
>>> +			ipx4 = *(xmm_t *)(ip_batch + j);
>>> +			rte_lpm_lookupx4_defx4(lpm, ipx4, next_hops,
>> def);
>>> +			for (k = 0; k < RTE_DIM(next_hops); k++)
>>> +				if (unlikely(next_hops[k] ==
>> UINT32_MAX))
>>> +					count++;
>>> +		}
>>> +
>>> +		total_time += rte_rdtsc() - begin;
>>> +	}
>>> +	printf("LPM LookupX4 Defx4: %.1f cycles (fails = %.1f%%)\n",
>>> +			(double)total_time / ((double)ITERATIONS *
>> BATCH_SIZE),
>>> +			(count * 100.0) / (double)(ITERATIONS *
>> BATCH_SIZE));
>>>    	/* Measure Delete */
>>>    	status = 0;
>>>    	begin = rte_rdtsc();
>>> diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
>>> index b9d49ac87..e66b43e06 100644
>>> --- a/lib/librte_lpm/rte_lpm.h
>>> +++ b/lib/librte_lpm/rte_lpm.h
>>> @@ -370,6 +370,29 @@ static inline void
>>>    rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t
>> hop[4],
>>>    	uint32_t defv);
>>>
>>> +/**
>>> + * Lookup four IP addresses in an LPM table.
>>> + *
>>> + * @param lpm
>>> + *   LPM object handle
>>> + * @param ip
>>> + *   Four IPs to be looked up in the LPM table
>>> + * @param hop
>>> + *   Next hop of the most specific rule found for IP (valid on lookup
>> hit only).
>>> + *   This is an 4 elements array of two byte values.
>>> + *   If the lookup was successful for the given IP, then least significant
>> byte
>>> + *   of the corresponding element is the  actual next hop and the
>> most
>>> + *   significant byte is zero.
>>> + *   If the lookup for the given IP failed, then corresponding element
>> would
>>> + *   contain default value, see description of then next parameter.
>>> + * @param defv
>>> + *   Default value[] to populate into corresponding element of hop[]
>> array,
>>> + *   if lookup would fail.
>>> + */
>>> +static inline void
>>> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip,
>> uint32_t hop[4],
>>> +	uint32_t defv[4]);
>>> +
>>>    #if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
>>>    #include "rte_lpm_neon.h"
>>>    #elif defined(RTE_ARCH_PPC_64)
>>> diff --git a/lib/librte_lpm/rte_lpm_altivec.h
>> b/lib/librte_lpm/rte_lpm_altivec.h
>>> index 228c41b38..1afc7bd74 100644
>>> --- a/lib/librte_lpm/rte_lpm_altivec.h
>>> +++ b/lib/librte_lpm/rte_lpm_altivec.h
>>> @@ -120,6 +120,115 @@ rte_lpm_lookupx4(const struct rte_lpm
>> *lpm, xmm_t ip, uint32_t hop[4],
>>>    	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] &
>> 0x00FFFFFF : defv;
>>>    }
>>>
>>> +static inline void
>>> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip,
>> uint32_t hop[4],
>>> +	uint32_t defv[4])
>>> +{
>>> +	vector signed int i24;
>>> +	rte_xmm_t i8;
>>> +	uint32_t tbl[4];
>>> +	uint64_t idx, pt, pt2;
>>> +	const uint32_t *ptbl;
>>> +
>>> +	const uint32_t mask = UINT8_MAX;
>>> +	const vector signed int mask8 = (xmm_t){mask, mask, mask,
>> mask};
>>> +
>>> +	/*
>>> +	 * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries
>>> +	 * as one 64-bit value (0x0300000003000000).
>>> +	 */
>>> +	const uint64_t mask_xv =
>>> +		((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
>>> +		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK <<
>> 32);
>>> +
>>> +	/*
>>> +	 * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries
>>> +	 * as one 64-bit value (0x0100000001000000).
>>> +	 */
>>> +	const uint64_t mask_v =
>>> +		((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
>>> +		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32);
>>> +
>>> +	/* get 4 indexes for tbl24[]. */
>>> +	i24 = vec_sr((xmm_t) ip,
>>> +		(vector unsigned int){CHAR_BIT, CHAR_BIT, CHAR_BIT,
>> CHAR_BIT});
>>> +
>>> +	/* extract values from tbl24[] */
>>> +	idx = (uint32_t)i24[0];
>>> +	idx = idx < (1<<24) ? idx : (1<<24)-1;
>>> +	ptbl = (const uint32_t *)&lpm->tbl24[idx];
>>> +	tbl[0] = *ptbl;
>>> +
>>> +	idx = (uint32_t) i24[1];
>>> +	idx = idx < (1<<24) ? idx : (1<<24)-1;
>>> +	ptbl = (const uint32_t *)&lpm->tbl24[idx];
>>> +	tbl[1] = *ptbl;
>>> +
>>> +	idx = (uint32_t) i24[2];
>>> +	idx = idx < (1<<24) ? idx : (1<<24)-1;
>>> +	ptbl = (const uint32_t *)&lpm->tbl24[idx];
>>> +	tbl[2] = *ptbl;
>>> +
>>> +	idx = (uint32_t) i24[3];
>>> +	idx = idx < (1<<24) ? idx : (1<<24)-1;
>>> +	ptbl = (const uint32_t *)&lpm->tbl24[idx];
>>> +	tbl[3] = *ptbl;
>>> +
>>> +	/* get 4 indexes for tbl8[]. */
>>> +	i8.x = vec_and(ip, mask8);
>>> +
>>> +	pt = (uint64_t)tbl[0] |
>>> +		(uint64_t)tbl[1] << 32;
>>> +	pt2 = (uint64_t)tbl[2] |
>>> +		(uint64_t)tbl[3] << 32;
>>> +
>>> +	/* search successfully finished for all 4 IP addresses. */
>>> +	if (likely((pt & mask_xv) == mask_v) &&
>>> +			likely((pt2 & mask_xv) == mask_v)) {
>>> +		*(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES;
>>> +		*(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES;
>>> +		return;
>>> +	}
>>> +
>>> +	if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
>>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>>> +		i8.u32[0] = i8.u32[0] +
>>> +			(uint8_t)tbl[0] *
>> RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]];
>>> +		tbl[0] = *ptbl;
>>> +	}
>>> +	if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK)
>> ==
>>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>>> +		i8.u32[1] = i8.u32[1] +
>>> +			(uint8_t)tbl[1] *
>> RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]];
>>> +		tbl[1] = *ptbl;
>>> +	}
>>> +	if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
>>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>>> +		i8.u32[2] = i8.u32[2] +
>>> +			(uint8_t)tbl[2] *
>> RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]];
>>> +		tbl[2] = *ptbl;
>>> +	}
>>> +	if (unlikely((pt2 >> 32 &
>> RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
>>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>>> +		i8.u32[3] = i8.u32[3] +
>>> +			(uint8_t)tbl[3] *
>> RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]];
>>> +		tbl[3] = *ptbl;
>>> +	}
>>> +
>>> +	hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] &
>> 0x00FFFFFF :
>>> +
>> 	defv[0];
>>> +	hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] &
>> 0x00FFFFFF :
>>> +
>> 	defv[1];
>>> +	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] &
>> 0x00FFFFFF :
>>> +
>> 	defv[2];
>>> +	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] &
>> 0x00FFFFFF :
>>> +
>> 	defv[3];
>>> +}
>>> +
>>>    #ifdef __cplusplus
>>>    }
>>>    #endif
>>> diff --git a/lib/librte_lpm/rte_lpm_neon.h
>> b/lib/librte_lpm/rte_lpm_neon.h
>>> index 6c131d312..6ef635b18 100644
>>> --- a/lib/librte_lpm/rte_lpm_neon.h
>>> +++ b/lib/librte_lpm/rte_lpm_neon.h
>>> @@ -113,6 +113,108 @@ rte_lpm_lookupx4(const struct rte_lpm
>> *lpm, xmm_t ip, uint32_t hop[4],
>>>    	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] &
>> 0x00FFFFFF : defv;
>>>    }
>>>
>>> +static inline void
>>> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip,
>> uint32_t hop[4],
>>> +	uint32_t defv[4])
>>> +{
>>> +	uint32x4_t i24;
>>> +	rte_xmm_t i8;
>>> +	uint32_t tbl[4];
>>> +	uint64_t idx, pt, pt2;
>>> +	const uint32_t *ptbl;
>>> +
>>> +	const uint32_t mask = UINT8_MAX;
>>> +	const int32x4_t mask8 = vdupq_n_s32(mask);
>>> +
>>> +	/*
>>> +	 * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries
>>> +	 * as one 64-bit value (0x0300000003000000).
>>> +	 */
>>> +	const uint64_t mask_xv =
>>> +		((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
>>> +		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK <<
>> 32);
>>> +
>>> +	/*
>>> +	 * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries
>>> +	 * as one 64-bit value (0x0100000001000000).
>>> +	 */
>>> +	const uint64_t mask_v =
>>> +		((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
>>> +		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32);
>>> +
>>> +	/* get 4 indexes for tbl24[]. */
>>> +	i24 = vshrq_n_u32((uint32x4_t)ip, CHAR_BIT);
>>> +
>>> +	/* extract values from tbl24[] */
>>> +	idx = vgetq_lane_u64((uint64x2_t)i24, 0);
>>> +
>>> +	ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx];
>>> +	tbl[0] = *ptbl;
>>> +	ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32];
>>> +	tbl[1] = *ptbl;
>>> +
>>> +	idx = vgetq_lane_u64((uint64x2_t)i24, 1);
>>> +
>>> +	ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx];
>>> +	tbl[2] = *ptbl;
>>> +	ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32];
>>> +	tbl[3] = *ptbl;
>>> +
>>> +	/* get 4 indexes for tbl8[]. */
>>> +	i8.x = vandq_s32(ip, mask8);
>>> +
>>> +	pt = (uint64_t)tbl[0] |
>>> +		(uint64_t)tbl[1] << 32;
>>> +	pt2 = (uint64_t)tbl[2] |
>>> +		(uint64_t)tbl[3] << 32;
>>> +
>>> +	/* search successfully finished for all 4 IP addresses. */
>>> +	if (likely((pt & mask_xv) == mask_v) &&
>>> +			likely((pt2 & mask_xv) == mask_v)) {
>>> +		*(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES;
>>> +		*(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES;
>>> +		return;
>>> +	}
>>> +
>>> +	if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
>>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>>> +		i8.u32[0] = i8.u32[0] +
>>> +			(uint8_t)tbl[0] *
>> RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]];
>>> +		tbl[0] = *ptbl;
>>> +	}
>>> +	if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK)
>> ==
>>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>>> +		i8.u32[1] = i8.u32[1] +
>>> +			(uint8_t)tbl[1] *
>> RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]];
>>> +		tbl[1] = *ptbl;
>>> +	}
>>> +	if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
>>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>>> +		i8.u32[2] = i8.u32[2] +
>>> +			(uint8_t)tbl[2] *
>> RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]];
>>> +		tbl[2] = *ptbl;
>>> +	}
>>> +	if (unlikely((pt2 >> 32 &
>> RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
>>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>>> +		i8.u32[3] = i8.u32[3] +
>>> +			(uint8_t)tbl[3] *
>> RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]];
>>> +		tbl[3] = *ptbl;
>>> +	}
>>> +
>>> +	hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] &
>> 0x00FFFFFF :
>>> +
>> 	defv[0];
>>> +	hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] &
>> 0x00FFFFFF :
>>> +
>> 	defv[1];
>>> +	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] &
>> 0x00FFFFFF :
>>> +
>> 	defv[2];
>>> +	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] &
>> 0x00FFFFFF :
>>> +
>> 	defv[3];
>>> +}
>>> +
>>>    #ifdef __cplusplus
>>>    }
>>>    #endif
>>> diff --git a/lib/librte_lpm/rte_lpm_sse.h
>> b/lib/librte_lpm/rte_lpm_sse.h
>>> index 44770b6ff..6ef15816c 100644
>>> --- a/lib/librte_lpm/rte_lpm_sse.h
>>> +++ b/lib/librte_lpm/rte_lpm_sse.h
>>> @@ -114,6 +114,110 @@ rte_lpm_lookupx4(const struct rte_lpm
>> *lpm, xmm_t ip, uint32_t hop[4],
>>>    	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] &
>> 0x00FFFFFF : defv;
>>>    }
>>>
>>> +static inline void
>>> +rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip,
>> uint32_t hop[4],
>>> +	uint32_t defv[4])
>>> +{
>>> +	__m128i i24;
>>> +	rte_xmm_t i8;
>>> +	uint32_t tbl[4];
>>> +	uint64_t idx, pt, pt2;
>>> +	const uint32_t *ptbl;
>>> +
>>> +	const __m128i mask8 =
>>> +		_mm_set_epi32(UINT8_MAX, UINT8_MAX,
>> UINT8_MAX, UINT8_MAX);
>>> +
>>> +	/*
>>> +	 * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries
>>> +	 * as one 64-bit value (0x0300000003000000).
>>> +	 */
>>> +	const uint64_t mask_xv =
>>> +		((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
>>> +		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK <<
>> 32);
>>> +
>>> +	/*
>>> +	 * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries
>>> +	 * as one 64-bit value (0x0100000001000000).
>>> +	 */
>>> +	const uint64_t mask_v =
>>> +		((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
>>> +		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32);
>>> +
>>> +	/* get 4 indexes for tbl24[]. */
>>> +	i24 = _mm_srli_epi32(ip, CHAR_BIT);
>>> +
>>> +	/* extract values from tbl24[] */
>>> +	idx = _mm_cvtsi128_si64(i24);
>>> +	/* With -O0 option, gcc 4.8 - 5.4 fails to fold sizeof() into a
>> constant */
>>> +	i24 = _mm_srli_si128(i24, /* sizeof(uint64_t) */ 8);
>>> +
>>> +	ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx];
>>> +	tbl[0] = *ptbl;
>>> +	ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32];
>>> +	tbl[1] = *ptbl;
>>> +
>>> +	idx = _mm_cvtsi128_si64(i24);
>>> +
>>> +	ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx];
>>> +	tbl[2] = *ptbl;
>>> +	ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32];
>>> +	tbl[3] = *ptbl;
>>> +
>>> +	/* get 4 indexes for tbl8[]. */
>>> +	i8.x = _mm_and_si128(ip, mask8);
>>> +
>>> +	pt = (uint64_t)tbl[0] |
>>> +		(uint64_t)tbl[1] << 32;
>>> +	pt2 = (uint64_t)tbl[2] |
>>> +		(uint64_t)tbl[3] << 32;
>>> +
>>> +	/* search successfully finished for all 4 IP addresses. */
>>> +	if (likely((pt & mask_xv) == mask_v) &&
>>> +			likely((pt2 & mask_xv) == mask_v)) {
>>> +		*(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES;
>>> +		*(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES;
>>> +		return;
>>> +	}
>>> +
>>> +	if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
>>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>>> +		i8.u32[0] = i8.u32[0] +
>>> +			(uint8_t)tbl[0] *
>> RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]];
>>> +		tbl[0] = *ptbl;
>>> +	}
>>> +	if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK)
>> ==
>>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>>> +		i8.u32[1] = i8.u32[1] +
>>> +			(uint8_t)tbl[1] *
>> RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]];
>>> +		tbl[1] = *ptbl;
>>> +	}
>>> +	if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
>>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>>> +		i8.u32[2] = i8.u32[2] +
>>> +			(uint8_t)tbl[2] *
>> RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]];
>>> +		tbl[2] = *ptbl;
>>> +	}
>>> +	if (unlikely((pt2 >> 32 &
>> RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
>>> +			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
>>> +		i8.u32[3] = i8.u32[3] +
>>> +			(uint8_t)tbl[3] *
>> RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
>>> +		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]];
>>> +		tbl[3] = *ptbl;
>>> +	}
>>> +
>>> +	hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] &
>> 0x00FFFFFF :
>>> +
>> 	defv[0];
>>> +	hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] &
>> 0x00FFFFFF :
>>> +
>> 	defv[1];
>>> +	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] &
>> 0x00FFFFFF :
>>> +
>> 	defv[2];
>>> +	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] &
>> 0x00FFFFFF :
>>> +
>> 	defv[3];
>>> +}
>>> +
>>>    #ifdef __cplusplus
>>>    }
>>>    #endif
>> --
>> Regards,
>> Vladimir
  

Patch

diff --git a/app/test/test_lpm_perf.c b/app/test/test_lpm_perf.c
index a2578fe90..8e9d4c7eb 100644
--- a/app/test/test_lpm_perf.c
+++ b/app/test/test_lpm_perf.c
@@ -460,6 +460,37 @@  test_lpm_perf(void)
 			(double)total_time / ((double)ITERATIONS * BATCH_SIZE),
 			(count * 100.0) / (double)(ITERATIONS * BATCH_SIZE));
 
+	/* Measure LookupX4 DefaultX4 */
+	total_time = 0;
+	count = 0;
+	uint32_t def[4] = {UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX};
+	for (i = 0; i < ITERATIONS; i++) {
+		static uint32_t ip_batch[BATCH_SIZE];
+		uint32_t next_hops[4];
+
+		/* Create array of random IP addresses */
+		for (j = 0; j < BATCH_SIZE; j++)
+			ip_batch[j] = rte_rand();
+
+		/* Lookup per batch */
+		begin = rte_rdtsc();
+		for (j = 0; j < BATCH_SIZE; j += RTE_DIM(next_hops)) {
+			unsigned int k;
+			xmm_t ipx4;
+
+			ipx4 = vect_loadu_sil128((xmm_t *)(ip_batch + j));
+			ipx4 = *(xmm_t *)(ip_batch + j);
+			rte_lpm_lookupx4_defx4(lpm, ipx4, next_hops, def);
+			for (k = 0; k < RTE_DIM(next_hops); k++)
+				if (unlikely(next_hops[k] == UINT32_MAX))
+					count++;
+		}
+
+		total_time += rte_rdtsc() - begin;
+	}
+	printf("LPM LookupX4 Defx4: %.1f cycles (fails = %.1f%%)\n",
+			(double)total_time / ((double)ITERATIONS * BATCH_SIZE),
+			(count * 100.0) / (double)(ITERATIONS * BATCH_SIZE));
 	/* Measure Delete */
 	status = 0;
 	begin = rte_rdtsc();
diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
index b9d49ac87..e66b43e06 100644
--- a/lib/librte_lpm/rte_lpm.h
+++ b/lib/librte_lpm/rte_lpm.h
@@ -370,6 +370,29 @@  static inline void
 rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
 	uint32_t defv);
 
+/**
+ * Lookup four IP addresses in an LPM table.
+ *
+ * @param lpm
+ *   LPM object handle
+ * @param ip
+ *   Four IPs to be looked up in the LPM table
+ * @param hop
+ *   Next hop of the most specific rule found for IP (valid on lookup hit only).
+ *   This is a 4-element array of 32-bit values.
+ *   If the lookup was successful for the given IP, then the least significant
+ *   24 bits of the corresponding element are the actual next hop and the most
+ *   significant byte is zero.
+ *   If the lookup for the given IP failed, then the corresponding element
+ *   contains the default value, see the description of the next parameter.
+ * @param defv
+ *   Default value[] to populate into corresponding element of hop[] array,
+ *   if lookup would fail.
+ */
+static inline void
+rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
+	uint32_t defv[4]);
+
 #if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
 #include "rte_lpm_neon.h"
 #elif defined(RTE_ARCH_PPC_64)
diff --git a/lib/librte_lpm/rte_lpm_altivec.h b/lib/librte_lpm/rte_lpm_altivec.h
index 228c41b38..1afc7bd74 100644
--- a/lib/librte_lpm/rte_lpm_altivec.h
+++ b/lib/librte_lpm/rte_lpm_altivec.h
@@ -120,6 +120,115 @@  rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
 	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv;
 }
 
+static inline void
+rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
+	uint32_t defv[4])
+{
+	vector signed int i24;
+	rte_xmm_t i8;
+	uint32_t tbl[4];
+	uint64_t idx, pt, pt2;
+	const uint32_t *ptbl;
+
+	const uint32_t mask = UINT8_MAX;
+	const vector signed int mask8 = (xmm_t){mask, mask, mask, mask};
+
+	/*
+	 * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries
+	 * as one 64-bit value (0x0300000003000000).
+	 */
+	const uint64_t mask_xv =
+		((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
+		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32);
+
+	/*
+	 * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries
+	 * as one 64-bit value (0x0100000001000000).
+	 */
+	const uint64_t mask_v =
+		((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
+		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32);
+
+	/* get 4 indexes for tbl24[]. */
+	i24 = vec_sr((xmm_t) ip,
+		(vector unsigned int){CHAR_BIT, CHAR_BIT, CHAR_BIT, CHAR_BIT});
+
+	/* extract values from tbl24[] */
+	idx = (uint32_t)i24[0];
+	idx = idx < (1<<24) ? idx : (1<<24)-1;
+	ptbl = (const uint32_t *)&lpm->tbl24[idx];
+	tbl[0] = *ptbl;
+
+	idx = (uint32_t) i24[1];
+	idx = idx < (1<<24) ? idx : (1<<24)-1;
+	ptbl = (const uint32_t *)&lpm->tbl24[idx];
+	tbl[1] = *ptbl;
+
+	idx = (uint32_t) i24[2];
+	idx = idx < (1<<24) ? idx : (1<<24)-1;
+	ptbl = (const uint32_t *)&lpm->tbl24[idx];
+	tbl[2] = *ptbl;
+
+	idx = (uint32_t) i24[3];
+	idx = idx < (1<<24) ? idx : (1<<24)-1;
+	ptbl = (const uint32_t *)&lpm->tbl24[idx];
+	tbl[3] = *ptbl;
+
+	/* get 4 indexes for tbl8[]. */
+	i8.x = vec_and(ip, mask8);
+
+	pt = (uint64_t)tbl[0] |
+		(uint64_t)tbl[1] << 32;
+	pt2 = (uint64_t)tbl[2] |
+		(uint64_t)tbl[3] << 32;
+
+	/* search successfully finished for all 4 IP addresses. */
+	if (likely((pt & mask_xv) == mask_v) &&
+			likely((pt2 & mask_xv) == mask_v)) {
+		*(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES;
+		*(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES;
+		return;
+	}
+
+	if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[0] = i8.u32[0] +
+			(uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]];
+		tbl[0] = *ptbl;
+	}
+	if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[1] = i8.u32[1] +
+			(uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]];
+		tbl[1] = *ptbl;
+	}
+	if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[2] = i8.u32[2] +
+			(uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]];
+		tbl[2] = *ptbl;
+	}
+	if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[3] = i8.u32[3] +
+			(uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]];
+		tbl[3] = *ptbl;
+	}
+
+	hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF :
+									defv[0];
+	hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF :
+									defv[1];
+	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF :
+									defv[2];
+	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF :
+									defv[3];
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_lpm/rte_lpm_neon.h b/lib/librte_lpm/rte_lpm_neon.h
index 6c131d312..6ef635b18 100644
--- a/lib/librte_lpm/rte_lpm_neon.h
+++ b/lib/librte_lpm/rte_lpm_neon.h
@@ -113,6 +113,108 @@  rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
 	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv;
 }
 
+static inline void
+rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
+	uint32_t defv[4])
+{
+	uint32x4_t i24;
+	rte_xmm_t i8;
+	uint32_t tbl[4];
+	uint64_t idx, pt, pt2;
+	const uint32_t *ptbl;
+
+	const uint32_t mask = UINT8_MAX;
+	const int32x4_t mask8 = vdupq_n_s32(mask);
+
+	/*
+	 * RTE_LPM_VALID_EXT_ENTRY_BITMASK for 2 LPM entries
+	 * as one 64-bit value (0x0300000003000000).
+	 */
+	const uint64_t mask_xv =
+		((uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK |
+		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32);
+
+	/*
+	 * RTE_LPM_LOOKUP_SUCCESS for 2 LPM entries
+	 * as one 64-bit value (0x0100000001000000).
+	 */
+	const uint64_t mask_v =
+		((uint64_t)RTE_LPM_LOOKUP_SUCCESS |
+		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32);
+
+	/* get 4 indexes for tbl24[]. */
+	i24 = vshrq_n_u32((uint32x4_t)ip, CHAR_BIT);
+
+	/* extract values from tbl24[] */
+	idx = vgetq_lane_u64((uint64x2_t)i24, 0);
+
+	ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx];
+	tbl[0] = *ptbl;
+	ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32];
+	tbl[1] = *ptbl;
+
+	idx = vgetq_lane_u64((uint64x2_t)i24, 1);
+
+	ptbl = (const uint32_t *)&lpm->tbl24[(uint32_t)idx];
+	tbl[2] = *ptbl;
+	ptbl = (const uint32_t *)&lpm->tbl24[idx >> 32];
+	tbl[3] = *ptbl;
+
+	/* get 4 indexes for tbl8[]. */
+	i8.x = vandq_s32(ip, mask8);
+
+	pt = (uint64_t)tbl[0] |
+		(uint64_t)tbl[1] << 32;
+	pt2 = (uint64_t)tbl[2] |
+		(uint64_t)tbl[3] << 32;
+
+	/* search successfully finished for all 4 IP addresses. */
+	if (likely((pt & mask_xv) == mask_v) &&
+			likely((pt2 & mask_xv) == mask_v)) {
+		*(uint64_t *)hop = pt & RTE_LPM_MASKX4_RES;
+		*(uint64_t *)(hop + 2) = pt2 & RTE_LPM_MASKX4_RES;
+		return;
+	}
+
+	if (unlikely((pt & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[0] = i8.u32[0] +
+			(uint8_t)tbl[0] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[0]];
+		tbl[0] = *ptbl;
+	}
+	if (unlikely((pt >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[1] = i8.u32[1] +
+			(uint8_t)tbl[1] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[1]];
+		tbl[1] = *ptbl;
+	}
+	if (unlikely((pt2 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[2] = i8.u32[2] +
+			(uint8_t)tbl[2] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[2]];
+		tbl[2] = *ptbl;
+	}
+	if (unlikely((pt2 >> 32 & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+			RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+		i8.u32[3] = i8.u32[3] +
+			(uint8_t)tbl[3] * RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+		ptbl = (const uint32_t *)&lpm->tbl8[i8.u32[3]];
+		tbl[3] = *ptbl;
+	}
+
+	hop[0] = (tbl[0] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[0] & 0x00FFFFFF :
+									defv[0];
+	hop[1] = (tbl[1] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[1] & 0x00FFFFFF :
+									defv[1];
+	hop[2] = (tbl[2] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[2] & 0x00FFFFFF :
+									defv[2];
+	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF :
+									defv[3];
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_lpm/rte_lpm_sse.h b/lib/librte_lpm/rte_lpm_sse.h
index 44770b6ff..6ef15816c 100644
--- a/lib/librte_lpm/rte_lpm_sse.h
+++ b/lib/librte_lpm/rte_lpm_sse.h
@@ -114,6 +114,110 @@  rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
 	hop[3] = (tbl[3] & RTE_LPM_LOOKUP_SUCCESS) ? tbl[3] & 0x00FFFFFF : defv;
 }
 
+/**
+ * Look up four IP addresses at once, each with its own default value.
+ *
+ * Follows the same steps as rte_lpm_lookupx4(). The difference is in the
+ * result of a failed lookup: address N gets defv[N] in hop[N] instead of
+ * one default shared by all four addresses.
+ *
+ * @param lpm
+ *   LPM object; its tbl24[] and, when needed, tbl8[] tables are read.
+ * @param ip
+ *   Four 32-bit IP addresses packed into one 128-bit value.
+ * @param hop
+ *   Output array: hop[N] gets the table entry found for address N, masked
+ *   to its result bits, or defv[N] if RTE_LPM_LOOKUP_SUCCESS is not set.
+ * @param defv
+ *   Default values; defv[N] is the one used for address N.
+ */
+static inline void
+rte_lpm_lookupx4_defx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
+	uint32_t defv[4])
+{
+	__m128i top24;
+	rte_xmm_t ofs8;
+	uint32_t tbl[4];
+	uint64_t lanes, pair_lo, pair_hi;
+	unsigned int n;
+
+	/* 0xff in each 32-bit lane. */
+	const __m128i low_byte =
+		_mm_set_epi32(UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX);
+
+	/*
+	 * Two LPM entries are tested as one 64-bit value, so each flag mask is
+	 * repeated in both 32-bit halves. Valid + ext: 0x0300000003000000.
+	 */
+	const uint64_t valid_ext_x2 =
+		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK << 32 |
+		(uint64_t)RTE_LPM_VALID_EXT_ENTRY_BITMASK;
+
+	/* Lookup success, likewise repeated: 0x0100000001000000. */
+	const uint64_t success_x2 =
+		(uint64_t)RTE_LPM_LOOKUP_SUCCESS << 32 |
+		(uint64_t)RTE_LPM_LOOKUP_SUCCESS;
+
+	/* tbl24[] is indexed by the upper 24 bits of each address. */
+	top24 = _mm_srli_epi32(ip, CHAR_BIT);
+
+	/* Lanes 0 and 1 come out together as the low 64 bits. */
+	lanes = _mm_cvtsi128_si64(top24);
+	tbl[0] = *(const uint32_t *)&lpm->tbl24[(uint32_t)lanes];
+	tbl[1] = *(const uint32_t *)&lpm->tbl24[lanes >> 32];
+
+	/*
+	 * Move lanes 2 and 3 down by 8 bytes. The count is written as a
+	 * literal because, with -O0, gcc 4.8 - 5.4 fails to fold
+	 * sizeof(uint64_t) into a constant.
+	 */
+	top24 = _mm_srli_si128(top24, 8);
+	lanes = _mm_cvtsi128_si64(top24);
+	tbl[2] = *(const uint32_t *)&lpm->tbl24[(uint32_t)lanes];
+	tbl[3] = *(const uint32_t *)&lpm->tbl24[lanes >> 32];
+
+	/* Low byte of each address, used as the offset into tbl8[] below. */
+	ofs8.x = _mm_and_si128(ip, low_byte);
+
+	/* Pack the four tbl24 entries into two 64-bit values. */
+	pair_lo = (uint64_t)tbl[1] << 32 | (uint64_t)tbl[0];
+	pair_hi = (uint64_t)tbl[3] << 32 | (uint64_t)tbl[2];
+
+	/*
+	 * Fast path: under the valid/ext mask every tbl24 entry shows only the
+	 * success bit, so no tbl8[] read and no default is needed. Each 64-bit
+	 * store fills two hop[] slots.
+	 */
+	if (likely((pair_lo & valid_ext_x2) == success_x2) &&
+			likely((pair_hi & valid_ext_x2) == success_x2)) {
+		*(uint64_t *)hop = pair_lo & RTE_LPM_MASKX4_RES;
+		*(uint64_t *)(hop + 2) = pair_hi & RTE_LPM_MASKX4_RES;
+		return;
+	}
+
+	/*
+	 * Slow path, step 1: an entry with all valid/ext mask bits set is
+	 * replaced by tbl8[] at index (entry low byte *
+	 * RTE_LPM_TBL8_GROUP_NUM_ENTRIES) + address low byte.
+	 */
+	for (n = 0; n < 4; n++) {
+		if (unlikely((tbl[n] & RTE_LPM_VALID_EXT_ENTRY_BITMASK) ==
+				RTE_LPM_VALID_EXT_ENTRY_BITMASK)) {
+			ofs8.u32[n] += (uint8_t)tbl[n] *
+				RTE_LPM_TBL8_GROUP_NUM_ENTRIES;
+			tbl[n] = *(const uint32_t *)&lpm->tbl8[ofs8.u32[n]];
+		}
+	}
+
+	/* Slow path, step 2: low 24 bits on success, else the default. */
+	for (n = 0; n < 4; n++) {
+		if (tbl[n] & RTE_LPM_LOOKUP_SUCCESS)
+			hop[n] = tbl[n] & 0x00FFFFFF;
+		else
+			hop[n] = defv[n];
+	}
+}
+
 #ifdef __cplusplus
 }
 #endif