[v3,4/8] fib: introduce AVX512 lookup

Message ID f986d829424a2c9e2e1e22981bd63db5a66c8f4c.1589890262.git.vladimir.medvedkin@intel.com (mailing list archive)
State Superseded, archived
Delegated to: David Marchand
Headers
Series fib: implement AVX512 vector lookup |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation fail Compilation issues

Commit Message

Vladimir Medvedkin May 19, 2020, 12:12 p.m. UTC
Add new lookup implementation for DIR24_8 algorithm using
AVX512 instruction set

Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
---
 lib/librte_fib/Makefile         |  14 ++++
 lib/librte_fib/dir24_8.c        |  24 ++++++
 lib/librte_fib/dir24_8_avx512.c | 165 ++++++++++++++++++++++++++++++++++++++++
 lib/librte_fib/dir24_8_avx512.h |  24 ++++++
 lib/librte_fib/meson.build      |  11 +++
 lib/librte_fib/rte_fib.h        |   3 +-
 6 files changed, 240 insertions(+), 1 deletion(-)
 create mode 100644 lib/librte_fib/dir24_8_avx512.c
 create mode 100644 lib/librte_fib/dir24_8_avx512.h
  

Comments

Ananyev, Konstantin June 24, 2020, 1:18 p.m. UTC | #1
> Add new lookup implementation for DIR24_8 algorithm using
> AVX512 instruction set
> 
> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
> ---
>  lib/librte_fib/Makefile         |  14 ++++
>  lib/librte_fib/dir24_8.c        |  24 ++++++
>  lib/librte_fib/dir24_8_avx512.c | 165 ++++++++++++++++++++++++++++++++++++++++
>  lib/librte_fib/dir24_8_avx512.h |  24 ++++++
>  lib/librte_fib/meson.build      |  11 +++
>  lib/librte_fib/rte_fib.h        |   3 +-
>  6 files changed, 240 insertions(+), 1 deletion(-)
>  create mode 100644 lib/librte_fib/dir24_8_avx512.c
>  create mode 100644 lib/librte_fib/dir24_8_avx512.h
> 
> diff --git a/lib/librte_fib/Makefile b/lib/librte_fib/Makefile
> index 1dd2a49..3958da1 100644
> --- a/lib/librte_fib/Makefile
> +++ b/lib/librte_fib/Makefile
> @@ -19,4 +19,18 @@ SRCS-$(CONFIG_RTE_LIBRTE_FIB) := rte_fib.c rte_fib6.c dir24_8.c trie.c
>  # install this header file
>  SYMLINK-$(CONFIG_RTE_LIBRTE_FIB)-include := rte_fib.h rte_fib6.h
> 
> +CC_AVX512F_SUPPORT=$(shell $(CC) -mavx512f -dM -E - </dev/null 2>&1 | \
> +grep -q __AVX512F__ && echo 1)
> +
> +CC_AVX512DQ_SUPPORT=$(shell $(CC) -mavx512dq -dM -E - </dev/null 2>&1 | \
> +grep -q __AVX512DQ__ && echo 1)
> +
> +ifeq ($(CC_AVX512F_SUPPORT), 1)
> +	ifeq ($(CC_AVX512DQ_SUPPORT), 1)
> +		SRCS-$(CONFIG_RTE_LIBRTE_FIB) += dir24_8_avx512.c
> +		CFLAGS_dir24_8_avx512.o += -mavx512f
> +		CFLAGS_dir24_8_avx512.o += -mavx512dq
> +		CFLAGS_dir24_8.o += -DCC_DIR24_8_AVX512_SUPPORT
> +	endif
> +endif
>  include $(RTE_SDK)/mk/rte.lib.mk
> diff --git a/lib/librte_fib/dir24_8.c b/lib/librte_fib/dir24_8.c
> index 9d74653..0a1c53f 100644
> --- a/lib/librte_fib/dir24_8.c
> +++ b/lib/librte_fib/dir24_8.c
> @@ -18,6 +18,12 @@
>  #include <rte_fib.h>
>  #include "dir24_8.h"
> 
> +#ifdef CC_DIR24_8_AVX512_SUPPORT
> +
> +#include "dir24_8_avx512.h"
> +
> +#endif /* CC_DIR24_8_AVX512_SUPPORT */
> +
>  #define DIR24_8_NAMESIZE	64
> 
>  #define ROUNDUP(x, y)	 RTE_ALIGN_CEIL(x, (1 << (32 - y)))
> @@ -62,6 +68,24 @@ dir24_8_get_lookup_fn(void *p, enum rte_fib_dir24_8_lookup_type type)
>  		}
>  	case RTE_FIB_DIR24_8_SCALAR_UNI:
>  		return dir24_8_lookup_bulk_uni;
> +#ifdef CC_DIR24_8_AVX512_SUPPORT
> +	case RTE_FIB_DIR24_8_VECTOR:
> +		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) <= 0)
> +			return NULL;
> +
> +		switch (nh_sz) {
> +		case RTE_FIB_DIR24_8_1B:
> +			return rte_dir24_8_vec_lookup_bulk_1b;
> +		case RTE_FIB_DIR24_8_2B:
> +			return rte_dir24_8_vec_lookup_bulk_2b;
> +		case RTE_FIB_DIR24_8_4B:
> +			return rte_dir24_8_vec_lookup_bulk_4b;
> +		case RTE_FIB_DIR24_8_8B:
> +			return rte_dir24_8_vec_lookup_bulk_8b;
> +		default:
> +			return NULL;
> +		}
> +#endif
>  	default:
>  		return NULL;
>  	}
> diff --git a/lib/librte_fib/dir24_8_avx512.c b/lib/librte_fib/dir24_8_avx512.c
> new file mode 100644
> index 0000000..43dba28
> --- /dev/null
> +++ b/lib/librte_fib/dir24_8_avx512.c
> @@ -0,0 +1,165 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2020 Intel Corporation
> + */
> +
> +#include <rte_vect.h>
> +#include <rte_fib.h>
> +
> +#include "dir24_8.h"
> +#include "dir24_8_avx512.h"
> +
> +static __rte_always_inline void
> +dir24_8_vec_lookup_x16(void *p, const uint32_t *ips,
> +	uint64_t *next_hops, int size)
> +{
> +	struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p;
> +	__mmask16 msk_ext;
> +	__mmask16 exp_msk = 0x5555;
> +	__m512i ip_vec, idxes, res, bytes;
> +	const __m512i zero = _mm512_set1_epi32(0);
> +	const __m512i lsb = _mm512_set1_epi32(1);
> +	const __m512i lsbyte_msk = _mm512_set1_epi32(0xff);
> +	__m512i tmp1, tmp2, res_msk;
> +	__m256i tmp256;
> +	/* used to mask gather values if size is 1/2 (8/16 bit next hops) */
> +	if (size == sizeof(uint8_t))
> +		res_msk = _mm512_set1_epi32(UINT8_MAX);
> +	else if (size == sizeof(uint16_t))
> +		res_msk = _mm512_set1_epi32(UINT16_MAX);
> +
> +	ip_vec = _mm512_loadu_si512(ips);
> +	/* mask 24 most significant bits */
> +	idxes = _mm512_srli_epi32(ip_vec, 8);
> +
> +	/**
> +	 * lookup in tbl24
> +	 * Put it inside branch to make compiler happy with -O0
> +	 */
> +	if (size == sizeof(uint8_t)) {
> +		res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 1);
> +		res = _mm512_and_epi32(res, res_msk);
> +	} else if (size == sizeof(uint16_t)) {
> +		res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 2);
> +		res = _mm512_and_epi32(res, res_msk);
> +	} else
> +		res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 4);
> +
> +	/* get extended entries indexes */
> +	msk_ext = _mm512_test_epi32_mask(res, lsb);
> +
> +	if (msk_ext != 0) {
> +		idxes = _mm512_srli_epi32(res, 1);
> +		idxes = _mm512_slli_epi32(idxes, 8);
> +		bytes = _mm512_and_epi32(ip_vec, lsbyte_msk);
> +		idxes = _mm512_maskz_add_epi32(msk_ext, idxes, bytes);
> +		if (size == sizeof(uint8_t)) {
> +			idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
> +				idxes, (const int *)dp->tbl8, 1);
> +			idxes = _mm512_and_epi32(idxes, res_msk);
> +		} else if (size == sizeof(uint16_t)) {
> +			idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
> +				idxes, (const int *)dp->tbl8, 2);
> +			idxes = _mm512_and_epi32(idxes, res_msk);
> +		} else
> +			idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
> +				idxes, (const int *)dp->tbl8, 4);
> +
> +		res = _mm512_mask_blend_epi32(msk_ext, res, idxes);
> +	}
> +
> +	res = _mm512_srli_epi32(res, 1);
> +	tmp1 = _mm512_maskz_expand_epi32(exp_msk, res);
> +	tmp256 = _mm512_extracti32x8_epi32(res, 1);
> +	tmp2 = _mm512_maskz_expand_epi32(exp_msk,
> +		_mm512_castsi256_si512(tmp256));
> +	_mm512_storeu_si512(next_hops, tmp1);
> +	_mm512_storeu_si512(next_hops + 8, tmp2);
> +}
> +
> +static __rte_always_inline void
> +dir24_8_vec_lookup_x8_8b(void *p, const uint32_t *ips,
> +	uint64_t *next_hops)
> +{
> +	struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p;
> +	const __m512i zero = _mm512_set1_epi32(0);
> +	const __m512i lsbyte_msk = _mm512_set1_epi64(0xff);
> +	const __m512i lsb = _mm512_set1_epi64(1);
> +	__m512i res, idxes, bytes;
> +	__m256i idxes_256, ip_vec;
> +	__mmask8 msk_ext;
> +
> +	ip_vec = _mm256_loadu_si256((const void *)ips);
> +	/* mask 24 most significant bits */
> +	idxes_256 = _mm256_srli_epi32(ip_vec, 8);
> +
> +	/* lookup in tbl24 */
> +	res = _mm512_i32gather_epi64(idxes_256, (const void *)dp->tbl24, 8);
> +
> +	/* get extended entries indexes */
> +	msk_ext = _mm512_test_epi64_mask(res, lsb);
> +
> +	if (msk_ext != 0) {
> +		bytes = _mm512_cvtepi32_epi64(ip_vec);
> +		idxes = _mm512_srli_epi64(res, 1);
> +		idxes = _mm512_slli_epi64(idxes, 8);
> +		bytes = _mm512_and_epi64(bytes, lsbyte_msk);
> +		idxes = _mm512_maskz_add_epi64(msk_ext, idxes, bytes);
> +		idxes = _mm512_mask_i64gather_epi64(zero, msk_ext, idxes,
> +			(const void *)dp->tbl8, 8);
> +
> +		res = _mm512_mask_blend_epi64(msk_ext, res, idxes);
> +	}
> +
> +	res = _mm512_srli_epi64(res, 1);
> +	_mm512_storeu_si512(next_hops, res);
> +}
> +
> +void
> +rte_dir24_8_vec_lookup_bulk_1b(void *p, const uint32_t *ips,
> +	uint64_t *next_hops, const unsigned int n)
> +{
> +	uint32_t i;
> +	for (i = 0; i < (n / 16); i++)
> +		dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16,
> +			sizeof(uint8_t));
> +

Just curious: if for reminder, instead of calling scalar lookup,
Introduce a masked version of avx512 lookup - would it be slower?

> +	dir24_8_lookup_bulk_1b(p, ips + i * 16, next_hops + i * 16,
> +		n - i * 16);
> +}
> +
> +void
> +rte_dir24_8_vec_lookup_bulk_2b(void *p, const uint32_t *ips,
> +	uint64_t *next_hops, const unsigned int n)
> +{
> +	uint32_t i;
> +	for (i = 0; i < (n / 16); i++)
> +		dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16,
> +			sizeof(uint16_t));
> +
> +	dir24_8_lookup_bulk_2b(p, ips + i * 16, next_hops + i * 16,
> +		n - i * 16);
> +}
> +
> +void
> +rte_dir24_8_vec_lookup_bulk_4b(void *p, const uint32_t *ips,
> +	uint64_t *next_hops, const unsigned int n)
> +{
> +	uint32_t i;
> +	for (i = 0; i < (n / 16); i++)
> +		dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16,
> +			sizeof(uint32_t));
> +
> +	dir24_8_lookup_bulk_4b(p, ips + i * 16, next_hops + i * 16,
> +		n - i * 16);
> +}
> +
> +void
> +rte_dir24_8_vec_lookup_bulk_8b(void *p, const uint32_t *ips,
> +	uint64_t *next_hops, const unsigned int n)
> +{
> +	uint32_t i;
> +	for (i = 0; i < (n / 8); i++)
> +		dir24_8_vec_lookup_x8_8b(p, ips + i * 8, next_hops + i * 8);
> +
> +	dir24_8_lookup_bulk_8b(p, ips + i * 8, next_hops + i * 8, n - i * 8);
> +}
> diff --git a/lib/librte_fib/dir24_8_avx512.h b/lib/librte_fib/dir24_8_avx512.h
> new file mode 100644
> index 0000000..1d3c2b9
> --- /dev/null
> +++ b/lib/librte_fib/dir24_8_avx512.h
> @@ -0,0 +1,24 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2020 Intel Corporation
> + */
> +
> +#ifndef _DIR248_AVX512_H_
> +#define _DIR248_AVX512_H_
> +
> +void
> +rte_dir24_8_vec_lookup_bulk_1b(void *p, const uint32_t *ips,
> +	uint64_t *next_hops, const unsigned int n);
> +
> +void
> +rte_dir24_8_vec_lookup_bulk_2b(void *p, const uint32_t *ips,
> +	uint64_t *next_hops, const unsigned int n);
> +
> +void
> +rte_dir24_8_vec_lookup_bulk_4b(void *p, const uint32_t *ips,
> +	uint64_t *next_hops, const unsigned int n);
> +
> +void
> +rte_dir24_8_vec_lookup_bulk_8b(void *p, const uint32_t *ips,
> +	uint64_t *next_hops, const unsigned int n);
> +
> +#endif /* _DIR248_AVX512_H_ */
> diff --git a/lib/librte_fib/meson.build b/lib/librte_fib/meson.build
> index 771828f..0963f3c 100644
> --- a/lib/librte_fib/meson.build
> +++ b/lib/librte_fib/meson.build
> @@ -5,3 +5,14 @@
>  sources = files('rte_fib.c', 'rte_fib6.c', 'dir24_8.c', 'trie.c')
>  headers = files('rte_fib.h', 'rte_fib6.h')
>  deps += ['rib']
> +
> +if dpdk_conf.has('RTE_ARCH_X86') and cc.has_argument('-mavx512f')
> +	if cc.has_argument('-mavx512dq')
> +		dir24_8_avx512_tmp = static_library('dir24_8_avx512_tmp',
> +			'dir24_8_avx512.c',
> +			dependencies: static_rte_eal,
> +			c_args: cflags + ['-mavx512f'] + ['-mavx512dq'])
> +		objs += dir24_8_avx512_tmp.extract_objects('dir24_8_avx512.c')
> +		cflags += '-DCC_DIR24_8_AVX512_SUPPORT'
> +	endif
> +endif
> diff --git a/lib/librte_fib/rte_fib.h b/lib/librte_fib/rte_fib.h
> index db35685..2919d13 100644
> --- a/lib/librte_fib/rte_fib.h
> +++ b/lib/librte_fib/rte_fib.h
> @@ -54,7 +54,8 @@ enum rte_fib_dir24_8_nh_sz {
>  enum rte_fib_dir24_8_lookup_type {
>  	RTE_FIB_DIR24_8_SCALAR_MACRO,
>  	RTE_FIB_DIR24_8_SCALAR_INLINE,
> -	RTE_FIB_DIR24_8_SCALAR_UNI
> +	RTE_FIB_DIR24_8_SCALAR_UNI,
> +	RTE_FIB_DIR24_8_VECTOR
>  };
> 
>  /** FIB configuration structure */
> --

Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>

> 2.7.4
  
Thomas Monjalon July 6, 2020, 7:21 p.m. UTC | #2
19/05/2020 14:12, Vladimir Medvedkin:
> --- a/lib/librte_fib/meson.build
> +++ b/lib/librte_fib/meson.build
> +if dpdk_conf.has('RTE_ARCH_X86') and cc.has_argument('-mavx512f')
> +	if cc.has_argument('-mavx512dq')
> +		dir24_8_avx512_tmp = static_library('dir24_8_avx512_tmp',
> +			'dir24_8_avx512.c',
> +			dependencies: static_rte_eal,
> +			c_args: cflags + ['-mavx512f'] + ['-mavx512dq'])
> +		objs += dir24_8_avx512_tmp.extract_objects('dir24_8_avx512.c')
> +		cflags += '-DCC_DIR24_8_AVX512_SUPPORT'
> +	endif
> +endif

I don't want to try understanding what this hack is.
But please add comments around it, so we will understand why
compilation fails:

In file included from ../../dpdk/lib/librte_fib/dir24_8_avx512.c:5:
../../dpdk/lib/librte_eal/x86/include/rte_vect.h:97:18: error: expected declaration specifiers or ‘...’ before ‘(’ token
   97 | #define ZMM_SIZE (sizeof(__x86_zmm_t))
      |                  ^
  
Bruce Richardson July 7, 2020, 9:44 a.m. UTC | #3
On Tue, May 19, 2020 at 01:12:59PM +0100, Vladimir Medvedkin wrote:
> Add new lookup implementation for DIR24_8 algorithm using
> AVX512 instruction set
> 
> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
> ---
>  lib/librte_fib/Makefile         |  14 ++++
>  lib/librte_fib/dir24_8.c        |  24 ++++++
>  lib/librte_fib/dir24_8_avx512.c | 165 ++++++++++++++++++++++++++++++++++++++++
>  lib/librte_fib/dir24_8_avx512.h |  24 ++++++
>  lib/librte_fib/meson.build      |  11 +++
>  lib/librte_fib/rte_fib.h        |   3 +-
>  6 files changed, 240 insertions(+), 1 deletion(-)
>  create mode 100644 lib/librte_fib/dir24_8_avx512.c
>  create mode 100644 lib/librte_fib/dir24_8_avx512.h
>
<snip> 
> diff --git a/lib/librte_fib/meson.build b/lib/librte_fib/meson.build
> index 771828f..0963f3c 100644
> --- a/lib/librte_fib/meson.build
> +++ b/lib/librte_fib/meson.build
> @@ -5,3 +5,14 @@
>  sources = files('rte_fib.c', 'rte_fib6.c', 'dir24_8.c', 'trie.c')
>  headers = files('rte_fib.h', 'rte_fib6.h')
>  deps += ['rib']
> +
> +if dpdk_conf.has('RTE_ARCH_X86') and cc.has_argument('-mavx512f')
> +	if cc.has_argument('-mavx512dq')
> +		dir24_8_avx512_tmp = static_library('dir24_8_avx512_tmp',
> +			'dir24_8_avx512.c',
> +			dependencies: static_rte_eal,
> +			c_args: cflags + ['-mavx512f'] + ['-mavx512dq'])
> +		objs += dir24_8_avx512_tmp.extract_objects('dir24_8_avx512.c')
> +		cflags += '-DCC_DIR24_8_AVX512_SUPPORT'
> +	endif
> +endif

This block looks wrong to me, especially comparing it with the equivalent
block in drivers/net/i40e. Firstly, the two if conditions are unnecessary
and can be merged. However, secondly, I think you should restructure it so
that you first check for AVX-512 already being enabled in the build, and
only if it is not do you need to see about checking compiler support and
using the static lib workaround to get just the one file compiled with
AVX-512. As Thomas suggested, a comment explaining this would also help -
again copying what is in the i40e/meson.build file would probably be a good
start.

/Bruce
  
Vladimir Medvedkin July 8, 2020, 7:57 p.m. UTC | #4
On 24/06/2020 14:18, Ananyev, Konstantin wrote:
> 
>> Add new lookup implementation for DIR24_8 algorithm using
>> AVX512 instruction set
>>
>> Signed-off-by: Vladimir Medvedkin <vladimir.medvedkin@intel.com>
>> ---
>>   lib/librte_fib/Makefile         |  14 ++++
>>   lib/librte_fib/dir24_8.c        |  24 ++++++
>>   lib/librte_fib/dir24_8_avx512.c | 165 ++++++++++++++++++++++++++++++++++++++++
>>   lib/librte_fib/dir24_8_avx512.h |  24 ++++++
>>   lib/librte_fib/meson.build      |  11 +++
>>   lib/librte_fib/rte_fib.h        |   3 +-
>>   6 files changed, 240 insertions(+), 1 deletion(-)
>>   create mode 100644 lib/librte_fib/dir24_8_avx512.c
>>   create mode 100644 lib/librte_fib/dir24_8_avx512.h
>>
>> diff --git a/lib/librte_fib/Makefile b/lib/librte_fib/Makefile
>> index 1dd2a49..3958da1 100644
>> --- a/lib/librte_fib/Makefile
>> +++ b/lib/librte_fib/Makefile
>> @@ -19,4 +19,18 @@ SRCS-$(CONFIG_RTE_LIBRTE_FIB) := rte_fib.c rte_fib6.c dir24_8.c trie.c
>>   # install this header file
>>   SYMLINK-$(CONFIG_RTE_LIBRTE_FIB)-include := rte_fib.h rte_fib6.h
>>
>> +CC_AVX512F_SUPPORT=$(shell $(CC) -mavx512f -dM -E - </dev/null 2>&1 | \
>> +grep -q __AVX512F__ && echo 1)
>> +
>> +CC_AVX512DQ_SUPPORT=$(shell $(CC) -mavx512dq -dM -E - </dev/null 2>&1 | \
>> +grep -q __AVX512DQ__ && echo 1)
>> +
>> +ifeq ($(CC_AVX512F_SUPPORT), 1)
>> +ifeq ($(CC_AVX512DQ_SUPPORT), 1)
>> +SRCS-$(CONFIG_RTE_LIBRTE_FIB) += dir24_8_avx512.c
>> +CFLAGS_dir24_8_avx512.o += -mavx512f
>> +CFLAGS_dir24_8_avx512.o += -mavx512dq
>> +CFLAGS_dir24_8.o += -DCC_DIR24_8_AVX512_SUPPORT
>> +endif
>> +endif
>>   include $(RTE_SDK)/mk/rte.lib.mk
>> diff --git a/lib/librte_fib/dir24_8.c b/lib/librte_fib/dir24_8.c
>> index 9d74653..0a1c53f 100644
>> --- a/lib/librte_fib/dir24_8.c
>> +++ b/lib/librte_fib/dir24_8.c
>> @@ -18,6 +18,12 @@
>>   #include <rte_fib.h>
>>   #include "dir24_8.h"
>>
>> +#ifdef CC_DIR24_8_AVX512_SUPPORT
>> +
>> +#include "dir24_8_avx512.h"
>> +
>> +#endif /* CC_DIR24_8_AVX512_SUPPORT */
>> +
>>   #define DIR24_8_NAMESIZE64
>>
>>   #define ROUNDUP(x, y) RTE_ALIGN_CEIL(x, (1 << (32 - y)))
>> @@ -62,6 +68,24 @@ dir24_8_get_lookup_fn(void *p, enum rte_fib_dir24_8_lookup_type type)
>>   }
>>   case RTE_FIB_DIR24_8_SCALAR_UNI:
>>   return dir24_8_lookup_bulk_uni;
>> +#ifdef CC_DIR24_8_AVX512_SUPPORT
>> +case RTE_FIB_DIR24_8_VECTOR:
>> +if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) <= 0)
>> +return NULL;
>> +
>> +switch (nh_sz) {
>> +case RTE_FIB_DIR24_8_1B:
>> +return rte_dir24_8_vec_lookup_bulk_1b;
>> +case RTE_FIB_DIR24_8_2B:
>> +return rte_dir24_8_vec_lookup_bulk_2b;
>> +case RTE_FIB_DIR24_8_4B:
>> +return rte_dir24_8_vec_lookup_bulk_4b;
>> +case RTE_FIB_DIR24_8_8B:
>> +return rte_dir24_8_vec_lookup_bulk_8b;
>> +default:
>> +return NULL;
>> +}
>> +#endif
>>   default:
>>   return NULL;
>>   }
>> diff --git a/lib/librte_fib/dir24_8_avx512.c b/lib/librte_fib/dir24_8_avx512.c
>> new file mode 100644
>> index 0000000..43dba28
>> --- /dev/null
>> +++ b/lib/librte_fib/dir24_8_avx512.c
>> @@ -0,0 +1,165 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2020 Intel Corporation
>> + */
>> +
>> +#include <rte_vect.h>
>> +#include <rte_fib.h>
>> +
>> +#include "dir24_8.h"
>> +#include "dir24_8_avx512.h"
>> +
>> +static __rte_always_inline void
>> +dir24_8_vec_lookup_x16(void *p, const uint32_t *ips,
>> +uint64_t *next_hops, int size)
>> +{
>> +struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p;
>> +__mmask16 msk_ext;
>> +__mmask16 exp_msk = 0x5555;
>> +__m512i ip_vec, idxes, res, bytes;
>> +const __m512i zero = _mm512_set1_epi32(0);
>> +const __m512i lsb = _mm512_set1_epi32(1);
>> +const __m512i lsbyte_msk = _mm512_set1_epi32(0xff);
>> +__m512i tmp1, tmp2, res_msk;
>> +__m256i tmp256;
>> +/* used to mask gather values if size is 1/2 (8/16 bit next hops) */
>> +if (size == sizeof(uint8_t))
>> +res_msk = _mm512_set1_epi32(UINT8_MAX);
>> +else if (size == sizeof(uint16_t))
>> +res_msk = _mm512_set1_epi32(UINT16_MAX);
>> +
>> +ip_vec = _mm512_loadu_si512(ips);
>> +/* mask 24 most significant bits */
>> +idxes = _mm512_srli_epi32(ip_vec, 8);
>> +
>> +/**
>> + * lookup in tbl24
>> + * Put it inside branch to make compiler happy with -O0
>> + */
>> +if (size == sizeof(uint8_t)) {
>> +res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 1);
>> +res = _mm512_and_epi32(res, res_msk);
>> +} else if (size == sizeof(uint16_t)) {
>> +res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 2);
>> +res = _mm512_and_epi32(res, res_msk);
>> +} else
>> +res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 4);
>> +
>> +/* get extended entries indexes */
>> +msk_ext = _mm512_test_epi32_mask(res, lsb);
>> +
>> +if (msk_ext != 0) {
>> +idxes = _mm512_srli_epi32(res, 1);
>> +idxes = _mm512_slli_epi32(idxes, 8);
>> +bytes = _mm512_and_epi32(ip_vec, lsbyte_msk);
>> +idxes = _mm512_maskz_add_epi32(msk_ext, idxes, bytes);
>> +if (size == sizeof(uint8_t)) {
>> +idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
>> +idxes, (const int *)dp->tbl8, 1);
>> +idxes = _mm512_and_epi32(idxes, res_msk);
>> +} else if (size == sizeof(uint16_t)) {
>> +idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
>> +idxes, (const int *)dp->tbl8, 2);
>> +idxes = _mm512_and_epi32(idxes, res_msk);
>> +} else
>> +idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
>> +idxes, (const int *)dp->tbl8, 4);
>> +
>> +res = _mm512_mask_blend_epi32(msk_ext, res, idxes);
>> +}
>> +
>> +res = _mm512_srli_epi32(res, 1);
>> +tmp1 = _mm512_maskz_expand_epi32(exp_msk, res);
>> +tmp256 = _mm512_extracti32x8_epi32(res, 1);
>> +tmp2 = _mm512_maskz_expand_epi32(exp_msk,
>> +_mm512_castsi256_si512(tmp256));
>> +_mm512_storeu_si512(next_hops, tmp1);
>> +_mm512_storeu_si512(next_hops + 8, tmp2);
>> +}
>> +
>> +static __rte_always_inline void
>> +dir24_8_vec_lookup_x8_8b(void *p, const uint32_t *ips,
>> +uint64_t *next_hops)
>> +{
>> +struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p;
>> +const __m512i zero = _mm512_set1_epi32(0);
>> +const __m512i lsbyte_msk = _mm512_set1_epi64(0xff);
>> +const __m512i lsb = _mm512_set1_epi64(1);
>> +__m512i res, idxes, bytes;
>> +__m256i idxes_256, ip_vec;
>> +__mmask8 msk_ext;
>> +
>> +ip_vec = _mm256_loadu_si256((const void *)ips);
>> +/* mask 24 most significant bits */
>> +idxes_256 = _mm256_srli_epi32(ip_vec, 8);
>> +
>> +/* lookup in tbl24 */
>> +res = _mm512_i32gather_epi64(idxes_256, (const void *)dp->tbl24, 8);
>> +
>> +/* get extended entries indexes */
>> +msk_ext = _mm512_test_epi64_mask(res, lsb);
>> +
>> +if (msk_ext != 0) {
>> +bytes = _mm512_cvtepi32_epi64(ip_vec);
>> +idxes = _mm512_srli_epi64(res, 1);
>> +idxes = _mm512_slli_epi64(idxes, 8);
>> +bytes = _mm512_and_epi64(bytes, lsbyte_msk);
>> +idxes = _mm512_maskz_add_epi64(msk_ext, idxes, bytes);
>> +idxes = _mm512_mask_i64gather_epi64(zero, msk_ext, idxes,
>> +(const void *)dp->tbl8, 8);
>> +
>> +res = _mm512_mask_blend_epi64(msk_ext, res, idxes);
>> +}
>> +
>> +res = _mm512_srli_epi64(res, 1);
>> +_mm512_storeu_si512(next_hops, res);
>> +}
>> +
>> +void
>> +rte_dir24_8_vec_lookup_bulk_1b(void *p, const uint32_t *ips,
>> +uint64_t *next_hops, const unsigned int n)
>> +{
>> +uint32_t i;
>> +for (i = 0; i < (n / 16); i++)
>> +dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16,
>> +sizeof(uint8_t));
>> +
> 
> Just curious: if for reminder, instead of calling scalar lookup,
> Introduce a masked version of avx512 lookup - would it be slower?

As was discussed offline, I tried, and it is slower than using scalar 
lookup for reminder.

> 
>> +dir24_8_lookup_bulk_1b(p, ips + i * 16, next_hops + i * 16,
>> +n - i * 16);
>> +}
>> +
>> +void
>> +rte_dir24_8_vec_lookup_bulk_2b(void *p, const uint32_t *ips,
>> +uint64_t *next_hops, const unsigned int n)
>> +{
>> +uint32_t i;
>> +for (i = 0; i < (n / 16); i++)
>> +dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16,
>> +sizeof(uint16_t));
>> +
>> +dir24_8_lookup_bulk_2b(p, ips + i * 16, next_hops + i * 16,
>> +n - i * 16);
>> +}
>> +
>> +void
>> +rte_dir24_8_vec_lookup_bulk_4b(void *p, const uint32_t *ips,
>> +uint64_t *next_hops, const unsigned int n)
>> +{
>> +uint32_t i;
>> +for (i = 0; i < (n / 16); i++)
>> +dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16,
>> +sizeof(uint32_t));
>> +
>> +dir24_8_lookup_bulk_4b(p, ips + i * 16, next_hops + i * 16,
>> +n - i * 16);
>> +}
>> +
>> +void
>> +rte_dir24_8_vec_lookup_bulk_8b(void *p, const uint32_t *ips,
>> +uint64_t *next_hops, const unsigned int n)
>> +{
>> +uint32_t i;
>> +for (i = 0; i < (n / 8); i++)
>> +dir24_8_vec_lookup_x8_8b(p, ips + i * 8, next_hops + i * 8);
>> +
>> +dir24_8_lookup_bulk_8b(p, ips + i * 8, next_hops + i * 8, n - i * 8);
>> +}
>> diff --git a/lib/librte_fib/dir24_8_avx512.h b/lib/librte_fib/dir24_8_avx512.h
>> new file mode 100644
>> index 0000000..1d3c2b9
>> --- /dev/null
>> +++ b/lib/librte_fib/dir24_8_avx512.h
>> @@ -0,0 +1,24 @@
>> +/* SPDX-License-Identifier: BSD-3-Clause
>> + * Copyright(c) 2020 Intel Corporation
>> + */
>> +
>> +#ifndef _DIR248_AVX512_H_
>> +#define _DIR248_AVX512_H_
>> +
>> +void
>> +rte_dir24_8_vec_lookup_bulk_1b(void *p, const uint32_t *ips,
>> +uint64_t *next_hops, const unsigned int n);
>> +
>> +void
>> +rte_dir24_8_vec_lookup_bulk_2b(void *p, const uint32_t *ips,
>> +uint64_t *next_hops, const unsigned int n);
>> +
>> +void
>> +rte_dir24_8_vec_lookup_bulk_4b(void *p, const uint32_t *ips,
>> +uint64_t *next_hops, const unsigned int n);
>> +
>> +void
>> +rte_dir24_8_vec_lookup_bulk_8b(void *p, const uint32_t *ips,
>> +uint64_t *next_hops, const unsigned int n);
>> +
>> +#endif /* _DIR248_AVX512_H_ */
>> diff --git a/lib/librte_fib/meson.build b/lib/librte_fib/meson.build
>> index 771828f..0963f3c 100644
>> --- a/lib/librte_fib/meson.build
>> +++ b/lib/librte_fib/meson.build
>> @@ -5,3 +5,14 @@
>>   sources = files('rte_fib.c', 'rte_fib6.c', 'dir24_8.c', 'trie.c')
>>   headers = files('rte_fib.h', 'rte_fib6.h')
>>   deps += ['rib']
>> +
>> +if dpdk_conf.has('RTE_ARCH_X86') and cc.has_argument('-mavx512f')
>> +if cc.has_argument('-mavx512dq')
>> +dir24_8_avx512_tmp = static_library('dir24_8_avx512_tmp',
>> +'dir24_8_avx512.c',
>> +dependencies: static_rte_eal,
>> +c_args: cflags + ['-mavx512f'] + ['-mavx512dq'])
>> +objs += dir24_8_avx512_tmp.extract_objects('dir24_8_avx512.c')
>> +cflags += '-DCC_DIR24_8_AVX512_SUPPORT'
>> +endif
>> +endif
>> diff --git a/lib/librte_fib/rte_fib.h b/lib/librte_fib/rte_fib.h
>> index db35685..2919d13 100644
>> --- a/lib/librte_fib/rte_fib.h
>> +++ b/lib/librte_fib/rte_fib.h
>> @@ -54,7 +54,8 @@ enum rte_fib_dir24_8_nh_sz {
>>   enum rte_fib_dir24_8_lookup_type {
>>   RTE_FIB_DIR24_8_SCALAR_MACRO,
>>   RTE_FIB_DIR24_8_SCALAR_INLINE,
>> -RTE_FIB_DIR24_8_SCALAR_UNI
>> +RTE_FIB_DIR24_8_SCALAR_UNI,
>> +RTE_FIB_DIR24_8_VECTOR
>>   };
>>
>>   /** FIB configuration structure */
>> --
> 
> Acked-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> 
>> 2.7.4
>
  
Vladimir Medvedkin July 8, 2020, 8:19 p.m. UTC | #5
Hi Thomas,

On 06/07/2020 20:21, Thomas Monjalon wrote:
> 19/05/2020 14:12, Vladimir Medvedkin:
>> --- a/lib/librte_fib/meson.build
>> +++ b/lib/librte_fib/meson.build
>> +if dpdk_conf.has('RTE_ARCH_X86') and cc.has_argument('-mavx512f')
>> +	if cc.has_argument('-mavx512dq')
>> +		dir24_8_avx512_tmp = static_library('dir24_8_avx512_tmp',
>> +			'dir24_8_avx512.c',
>> +			dependencies: static_rte_eal,
>> +			c_args: cflags + ['-mavx512f'] + ['-mavx512dq'])
>> +		objs += dir24_8_avx512_tmp.extract_objects('dir24_8_avx512.c')
>> +		cflags += '-DCC_DIR24_8_AVX512_SUPPORT'
>> +	endif
>> +endif
> 
> I don't want to try understanding what this hack is.
> But please add comments around it, so we will understand why
> compilation fails:
> 
> In file included from ../../dpdk/lib/librte_fib/dir24_8_avx512.c:5:
> ../../dpdk/lib/librte_eal/x86/include/rte_vect.h:97:18: error: expected declaration specifiers or ‘...’ before ‘(’ token
>     97 | #define ZMM_SIZE (sizeof(__x86_zmm_t))
>        |                  ^
> 
> 

I sent v4 with slightly reworked meson.build, please check compilation.

>
  

Patch

diff --git a/lib/librte_fib/Makefile b/lib/librte_fib/Makefile
index 1dd2a49..3958da1 100644
--- a/lib/librte_fib/Makefile
+++ b/lib/librte_fib/Makefile
@@ -19,4 +19,18 @@  SRCS-$(CONFIG_RTE_LIBRTE_FIB) := rte_fib.c rte_fib6.c dir24_8.c trie.c
 # install this header file
 SYMLINK-$(CONFIG_RTE_LIBRTE_FIB)-include := rte_fib.h rte_fib6.h
 
+CC_AVX512F_SUPPORT=$(shell $(CC) -mavx512f -dM -E - </dev/null 2>&1 | \
+grep -q __AVX512F__ && echo 1)
+
+CC_AVX512DQ_SUPPORT=$(shell $(CC) -mavx512dq -dM -E - </dev/null 2>&1 | \
+grep -q __AVX512DQ__ && echo 1)
+
+ifeq ($(CC_AVX512F_SUPPORT), 1)
+	ifeq ($(CC_AVX512DQ_SUPPORT), 1)
+		SRCS-$(CONFIG_RTE_LIBRTE_FIB) += dir24_8_avx512.c
+		CFLAGS_dir24_8_avx512.o += -mavx512f
+		CFLAGS_dir24_8_avx512.o += -mavx512dq
+		CFLAGS_dir24_8.o += -DCC_DIR24_8_AVX512_SUPPORT
+	endif
+endif
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_fib/dir24_8.c b/lib/librte_fib/dir24_8.c
index 9d74653..0a1c53f 100644
--- a/lib/librte_fib/dir24_8.c
+++ b/lib/librte_fib/dir24_8.c
@@ -18,6 +18,12 @@ 
 #include <rte_fib.h>
 #include "dir24_8.h"
 
+#ifdef CC_DIR24_8_AVX512_SUPPORT
+
+#include "dir24_8_avx512.h"
+
+#endif /* CC_DIR24_8_AVX512_SUPPORT */
+
 #define DIR24_8_NAMESIZE	64
 
 #define ROUNDUP(x, y)	 RTE_ALIGN_CEIL(x, (1 << (32 - y)))
@@ -62,6 +68,24 @@  dir24_8_get_lookup_fn(void *p, enum rte_fib_dir24_8_lookup_type type)
 		}
 	case RTE_FIB_DIR24_8_SCALAR_UNI:
 		return dir24_8_lookup_bulk_uni;
+#ifdef CC_DIR24_8_AVX512_SUPPORT
+	case RTE_FIB_DIR24_8_VECTOR:
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) <= 0)
+			return NULL;
+
+		switch (nh_sz) {
+		case RTE_FIB_DIR24_8_1B:
+			return rte_dir24_8_vec_lookup_bulk_1b;
+		case RTE_FIB_DIR24_8_2B:
+			return rte_dir24_8_vec_lookup_bulk_2b;
+		case RTE_FIB_DIR24_8_4B:
+			return rte_dir24_8_vec_lookup_bulk_4b;
+		case RTE_FIB_DIR24_8_8B:
+			return rte_dir24_8_vec_lookup_bulk_8b;
+		default:
+			return NULL;
+		}
+#endif
 	default:
 		return NULL;
 	}
diff --git a/lib/librte_fib/dir24_8_avx512.c b/lib/librte_fib/dir24_8_avx512.c
new file mode 100644
index 0000000..43dba28
--- /dev/null
+++ b/lib/librte_fib/dir24_8_avx512.c
@@ -0,0 +1,165 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#include <rte_vect.h>
+#include <rte_fib.h>
+
+#include "dir24_8.h"
+#include "dir24_8_avx512.h"
+
+static __rte_always_inline void
+dir24_8_vec_lookup_x16(void *p, const uint32_t *ips,
+	uint64_t *next_hops, int size)
+{
+	struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p;
+	__mmask16 msk_ext;
+	__mmask16 exp_msk = 0x5555;
+	__m512i ip_vec, idxes, res, bytes;
+	const __m512i zero = _mm512_set1_epi32(0);
+	const __m512i lsb = _mm512_set1_epi32(1);
+	const __m512i lsbyte_msk = _mm512_set1_epi32(0xff);
+	__m512i tmp1, tmp2, res_msk;
+	__m256i tmp256;
+	/* used to mask gather values if size is 1/2 (8/16 bit next hops) */
+	if (size == sizeof(uint8_t))
+		res_msk = _mm512_set1_epi32(UINT8_MAX);
+	else if (size == sizeof(uint16_t))
+		res_msk = _mm512_set1_epi32(UINT16_MAX);
+
+	ip_vec = _mm512_loadu_si512(ips);
+	/* mask 24 most significant bits */
+	idxes = _mm512_srli_epi32(ip_vec, 8);
+
+	/**
+	 * lookup in tbl24
+	 * Put it inside branch to make compiler happy with -O0
+	 */
+	if (size == sizeof(uint8_t)) {
+		res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 1);
+		res = _mm512_and_epi32(res, res_msk);
+	} else if (size == sizeof(uint16_t)) {
+		res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 2);
+		res = _mm512_and_epi32(res, res_msk);
+	} else
+		res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 4);
+
+	/* get extended entries indexes */
+	msk_ext = _mm512_test_epi32_mask(res, lsb);
+
+	if (msk_ext != 0) {
+		idxes = _mm512_srli_epi32(res, 1);
+		idxes = _mm512_slli_epi32(idxes, 8);
+		bytes = _mm512_and_epi32(ip_vec, lsbyte_msk);
+		idxes = _mm512_maskz_add_epi32(msk_ext, idxes, bytes);
+		if (size == sizeof(uint8_t)) {
+			idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
+				idxes, (const int *)dp->tbl8, 1);
+			idxes = _mm512_and_epi32(idxes, res_msk);
+		} else if (size == sizeof(uint16_t)) {
+			idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
+				idxes, (const int *)dp->tbl8, 2);
+			idxes = _mm512_and_epi32(idxes, res_msk);
+		} else
+			idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
+				idxes, (const int *)dp->tbl8, 4);
+
+		res = _mm512_mask_blend_epi32(msk_ext, res, idxes);
+	}
+
+	res = _mm512_srli_epi32(res, 1);
+	tmp1 = _mm512_maskz_expand_epi32(exp_msk, res);
+	tmp256 = _mm512_extracti32x8_epi32(res, 1);
+	tmp2 = _mm512_maskz_expand_epi32(exp_msk,
+		_mm512_castsi256_si512(tmp256));
+	_mm512_storeu_si512(next_hops, tmp1);
+	_mm512_storeu_si512(next_hops + 8, tmp2);
+}
+
+static __rte_always_inline void
+dir24_8_vec_lookup_x8_8b(void *p, const uint32_t *ips,
+	uint64_t *next_hops)
+{
+	struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p;
+	const __m512i zero = _mm512_set1_epi32(0);
+	const __m512i lsbyte_msk = _mm512_set1_epi64(0xff);
+	const __m512i lsb = _mm512_set1_epi64(1);
+	__m512i res, idxes, bytes;
+	__m256i idxes_256, ip_vec;
+	__mmask8 msk_ext;
+
+	ip_vec = _mm256_loadu_si256((const void *)ips);
+	/* mask 24 most significant bits */
+	idxes_256 = _mm256_srli_epi32(ip_vec, 8);
+
+	/* lookup in tbl24 */
+	res = _mm512_i32gather_epi64(idxes_256, (const void *)dp->tbl24, 8);
+
+	/* get extended entries indexes */
+	msk_ext = _mm512_test_epi64_mask(res, lsb);
+
+	if (msk_ext != 0) {
+		bytes = _mm512_cvtepi32_epi64(ip_vec);
+		idxes = _mm512_srli_epi64(res, 1);
+		idxes = _mm512_slli_epi64(idxes, 8);
+		bytes = _mm512_and_epi64(bytes, lsbyte_msk);
+		idxes = _mm512_maskz_add_epi64(msk_ext, idxes, bytes);
+		idxes = _mm512_mask_i64gather_epi64(zero, msk_ext, idxes,
+			(const void *)dp->tbl8, 8);
+
+		res = _mm512_mask_blend_epi64(msk_ext, res, idxes);
+	}
+
+	res = _mm512_srli_epi64(res, 1);
+	_mm512_storeu_si512(next_hops, res);
+}
+
+void
+rte_dir24_8_vec_lookup_bulk_1b(void *p, const uint32_t *ips,
+	uint64_t *next_hops, const unsigned int n)
+{
+	uint32_t i;
+	for (i = 0; i < (n / 16); i++)
+		dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16,
+			sizeof(uint8_t));
+
+	dir24_8_lookup_bulk_1b(p, ips + i * 16, next_hops + i * 16,
+		n - i * 16);
+}
+
+void
+rte_dir24_8_vec_lookup_bulk_2b(void *p, const uint32_t *ips,
+	uint64_t *next_hops, const unsigned int n)
+{
+	uint32_t i;
+	for (i = 0; i < (n / 16); i++)
+		dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16,
+			sizeof(uint16_t));
+
+	dir24_8_lookup_bulk_2b(p, ips + i * 16, next_hops + i * 16,
+		n - i * 16);
+}
+
+void
+rte_dir24_8_vec_lookup_bulk_4b(void *p, const uint32_t *ips,
+	uint64_t *next_hops, const unsigned int n)
+{
+	uint32_t i;
+	for (i = 0; i < (n / 16); i++)
+		dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16,
+			sizeof(uint32_t));
+
+	dir24_8_lookup_bulk_4b(p, ips + i * 16, next_hops + i * 16,
+		n - i * 16);
+}
+
+void
+rte_dir24_8_vec_lookup_bulk_8b(void *p, const uint32_t *ips,
+	uint64_t *next_hops, const unsigned int n)
+{
+	uint32_t i;
+	for (i = 0; i < (n / 8); i++)
+		dir24_8_vec_lookup_x8_8b(p, ips + i * 8, next_hops + i * 8);
+
+	dir24_8_lookup_bulk_8b(p, ips + i * 8, next_hops + i * 8, n - i * 8);
+}
diff --git a/lib/librte_fib/dir24_8_avx512.h b/lib/librte_fib/dir24_8_avx512.h
new file mode 100644
index 0000000..1d3c2b9
--- /dev/null
+++ b/lib/librte_fib/dir24_8_avx512.h
@@ -0,0 +1,24 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#ifndef _DIR248_AVX512_H_
+#define _DIR248_AVX512_H_
+
+void
+rte_dir24_8_vec_lookup_bulk_1b(void *p, const uint32_t *ips,
+	uint64_t *next_hops, const unsigned int n);
+
+void
+rte_dir24_8_vec_lookup_bulk_2b(void *p, const uint32_t *ips,
+	uint64_t *next_hops, const unsigned int n);
+
+void
+rte_dir24_8_vec_lookup_bulk_4b(void *p, const uint32_t *ips,
+	uint64_t *next_hops, const unsigned int n);
+
+void
+rte_dir24_8_vec_lookup_bulk_8b(void *p, const uint32_t *ips,
+	uint64_t *next_hops, const unsigned int n);
+
+#endif /* _DIR248_AVX512_H_ */
diff --git a/lib/librte_fib/meson.build b/lib/librte_fib/meson.build
index 771828f..0963f3c 100644
--- a/lib/librte_fib/meson.build
+++ b/lib/librte_fib/meson.build
@@ -5,3 +5,14 @@ 
 sources = files('rte_fib.c', 'rte_fib6.c', 'dir24_8.c', 'trie.c')
 headers = files('rte_fib.h', 'rte_fib6.h')
 deps += ['rib']
+
+if dpdk_conf.has('RTE_ARCH_X86') and cc.has_argument('-mavx512f')
+	if cc.has_argument('-mavx512dq')
+		dir24_8_avx512_tmp = static_library('dir24_8_avx512_tmp',
+			'dir24_8_avx512.c',
+			dependencies: static_rte_eal,
+			c_args: cflags + ['-mavx512f'] + ['-mavx512dq'])
+		objs += dir24_8_avx512_tmp.extract_objects('dir24_8_avx512.c')
+		cflags += '-DCC_DIR24_8_AVX512_SUPPORT'
+	endif
+endif
diff --git a/lib/librte_fib/rte_fib.h b/lib/librte_fib/rte_fib.h
index db35685..2919d13 100644
--- a/lib/librte_fib/rte_fib.h
+++ b/lib/librte_fib/rte_fib.h
@@ -54,7 +54,8 @@  enum rte_fib_dir24_8_nh_sz {
 enum rte_fib_dir24_8_lookup_type {
 	RTE_FIB_DIR24_8_SCALAR_MACRO,
 	RTE_FIB_DIR24_8_SCALAR_INLINE,
-	RTE_FIB_DIR24_8_SCALAR_UNI
+	RTE_FIB_DIR24_8_SCALAR_UNI,
+	RTE_FIB_DIR24_8_VECTOR
 };
 
 /** FIB configuration structure */