[v2,2/3] ip_frag: improve reassembly lookup performance

Message ID 20230523143921.7420-2-pbhagavatula@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series [v2,1/3] ip_frag: optimize key compare and hash generation |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Pavan Nikhilesh Bhagavatula May 23, 2023, 2:39 p.m. UTC
  From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Improve reassembly lookup performance by using NEON intrinsics for
key validation.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 lib/ip_frag/ip_frag_internal.c   | 224 +++++++++++++++++++++++++------
 lib/ip_frag/ip_reassembly.h      |   6 +
 lib/ip_frag/rte_ip_frag_common.c |  10 ++
 3 files changed, 196 insertions(+), 44 deletions(-)
  

Comments

Honnappa Nagarahalli May 23, 2023, 4:22 p.m. UTC | #1
> -----Original Message-----
> From: pbhagavatula@marvell.com <pbhagavatula@marvell.com>
> Sent: Tuesday, May 23, 2023 9:39 AM
> To: jerinj@marvell.com; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>; Konstantin Ananyev
> <konstantin.v.ananyev@yandex.ru>
> Cc: dev@dpdk.org; Pavan Nikhilesh <pbhagavatula@marvell.com>
> Subject: [PATCH v2 2/3] ip_frag: improve reassembly lookup performance
> 
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> 
> Improve reassembly lookup performance by using NEON intrinsics for key
> validation.
What is the improvement do you see with this?

> 
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
>  lib/ip_frag/ip_frag_internal.c   | 224 +++++++++++++++++++++++++------
>  lib/ip_frag/ip_reassembly.h      |   6 +
>  lib/ip_frag/rte_ip_frag_common.c |  10 ++
>  3 files changed, 196 insertions(+), 44 deletions(-)
> 
> diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c index
> 7cbef647df..de78a0ed8f 100644
> --- a/lib/ip_frag/ip_frag_internal.c
> +++ b/lib/ip_frag/ip_frag_internal.c
> @@ -4,8 +4,9 @@
> 
>  #include <stddef.h>
> 
> -#include <rte_jhash.h>
>  #include <rte_hash_crc.h>
> +#include <rte_jhash.h>
> +#include <rte_vect.h>
> 
>  #include "ip_frag_common.h"
> 
> @@ -280,10 +281,166 @@ ip_frag_find(struct rte_ip_frag_tbl *tbl, struct
> rte_ip_frag_death_row *dr,
>  	return pkt;
>  }
> 
> -struct ip_frag_pkt *
> -ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
> -	const struct ip_frag_key *key, uint64_t tms,
> -	struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
> +static inline void
> +ip_frag_dbg(struct rte_ip_frag_tbl *tbl, struct ip_frag_pkt *p,
> +	    uint32_t list_idx, uint32_t list_cnt) {
> +	RTE_SET_USED(tbl);
> +	RTE_SET_USED(list_idx);
> +	RTE_SET_USED(list_cnt);
> +	if (p->key.key_len == IPV4_KEYLEN)
> +		IP_FRAG_LOG(DEBUG,
> +			    "%s:%d:\n"
> +			    "tbl: %p, max_entries: %u, use_entries: %u\n"
> +			    "ipv4_frag_pkt line0: %p, index: %u from %u\n"
> +			    "key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
> +			    __func__, __LINE__, tbl, tbl->max_entries,
> +			    tbl->use_entries, p, list_idx, list_cnt,
> +			    p->key.src_dst[0], p->key.id, p->start);
> +	else
> +		IP_FRAG_LOG(DEBUG,
> +			    "%s:%d:\n"
> +			    "tbl: %p, max_entries: %u, use_entries: %u\n"
> +			    "ipv6_frag_pkt line0: %p, index: %u from %u\n"
> +			    "key: <" IPv6_KEY_BYTES_FMT
> +			    ", %#x>, start: %" PRIu64 "\n",
> +			    __func__, __LINE__, tbl, tbl->max_entries,
> +			    tbl->use_entries, p, list_idx, list_cnt,
> +			    IPv6_KEY_BYTES(p1[i].key.src_dst), p->key.id,
> +			    p->start);
> +}
> +
> +#if defined(RTE_ARCH_ARM64)
> +static inline struct ip_frag_pkt *
> +ip_frag_lookup_neon(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key
> *key, uint64_t tms,
> +		    struct ip_frag_pkt **free, struct ip_frag_pkt **stale) {
> +	struct ip_frag_pkt *empty, *old;
> +	struct ip_frag_pkt *p1, *p2;
> +	uint32_t assoc, sig1, sig2;
> +	uint64_t max_cycles;
> +
> +	empty = NULL;
> +	old = NULL;
> +
> +	max_cycles = tbl->max_cycles;
> +	assoc = tbl->bucket_entries;
> +
> +	if (tbl->last != NULL && ip_frag_key_cmp(key, &tbl->last->key) == 0)
> +		return tbl->last;
> +
> +	/* different hashing methods for IPv4 and IPv6 */
> +	if (key->key_len == IPV4_KEYLEN)
> +		ipv4_frag_hash(key, &sig1, &sig2);
> +	else
> +		ipv6_frag_hash(key, &sig1, &sig2);
> +
> +	p1 = IP_FRAG_TBL_POS(tbl, sig1);
> +	p2 = IP_FRAG_TBL_POS(tbl, sig2);
> +
> +	uint64x2_t key0, key1, key2, key3;
> +	uint64_t vmask, zmask, ts_mask;
> +	uint64x2_t ts0, ts1;
> +	uint32x4_t nz_key;
> +	uint8_t idx;
> +	/* Bucket entries are always power of 2. */
> +	rte_prefetch0(&p1[0].key);
> +	rte_prefetch0(&p1[1].key);
> +	rte_prefetch0(&p2[0].key);
> +	rte_prefetch0(&p2[1].key);
> +
> +	while (assoc > 1) {
> +		if (assoc > 2) {
> +			rte_prefetch0(&p1[2].key);
> +			rte_prefetch0(&p1[3].key);
> +			rte_prefetch0(&p2[2].key);
> +			rte_prefetch0(&p2[3].key);
> +		}
> +		struct ip_frag_pkt *p[] = {&p1[0], &p2[0], &p1[1], &p2[1]};
> +		key0 = vld1q_u64(&p[0]->key.id_key_len);
> +		key1 = vld1q_u64(&p[1]->key.id_key_len);
> +		key2 = vld1q_u64(&p[2]->key.id_key_len);
> +		key3 = vld1q_u64(&p[3]->key.id_key_len);
> +
> +		nz_key =
> vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key0), 1), nz_key, 0);
> +		nz_key =
> vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key1), 1), nz_key, 1);
> +		nz_key =
> vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key2), 1), nz_key, 2);
> +		nz_key =
> vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key3),
> +1), nz_key, 3);
> +
> +		nz_key = vceqzq_u32(nz_key);
> +		zmask =
> vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(nz_key, 16)), 0);
> +		vmask = ~zmask;
> +
> +		vmask &= 0x8000800080008000;
> +		for (; vmask > 0; vmask &= vmask - 1) {
> +			idx = __builtin_ctzll(vmask) >> 4;
> +			if (ip_frag_key_cmp(key, &p[idx]->key) == 0)
> +				return p[idx];
> +		}
> +
> +		vmask = ~zmask;
> +		if (zmask && empty == NULL) {
> +			zmask &= 0x8000800080008000;
> +			idx = __builtin_ctzll(zmask) >> 4;
> +			empty = p[idx];
> +		}
> +
> +		if (vmask && old == NULL) {
> +			const uint64x2_t max_cyc =
> vdupq_n_u64(max_cycles);
> +			const uint64x2_t cur_cyc = vdupq_n_u64(tms);
> +
> +			ts0 = vsetq_lane_u64(vgetq_lane_u64(key0, 1), ts0,
> 0);
> +			ts0 = vsetq_lane_u64(vgetq_lane_u64(key1, 1), ts0,
> 1);
> +			ts1 = vsetq_lane_u64(vgetq_lane_u64(key2, 1), ts1,
> 0);
> +			ts1 = vsetq_lane_u64(vgetq_lane_u64(key3, 1), ts1,
> 1);
> +
> +			ts0 = vcgtq_u64(cur_cyc, vaddq_u64(ts0, max_cyc));
> +			ts1 = vcgtq_u64(cur_cyc, vaddq_u64(ts1, max_cyc));
> +
> +			ts_mask =
> vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(
> +
> 	vuzp1q_u32(vreinterpretq_u32_u64(ts0),
> +
> vreinterpretq_u32_u64(ts1)),
> +							16)),
> +						0);
> +			vmask &= 0x8000800080008000;
> +			ts_mask &= vmask;
> +			if (ts_mask) {
> +				idx = __builtin_ctzll(ts_mask) >> 4;
> +				old = p[idx];
> +			}
> +		}
> +		p1 += 2;
> +		p2 += 2;
> +		assoc -= 4;
> +	}
> +	while (assoc) {
> +		if (ip_frag_key_cmp(key, &p1->key) == 0)
> +			return p1;
> +		else if (ip_frag_key_is_empty(&p1->key))
> +			empty = (empty == NULL) ? p1 : empty;
> +		else if (max_cycles + p1->start < tms)
> +			old = (old == NULL) ? p1 : old;
> +
> +		if (ip_frag_key_cmp(key, &p2->key) == 0)
> +			return p2;
> +		else if (ip_frag_key_is_empty(&p2->key))
> +			empty = (empty == NULL) ? p2 : empty;
> +		else if (max_cycles + p2->start < tms)
> +			old = (old == NULL) ? p2 : old;
> +		p1++;
> +		p2++;
> +		assoc--;
> +	}
> +
> +	*free = empty;
> +	*stale = old;
> +	return NULL;
> +}
> +#endif
> +
> +static struct ip_frag_pkt *
> +ip_frag_lookup_scalar(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key
> *key, uint64_t tms,
> +		      struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
>  {
>  	struct ip_frag_pkt *p1, *p2;
>  	struct ip_frag_pkt *empty, *old;
> @@ -309,25 +466,7 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
>  	p2 = IP_FRAG_TBL_POS(tbl, sig2);
> 
>  	for (i = 0; i != assoc; i++) {
> -		if (p1->key.key_len == IPV4_KEYLEN)
> -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> -					"tbl: %p, max_entries: %u,
> use_entries: %u\n"
> -					"ipv4_frag_pkt line0: %p, index: %u
> from %u\n"
> -			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
> -					__func__, __LINE__,
> -					tbl, tbl->max_entries, tbl->use_entries,
> -					p1, i, assoc,
> -			p1[i].key.src_dst[0], p1[i].key.id, p1[i].start);
> -		else
> -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> -					"tbl: %p, max_entries: %u,
> use_entries: %u\n"
> -					"ipv6_frag_pkt line0: %p, index: %u
> from %u\n"
> -			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %"
> PRIu64 "\n",
> -					__func__, __LINE__,
> -					tbl, tbl->max_entries, tbl->use_entries,
> -					p1, i, assoc,
> -			IPv6_KEY_BYTES(p1[i].key.src_dst), p1[i].key.id,
> p1[i].start);
> -
> +		ip_frag_dbg(tbl, &p1[i], i, assoc);
>  		if (ip_frag_key_cmp(key, &p1[i].key) == 0)
>  			return p1 + i;
>  		else if (ip_frag_key_is_empty(&p1[i].key))
> @@ -335,29 +474,11 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
>  		else if (max_cycles + p1[i].start < tms)
>  			old = (old == NULL) ? (p1 + i) : old;
> 
> -		if (p2->key.key_len == IPV4_KEYLEN)
> -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> -					"tbl: %p, max_entries: %u,
> use_entries: %u\n"
> -					"ipv4_frag_pkt line1: %p, index: %u
> from %u\n"
> -			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
> -					__func__, __LINE__,
> -					tbl, tbl->max_entries, tbl->use_entries,
> -					p2, i, assoc,
> -			p2[i].key.src_dst[0], p2[i].key.id, p2[i].start);
> -		else
> -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> -					"tbl: %p, max_entries: %u,
> use_entries: %u\n"
> -					"ipv6_frag_pkt line1: %p, index: %u
> from %u\n"
> -			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %"
> PRIu64 "\n",
> -					__func__, __LINE__,
> -					tbl, tbl->max_entries, tbl->use_entries,
> -					p2, i, assoc,
> -			IPv6_KEY_BYTES(p2[i].key.src_dst), p2[i].key.id,
> p2[i].start);
> -
> +		ip_frag_dbg(tbl, &p2[i], i, assoc);
>  		if (ip_frag_key_cmp(key, &p2[i].key) == 0)
>  			return p2 + i;
>  		else if (ip_frag_key_is_empty(&p2[i].key))
> -			empty = (empty == NULL) ?( p2 + i) : empty;
> +			empty = (empty == NULL) ? (p2 + i) : empty;
>  		else if (max_cycles + p2[i].start < tms)
>  			old = (old == NULL) ? (p2 + i) : old;
>  	}
> @@ -366,3 +487,18 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
>  	*stale = old;
>  	return NULL;
>  }
> +
> +struct ip_frag_pkt *
> +ip_frag_lookup(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key,
> uint64_t tms,
> +	       struct ip_frag_pkt **free, struct ip_frag_pkt **stale) {
> +	switch (tbl->lookup_fn) {
> +#if defined(RTE_ARCH_ARM64)
> +	case REASSEMBLY_LOOKUP_NEON:
> +		return ip_frag_lookup_neon(tbl, key, tms, free, stale); #endif
> +	case REASSEMBLY_LOOKUP_SCALAR:
> +	default:
> +		return ip_frag_lookup_scalar(tbl, key, tms, free, stale);
> +	}
> +}
> diff --git a/lib/ip_frag/ip_reassembly.h b/lib/ip_frag/ip_reassembly.h index
> ef9d8c0d75..049437ae32 100644
> --- a/lib/ip_frag/ip_reassembly.h
> +++ b/lib/ip_frag/ip_reassembly.h
> @@ -12,6 +12,11 @@
> 
>  #include <rte_ip_frag.h>
> 
> +enum ip_frag_lookup_func {
> +	REASSEMBLY_LOOKUP_SCALAR = 0,
> +	REASSEMBLY_LOOKUP_NEON,
> +};
> +
>  enum {
>  	IP_LAST_FRAG_IDX,    /* index of last fragment */
>  	IP_FIRST_FRAG_IDX,   /* index of first fragment */
> @@ -83,6 +88,7 @@ struct rte_ip_frag_tbl {
>  	struct ip_frag_pkt *last;     /* last used entry. */
>  	struct ip_pkt_list lru;       /* LRU list for table entries. */
>  	struct ip_frag_tbl_stat stat; /* statistics counters. */
> +	enum ip_frag_lookup_func lookup_fn;	/* hash table lookup function.
> */
>  	__extension__ struct ip_frag_pkt pkt[]; /* hash table. */  };
> 
> diff --git a/lib/ip_frag/rte_ip_frag_common.c
> b/lib/ip_frag/rte_ip_frag_common.c
> index c1de2e81b6..ef3c104e45 100644
> --- a/lib/ip_frag/rte_ip_frag_common.c
> +++ b/lib/ip_frag/rte_ip_frag_common.c
> @@ -5,7 +5,9 @@
>  #include <stddef.h>
>  #include <stdio.h>
> 
> +#include <rte_cpuflags.h>
>  #include <rte_log.h>
> +#include <rte_vect.h>
> 
>  #include "ip_frag_common.h"
> 
> @@ -75,6 +77,14 @@ rte_ip_frag_table_create(uint32_t bucket_num,
> uint32_t bucket_entries,
>  	tbl->bucket_entries = bucket_entries;
>  	tbl->entry_mask = (tbl->nb_entries - 1) & ~(tbl->bucket_entries  - 1);
> 
> +#if defined(RTE_ARCH_ARM64)
> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) &&
> +	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
> +		tbl->lookup_fn = REASSEMBLY_LOOKUP_NEON;
> +	else
> +#endif
> +		tbl->lookup_fn = REASSEMBLY_LOOKUP_SCALAR;
> +
>  	TAILQ_INIT(&(tbl->lru));
>  	return tbl;
>  }
> --
> 2.25.1
  
Pavan Nikhilesh Bhagavatula May 23, 2023, 5:58 p.m. UTC | #2
> > -----Original Message-----
> > From: pbhagavatula@marvell.com <pbhagavatula@marvell.com>
> > Sent: Tuesday, May 23, 2023 9:39 AM
> > To: jerinj@marvell.com; Honnappa Nagarahalli
> > <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>; Konstantin
> Ananyev
> > <konstantin.v.ananyev@yandex.ru>
> > Cc: dev@dpdk.org; Pavan Nikhilesh <pbhagavatula@marvell.com>
> > Subject: [PATCH v2 2/3] ip_frag: improve reassembly lookup performance
> >
> > From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> >
> > Improve reassembly lookup performance by using NEON intrinsics for key
> > validation.
> What is the improvement do you see with this?

On Neoverse-N2 I see around improvement of 300-600c per flow and ~200c per insert.

Here are some test results.

Without patch:
+==========================================================================================================+
| IPV4                            | Flow Count : 32768                                                     |
+================+================+=============+=============+========================+===================+
| Fragment Order | Fragments/Flow | Outstanding | Cycles/Flow | Cycles/Fragment insert | Cycles/Reassembly |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 0           | 1244        | 919                    | 114               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 2              | 0           | 1653        | 968                    | 128               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 3              | 0           | 1379        | 503                    | 110               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 3              | 0           | 1613        | 520                    | 139               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 0           | 2030        | 199                    | 190               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 8              | 0           | 4393        | 309                    | 402               |
+================+================+=============+=============+========================+===================+
| LINEAR         | RANDOM         | 0           | 1531        | 333                    | 147               |
+================+================+=============+=============+========================+===================+
| RANDOM         | RANDOM         | 0           | 2771        | 357                    | 213               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 100         | 1228        | 920                    | 102               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 500         | 1197        | 905                    | 103               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 1000        | 1183        | 904                    | 104               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 2000        | 1153        | 921                    | 105               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 3000        | 1123        | 911                    | 111               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 100         | 829         | 193                    | 690               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 500         | 830         | 195                    | 682               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 1000        | 817         | 211                    | 690               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 2000        | 819         | 195                    | 690               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 3000        | 823         | 223                    | 676               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 2              | 0           | 1765        | 1038                   | 177               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 3              | 0           | 2588        | 699                    | 190               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 8              | 0           | 5253        | 265                    | 403               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | RANDOM         | 0           | 3398        | 493                    | 301               |
+================+================+=============+=============+========================+===================+

+==========================================================================================================+
| IPV6                            | Flow Count : 32768                                                     |
+================+================+=============+=============+========================+===================+
| Fragment Order | Fragments/Flow | Outstanding | Cycles/Flow | Cycles/Fragment insert | Cycles/Reassembly |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 0           | 1838        | 1176                   | 136               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 2              | 0           | 1892        | 1188                   | 160               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 3              | 0           | 1986        | 628                    | 143               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 3              | 0           | 2670        | 646                    | 155               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 0           | 3152        | 261                    | 271               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 8              | 0           | 5127        | 324                    | 434               |
+================+================+=============+=============+========================+===================+
| LINEAR         | RANDOM         | 0           | 2169        | 427                    | 203               |
+================+================+=============+=============+========================+===================+
| RANDOM         | RANDOM         | 0           | 3382        | 452                    | 255               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 100         | 1837        | 1164                   | 124               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 500         | 1790        | 1158                   | 126               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 1000        | 1807        | 1161                   | 138               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 2000        | 1776        | 1160                   | 138               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 3000        | 1715        | 1169                   | 144               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 100         | 1488        | 256                    | 1228              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 500         | 1461        | 300                    | 1205              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 1000        | 1457        | 303                    | 1202              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 2000        | 1456        | 305                    | 1201              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 3000        | 1460        | 308                    | 1205              |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 2              | 0           | 2145        | 1330                   | 296               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 3              | 0           | 2778        | 830                    | 330               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 8              | 0           | 5715        | 324                    | 444               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | RANDOM         | 0           | 3625        | 550                    | 363               |
+================+================+=============+=============+========================+===================+

With patch :

+==========================================================================================================+
| IPV4                            | Flow Count : 32768                                                     |
+================+================+=============+=============+========================+===================+
| Fragment Order | Fragments/Flow | Outstanding | Cycles/Flow | Cycles/Fragment insert | Cycles/Reassembly |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 0           | 950         | 717                    | 98                |
+================+================+=============+=============+========================+===================+
| RANDOM         | 2              | 0           | 1013        | 706                    | 108               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 3              | 0           | 1096        | 397                    | 115               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 3              | 0           | 1150        | 412                    | 128               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 0           | 1783        | 166                    | 202               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 8              | 0           | 3933        | 284                    | 424               |
+================+================+=============+=============+========================+===================+
| LINEAR         | RANDOM         | 0           | 1288        | 267                    | 159               |
+================+================+=============+=============+========================+===================+
| RANDOM         | RANDOM         | 0           | 2393        | 302                    | 235               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 100         | 956         | 703                    | 110               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 500         | 937         | 693                    | 112               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 1000        | 912         | 670                    | 121               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 2000        | 908         | 688                    | 122               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 3000        | 894         | 688                    | 128               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 100         | 1019        | 179                    | 865               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 500         | 1052        | 176                    | 895               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 1000        | 1130        | 180                    | 1003              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 2000        | 1143        | 180                    | 1020              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 3000        | 1130        | 181                    | 985               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 2              | 0           | 1582        | 710                    | 168               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 3              | 0           | 2162        | 446                    | 194               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 8              | 0           | 4997        | 214                    | 426               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | RANDOM         | 0           | 2921        | 341                    | 311               |
+================+================+=============+=============+========================+===================+

+==========================================================================================================+
| IPV6                            | Flow Count : 32768                                                     |
+================+================+=============+=============+========================+===================+
| Fragment Order | Fragments/Flow | Outstanding | Cycles/Flow | Cycles/Fragment insert | Cycles/Reassembly |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 0           | 1275        | 687                    | 125               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 2              | 0           | 1335        | 721                    | 169               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 3              | 0           | 1388        | 415                    | 169               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 3              | 0           | 2117        | 393                    | 163               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 0           | 2811        | 172                    | 241               |
+================+================+=============+=============+========================+===================+
| RANDOM         | 8              | 0           | 4322        | 227                    | 401               |
+================+================+=============+=============+========================+===================+
| LINEAR         | RANDOM         | 0           | 1730        | 270                    | 192               |
+================+================+=============+=============+========================+===================+
| RANDOM         | RANDOM         | 0           | 2839        | 317                    | 264               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 100         | 1152        | 662                    | 126               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 500         | 1107        | 658                    | 130               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 1000        | 1190        | 647                    | 138               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 2000        | 1086        | 635                    | 141               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 2              | 3000        | 1064        | 645                    | 150               |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 100         | 1560        | 172                    | 1296              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 500         | 1536        | 226                    | 1274              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 1000        | 1543        | 228                    | 1282              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 2000        | 1548        | 228                    | 1287              |
+================+================+=============+=============+========================+===================+
| LINEAR         | 8              | 3000        | 1541        | 227                    | 1280              |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 2              | 0           | 1585        | 769                    | 281               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 3              | 0           | 2222        | 536                    | 327               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | 8              | 0           | 4962        | 232                    | 439               |
+================+================+=============+=============+========================+===================+
| INTERLEAVED    | RANDOM         | 0           | 2998        | 373                    | 360               |
+================+================+=============+=============+========================+===================+

> 
> >
> > Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> > ---
> >  lib/ip_frag/ip_frag_internal.c   | 224 +++++++++++++++++++++++++------
> >  lib/ip_frag/ip_reassembly.h      |   6 +
> >  lib/ip_frag/rte_ip_frag_common.c |  10 ++
> >  3 files changed, 196 insertions(+), 44 deletions(-)
> >
> > diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c
> index
> > 7cbef647df..de78a0ed8f 100644
> > --- a/lib/ip_frag/ip_frag_internal.c
> > +++ b/lib/ip_frag/ip_frag_internal.c
> > @@ -4,8 +4,9 @@
> >
> >  #include <stddef.h>
> >
> > -#include <rte_jhash.h>
> >  #include <rte_hash_crc.h>
> > +#include <rte_jhash.h>
> > +#include <rte_vect.h>
> >
> >  #include "ip_frag_common.h"
> >
> > @@ -280,10 +281,166 @@ ip_frag_find(struct rte_ip_frag_tbl *tbl, struct
> > rte_ip_frag_death_row *dr,
> >  	return pkt;
> >  }
> >
> > -struct ip_frag_pkt *
> > -ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
> > -	const struct ip_frag_key *key, uint64_t tms,
> > -	struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
> > +static inline void
> > +ip_frag_dbg(struct rte_ip_frag_tbl *tbl, struct ip_frag_pkt *p,
> > +	    uint32_t list_idx, uint32_t list_cnt) {
> > +	RTE_SET_USED(tbl);
> > +	RTE_SET_USED(list_idx);
> > +	RTE_SET_USED(list_cnt);
> > +	if (p->key.key_len == IPV4_KEYLEN)
> > +		IP_FRAG_LOG(DEBUG,
> > +			    "%s:%d:\n"
> > +			    "tbl: %p, max_entries: %u, use_entries: %u\n"
> > +			    "ipv4_frag_pkt line0: %p, index: %u from %u\n"
> > +			    "key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
> > +			    __func__, __LINE__, tbl, tbl->max_entries,
> > +			    tbl->use_entries, p, list_idx, list_cnt,
> > +			    p->key.src_dst[0], p->key.id, p->start);
> > +	else
> > +		IP_FRAG_LOG(DEBUG,
> > +			    "%s:%d:\n"
> > +			    "tbl: %p, max_entries: %u, use_entries: %u\n"
> > +			    "ipv6_frag_pkt line0: %p, index: %u from %u\n"
> > +			    "key: <" IPv6_KEY_BYTES_FMT
> > +			    ", %#x>, start: %" PRIu64 "\n",
> > +			    __func__, __LINE__, tbl, tbl->max_entries,
> > +			    tbl->use_entries, p, list_idx, list_cnt,
> > +			    IPv6_KEY_BYTES(p1[i].key.src_dst), p->key.id,
> > +			    p->start);
> > +}
> > +
> > +#if defined(RTE_ARCH_ARM64)
> > +static inline struct ip_frag_pkt *
> > +ip_frag_lookup_neon(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key
> > *key, uint64_t tms,
> > +		    struct ip_frag_pkt **free, struct ip_frag_pkt **stale) {
> > +	struct ip_frag_pkt *empty, *old;
> > +	struct ip_frag_pkt *p1, *p2;
> > +	uint32_t assoc, sig1, sig2;
> > +	uint64_t max_cycles;
> > +
> > +	empty = NULL;
> > +	old = NULL;
> > +
> > +	max_cycles = tbl->max_cycles;
> > +	assoc = tbl->bucket_entries;
> > +
> > +	if (tbl->last != NULL && ip_frag_key_cmp(key, &tbl->last->key) == 0)
> > +		return tbl->last;
> > +
> > +	/* different hashing methods for IPv4 and IPv6 */
> > +	if (key->key_len == IPV4_KEYLEN)
> > +		ipv4_frag_hash(key, &sig1, &sig2);
> > +	else
> > +		ipv6_frag_hash(key, &sig1, &sig2);
> > +
> > +	p1 = IP_FRAG_TBL_POS(tbl, sig1);
> > +	p2 = IP_FRAG_TBL_POS(tbl, sig2);
> > +
> > +	uint64x2_t key0, key1, key2, key3;
> > +	uint64_t vmask, zmask, ts_mask;
> > +	uint64x2_t ts0, ts1;
> > +	uint32x4_t nz_key;
> > +	uint8_t idx;
> > +	/* Bucket entries are always power of 2. */
> > +	rte_prefetch0(&p1[0].key);
> > +	rte_prefetch0(&p1[1].key);
> > +	rte_prefetch0(&p2[0].key);
> > +	rte_prefetch0(&p2[1].key);
> > +
> > +	while (assoc > 1) {
> > +		if (assoc > 2) {
> > +			rte_prefetch0(&p1[2].key);
> > +			rte_prefetch0(&p1[3].key);
> > +			rte_prefetch0(&p2[2].key);
> > +			rte_prefetch0(&p2[3].key);
> > +		}
> > +		struct ip_frag_pkt *p[] = {&p1[0], &p2[0], &p1[1], &p2[1]};
> > +		key0 = vld1q_u64(&p[0]->key.id_key_len);
> > +		key1 = vld1q_u64(&p[1]->key.id_key_len);
> > +		key2 = vld1q_u64(&p[2]->key.id_key_len);
> > +		key3 = vld1q_u64(&p[3]->key.id_key_len);
> > +
> > +		nz_key =
> > vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key0), 1),
> nz_key, 0);
> > +		nz_key =
> > vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key1), 1),
> nz_key, 1);
> > +		nz_key =
> > vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key2), 1),
> nz_key, 2);
> > +		nz_key =
> > vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key3),
> > +1), nz_key, 3);
> > +
> > +		nz_key = vceqzq_u32(nz_key);
> > +		zmask =
> > vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(nz_key, 16)), 0);
> > +		vmask = ~zmask;
> > +
> > +		vmask &= 0x8000800080008000;
> > +		for (; vmask > 0; vmask &= vmask - 1) {
> > +			idx = __builtin_ctzll(vmask) >> 4;
> > +			if (ip_frag_key_cmp(key, &p[idx]->key) == 0)
> > +				return p[idx];
> > +		}
> > +
> > +		vmask = ~zmask;
> > +		if (zmask && empty == NULL) {
> > +			zmask &= 0x8000800080008000;
> > +			idx = __builtin_ctzll(zmask) >> 4;
> > +			empty = p[idx];
> > +		}
> > +
> > +		if (vmask && old == NULL) {
> > +			const uint64x2_t max_cyc =
> > vdupq_n_u64(max_cycles);
> > +			const uint64x2_t cur_cyc = vdupq_n_u64(tms);
> > +
> > +			ts0 = vsetq_lane_u64(vgetq_lane_u64(key0, 1), ts0,
> > 0);
> > +			ts0 = vsetq_lane_u64(vgetq_lane_u64(key1, 1), ts0,
> > 1);
> > +			ts1 = vsetq_lane_u64(vgetq_lane_u64(key2, 1), ts1,
> > 0);
> > +			ts1 = vsetq_lane_u64(vgetq_lane_u64(key3, 1), ts1,
> > 1);
> > +
> > +			ts0 = vcgtq_u64(cur_cyc, vaddq_u64(ts0, max_cyc));
> > +			ts1 = vcgtq_u64(cur_cyc, vaddq_u64(ts1, max_cyc));
> > +
> > +			ts_mask =
> > vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(
> > +
> > 	vuzp1q_u32(vreinterpretq_u32_u64(ts0),
> > +
> > vreinterpretq_u32_u64(ts1)),
> > +							16)),
> > +						0);
> > +			vmask &= 0x8000800080008000;
> > +			ts_mask &= vmask;
> > +			if (ts_mask) {
> > +				idx = __builtin_ctzll(ts_mask) >> 4;
> > +				old = p[idx];
> > +			}
> > +		}
> > +		p1 += 2;
> > +		p2 += 2;
> > +		assoc -= 4;
> > +	}
> > +	while (assoc) {
> > +		if (ip_frag_key_cmp(key, &p1->key) == 0)
> > +			return p1;
> > +		else if (ip_frag_key_is_empty(&p1->key))
> > +			empty = (empty == NULL) ? p1 : empty;
> > +		else if (max_cycles + p1->start < tms)
> > +			old = (old == NULL) ? p1 : old;
> > +
> > +		if (ip_frag_key_cmp(key, &p2->key) == 0)
> > +			return p2;
> > +		else if (ip_frag_key_is_empty(&p2->key))
> > +			empty = (empty == NULL) ? p2 : empty;
> > +		else if (max_cycles + p2->start < tms)
> > +			old = (old == NULL) ? p2 : old;
> > +		p1++;
> > +		p2++;
> > +		assoc--;
> > +	}
> > +
> > +	*free = empty;
> > +	*stale = old;
> > +	return NULL;
> > +}
> > +#endif
> > +
> > +static struct ip_frag_pkt *
> > +ip_frag_lookup_scalar(struct rte_ip_frag_tbl *tbl, const struct
> ip_frag_key
> > *key, uint64_t tms,
> > +		      struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
> >  {
> >  	struct ip_frag_pkt *p1, *p2;
> >  	struct ip_frag_pkt *empty, *old;
> > @@ -309,25 +466,7 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
> >  	p2 = IP_FRAG_TBL_POS(tbl, sig2);
> >
> >  	for (i = 0; i != assoc; i++) {
> > -		if (p1->key.key_len == IPV4_KEYLEN)
> > -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> > -					"tbl: %p, max_entries: %u,
> > use_entries: %u\n"
> > -					"ipv4_frag_pkt line0: %p, index: %u
> > from %u\n"
> > -			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
> > -					__func__, __LINE__,
> > -					tbl, tbl->max_entries, tbl-
> >use_entries,
> > -					p1, i, assoc,
> > -			p1[i].key.src_dst[0], p1[i].key.id, p1[i].start);
> > -		else
> > -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> > -					"tbl: %p, max_entries: %u,
> > use_entries: %u\n"
> > -					"ipv6_frag_pkt line0: %p, index: %u
> > from %u\n"
> > -			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %"
> > PRIu64 "\n",
> > -					__func__, __LINE__,
> > -					tbl, tbl->max_entries, tbl-
> >use_entries,
> > -					p1, i, assoc,
> > -			IPv6_KEY_BYTES(p1[i].key.src_dst), p1[i].key.id,
> > p1[i].start);
> > -
> > +		ip_frag_dbg(tbl, &p1[i], i, assoc);
> >  		if (ip_frag_key_cmp(key, &p1[i].key) == 0)
> >  			return p1 + i;
> >  		else if (ip_frag_key_is_empty(&p1[i].key))
> > @@ -335,29 +474,11 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
> >  		else if (max_cycles + p1[i].start < tms)
> >  			old = (old == NULL) ? (p1 + i) : old;
> >
> > -		if (p2->key.key_len == IPV4_KEYLEN)
> > -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> > -					"tbl: %p, max_entries: %u,
> > use_entries: %u\n"
> > -					"ipv4_frag_pkt line1: %p, index: %u
> > from %u\n"
> > -			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
> > -					__func__, __LINE__,
> > -					tbl, tbl->max_entries, tbl-
> >use_entries,
> > -					p2, i, assoc,
> > -			p2[i].key.src_dst[0], p2[i].key.id, p2[i].start);
> > -		else
> > -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> > -					"tbl: %p, max_entries: %u,
> > use_entries: %u\n"
> > -					"ipv6_frag_pkt line1: %p, index: %u
> > from %u\n"
> > -			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %"
> > PRIu64 "\n",
> > -					__func__, __LINE__,
> > -					tbl, tbl->max_entries, tbl-
> >use_entries,
> > -					p2, i, assoc,
> > -			IPv6_KEY_BYTES(p2[i].key.src_dst), p2[i].key.id,
> > p2[i].start);
> > -
> > +		ip_frag_dbg(tbl, &p2[i], i, assoc);
> >  		if (ip_frag_key_cmp(key, &p2[i].key) == 0)
> >  			return p2 + i;
> >  		else if (ip_frag_key_is_empty(&p2[i].key))
> > -			empty = (empty == NULL) ?( p2 + i) : empty;
> > +			empty = (empty == NULL) ? (p2 + i) : empty;
> >  		else if (max_cycles + p2[i].start < tms)
> >  			old = (old == NULL) ? (p2 + i) : old;
> >  	}
> > @@ -366,3 +487,18 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
> >  	*stale = old;
> >  	return NULL;
> >  }
> > +
> > +struct ip_frag_pkt *
> > +ip_frag_lookup(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key,
> > uint64_t tms,
> > +	       struct ip_frag_pkt **free, struct ip_frag_pkt **stale) {
> > +	switch (tbl->lookup_fn) {
> > +#if defined(RTE_ARCH_ARM64)
> > +	case REASSEMBLY_LOOKUP_NEON:
> > +		return ip_frag_lookup_neon(tbl, key, tms, free, stale);
> #endif
> > +	case REASSEMBLY_LOOKUP_SCALAR:
> > +	default:
> > +		return ip_frag_lookup_scalar(tbl, key, tms, free, stale);
> > +	}
> > +}
> > diff --git a/lib/ip_frag/ip_reassembly.h b/lib/ip_frag/ip_reassembly.h index
> > ef9d8c0d75..049437ae32 100644
> > --- a/lib/ip_frag/ip_reassembly.h
> > +++ b/lib/ip_frag/ip_reassembly.h
> > @@ -12,6 +12,11 @@
> >
> >  #include <rte_ip_frag.h>
> >
> > +enum ip_frag_lookup_func {
> > +	REASSEMBLY_LOOKUP_SCALAR = 0,
> > +	REASSEMBLY_LOOKUP_NEON,
> > +};
> > +
> >  enum {
> >  	IP_LAST_FRAG_IDX,    /* index of last fragment */
> >  	IP_FIRST_FRAG_IDX,   /* index of first fragment */
> > @@ -83,6 +88,7 @@ struct rte_ip_frag_tbl {
> >  	struct ip_frag_pkt *last;     /* last used entry. */
> >  	struct ip_pkt_list lru;       /* LRU list for table entries. */
> >  	struct ip_frag_tbl_stat stat; /* statistics counters. */
> > +	enum ip_frag_lookup_func lookup_fn;	/* hash table lookup
> function.
> > */
> >  	__extension__ struct ip_frag_pkt pkt[]; /* hash table. */  };
> >
> > diff --git a/lib/ip_frag/rte_ip_frag_common.c
> > b/lib/ip_frag/rte_ip_frag_common.c
> > index c1de2e81b6..ef3c104e45 100644
> > --- a/lib/ip_frag/rte_ip_frag_common.c
> > +++ b/lib/ip_frag/rte_ip_frag_common.c
> > @@ -5,7 +5,9 @@
> >  #include <stddef.h>
> >  #include <stdio.h>
> >
> > +#include <rte_cpuflags.h>
> >  #include <rte_log.h>
> > +#include <rte_vect.h>
> >
> >  #include "ip_frag_common.h"
> >
> > @@ -75,6 +77,14 @@ rte_ip_frag_table_create(uint32_t bucket_num,
> > uint32_t bucket_entries,
> >  	tbl->bucket_entries = bucket_entries;
> >  	tbl->entry_mask = (tbl->nb_entries - 1) & ~(tbl->bucket_entries  - 1);
> >
> > +#if defined(RTE_ARCH_ARM64)
> > +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) &&
> > +	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
> > +		tbl->lookup_fn = REASSEMBLY_LOOKUP_NEON;
> > +	else
> > +#endif
> > +		tbl->lookup_fn = REASSEMBLY_LOOKUP_SCALAR;
> > +
> >  	TAILQ_INIT(&(tbl->lru));
> >  	return tbl;
> >  }
> > --
> > 2.25.1
  
Pavan Nikhilesh Bhagavatula May 23, 2023, 10:23 p.m. UTC | #3
> -----Original Message-----
> From: Pavan Nikhilesh Bhagavatula
> Sent: Tuesday, May 23, 2023 11:29 PM
> To: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Jerin Jacob
> Kollanukkaran <jerinj@marvell.com>; nd <nd@arm.com>; Konstantin
> Ananyev <konstantin.v.ananyev@yandex.ru>
> Cc: dev@dpdk.org; nd <nd@arm.com>; nd <nd@arm.com>
> Subject: RE: [PATCH v2 2/3] ip_frag: improve reassembly lookup
> performance
> 
> > > -----Original Message-----
> > > From: pbhagavatula@marvell.com <pbhagavatula@marvell.com>
> > > Sent: Tuesday, May 23, 2023 9:39 AM
> > > To: jerinj@marvell.com; Honnappa Nagarahalli
> > > <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>; Konstantin
> > Ananyev
> > > <konstantin.v.ananyev@yandex.ru>
> > > Cc: dev@dpdk.org; Pavan Nikhilesh <pbhagavatula@marvell.com>
> > > Subject: [PATCH v2 2/3] ip_frag: improve reassembly lookup
> performance
> > >
> > > From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> > >
> > > Improve reassembly lookup performance by using NEON intrinsics for key
> > > validation.
> > What is the improvement do you see with this?
> 
> On Neoverse-N2 I see around improvement of 300-600c per flow and ~200c
> per insert.
> 

Below data is incorrect due to a bug (See below), but I still see improvement with ipv6.

> Here are some test results.
> 
> Without patch:
> +=========================================================
> =================================================+
> | IPV4                            | Flow Count : 32768                                                     |
> +================+================+=============+=========
> ====+========================+===================+
> | Fragment Order | Fragments/Flow | Outstanding | Cycles/Flow |
> Cycles/Fragment insert | Cycles/Reassembly |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 0           | 1244        | 919                    | 114               |
> +================+================+=============+=========
> ====+========================+===================+
> | RANDOM         | 2              | 0           | 1653        | 968                    | 128               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 3              | 0           | 1379        | 503                    | 110               |
> +================+================+=============+=========
> ====+========================+===================+
> | RANDOM         | 3              | 0           | 1613        | 520                    | 139               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 0           | 2030        | 199                    | 190               |
> +================+================+=============+=========
> ====+========================+===================+
> | RANDOM         | 8              | 0           | 4393        | 309                    | 402               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | RANDOM         | 0           | 1531        | 333                    | 147               |
> +================+================+=============+=========
> ====+========================+===================+
> | RANDOM         | RANDOM         | 0           | 2771        | 357                    | 213
> |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 100         | 1228        | 920                    | 102               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 500         | 1197        | 905                    | 103               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 1000        | 1183        | 904                    | 104               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 2000        | 1153        | 921                    | 105               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 3000        | 1123        | 911                    | 111               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 100         | 829         | 193                    | 690               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 500         | 830         | 195                    | 682               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 1000        | 817         | 211                    | 690               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 2000        | 819         | 195                    | 690               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 3000        | 823         | 223                    | 676               |
> +================+================+=============+=========
> ====+========================+===================+
> | INTERLEAVED    | 2              | 0           | 1765        | 1038                   | 177               |
> +================+================+=============+=========
> ====+========================+===================+
> | INTERLEAVED    | 3              | 0           | 2588        | 699                    | 190               |
> +================+================+=============+=========
> ====+========================+===================+
> | INTERLEAVED    | 8              | 0           | 5253        | 265                    | 403               |
> +================+================+=============+=========
> ====+========================+===================+
> | INTERLEAVED    | RANDOM         | 0           | 3398        | 493                    | 301
> |
> +================+================+=============+=========
> ====+========================+===================+
> 
> +=========================================================
> =================================================+
> | IPV6                            | Flow Count : 32768                                                     |
> +================+================+=============+=========
> ====+========================+===================+
> | Fragment Order | Fragments/Flow | Outstanding | Cycles/Flow |
> Cycles/Fragment insert | Cycles/Reassembly |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 0           | 1838        | 1176                   | 136               |
> +================+================+=============+=========
> ====+========================+===================+
> | RANDOM         | 2              | 0           | 1892        | 1188                   | 160               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 3              | 0           | 1986        | 628                    | 143               |
> +================+================+=============+=========
> ====+========================+===================+
> | RANDOM         | 3              | 0           | 2670        | 646                    | 155               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 0           | 3152        | 261                    | 271               |
> +================+================+=============+=========
> ====+========================+===================+
> | RANDOM         | 8              | 0           | 5127        | 324                    | 434               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | RANDOM         | 0           | 2169        | 427                    | 203               |
> +================+================+=============+=========
> ====+========================+===================+
> | RANDOM         | RANDOM         | 0           | 3382        | 452                    | 255
> |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 100         | 1837        | 1164                   | 124               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 500         | 1790        | 1158                   | 126               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 1000        | 1807        | 1161                   | 138               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 2000        | 1776        | 1160                   | 138               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 3000        | 1715        | 1169                   | 144               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 100         | 1488        | 256                    | 1228              |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 500         | 1461        | 300                    | 1205              |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 1000        | 1457        | 303                    | 1202              |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 2000        | 1456        | 305                    | 1201              |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 3000        | 1460        | 308                    | 1205              |
> +================+================+=============+=========
> ====+========================+===================+
> | INTERLEAVED    | 2              | 0           | 2145        | 1330                   | 296               |
> +================+================+=============+=========
> ====+========================+===================+
> | INTERLEAVED    | 3              | 0           | 2778        | 830                    | 330               |
> +================+================+=============+=========
> ====+========================+===================+
> | INTERLEAVED    | 8              | 0           | 5715        | 324                    | 444               |
> +================+================+=============+=========
> ====+========================+===================+
> | INTERLEAVED    | RANDOM         | 0           | 3625        | 550                    | 363
> |
> +================+================+=============+=========
> ====+========================+===================+
> 
> With patch :
> 
> +=========================================================
> =================================================+
> | IPV4                            | Flow Count : 32768                                                     |
> +================+================+=============+=========
> ====+========================+===================+
> | Fragment Order | Fragments/Flow | Outstanding | Cycles/Flow |
> Cycles/Fragment insert | Cycles/Reassembly |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 0           | 950         | 717                    | 98                |
> +================+================+=============+=========
> ====+========================+===================+
> | RANDOM         | 2              | 0           | 1013        | 706                    | 108               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 3              | 0           | 1096        | 397                    | 115               |
> +================+================+=============+=========
> ====+========================+===================+
> | RANDOM         | 3              | 0           | 1150        | 412                    | 128               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 0           | 1783        | 166                    | 202               |
> +================+================+=============+=========
> ====+========================+===================+
> | RANDOM         | 8              | 0           | 3933        | 284                    | 424               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | RANDOM         | 0           | 1288        | 267                    | 159               |
> +================+================+=============+=========
> ====+========================+===================+
> | RANDOM         | RANDOM         | 0           | 2393        | 302                    | 235
> |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 100         | 956         | 703                    | 110               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 500         | 937         | 693                    | 112               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 1000        | 912         | 670                    | 121               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 2000        | 908         | 688                    | 122               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 3000        | 894         | 688                    | 128               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 100         | 1019        | 179                    | 865               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 500         | 1052        | 176                    | 895               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 1000        | 1130        | 180                    | 1003              |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 2000        | 1143        | 180                    | 1020              |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 3000        | 1130        | 181                    | 985               |
> +================+================+=============+=========
> ====+========================+===================+
> | INTERLEAVED    | 2              | 0           | 1582        | 710                    | 168               |
> +================+================+=============+=========
> ====+========================+===================+
> | INTERLEAVED    | 3              | 0           | 2162        | 446                    | 194               |
> +================+================+=============+=========
> ====+========================+===================+
> | INTERLEAVED    | 8              | 0           | 4997        | 214                    | 426               |
> +================+================+=============+=========
> ====+========================+===================+
> | INTERLEAVED    | RANDOM         | 0           | 2921        | 341                    | 311
> |
> +================+================+=============+=========
> ====+========================+===================+
> 
> +=========================================================
> =================================================+
> | IPV6                            | Flow Count : 32768                                                     |
> +================+================+=============+=========
> ====+========================+===================+
> | Fragment Order | Fragments/Flow | Outstanding | Cycles/Flow |
> Cycles/Fragment insert | Cycles/Reassembly |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 0           | 1275        | 687                    | 125               |
> +================+================+=============+=========
> ====+========================+===================+
> | RANDOM         | 2              | 0           | 1335        | 721                    | 169               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 3              | 0           | 1388        | 415                    | 169               |
> +================+================+=============+=========
> ====+========================+===================+
> | RANDOM         | 3              | 0           | 2117        | 393                    | 163               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 0           | 2811        | 172                    | 241               |
> +================+================+=============+=========
> ====+========================+===================+
> | RANDOM         | 8              | 0           | 4322        | 227                    | 401               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | RANDOM         | 0           | 1730        | 270                    | 192               |
> +================+================+=============+=========
> ====+========================+===================+
> | RANDOM         | RANDOM         | 0           | 2839        | 317                    | 264
> |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 100         | 1152        | 662                    | 126               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 500         | 1107        | 658                    | 130               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 1000        | 1190        | 647                    | 138               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 2000        | 1086        | 635                    | 141               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 2              | 3000        | 1064        | 645                    | 150               |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 100         | 1560        | 172                    | 1296              |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 500         | 1536        | 226                    | 1274              |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 1000        | 1543        | 228                    | 1282              |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 2000        | 1548        | 228                    | 1287              |
> +================+================+=============+=========
> ====+========================+===================+
> | LINEAR         | 8              | 3000        | 1541        | 227                    | 1280              |
> +================+================+=============+=========
> ====+========================+===================+
> | INTERLEAVED    | 2              | 0           | 1585        | 769                    | 281               |
> +================+================+=============+=========
> ====+========================+===================+
> | INTERLEAVED    | 3              | 0           | 2222        | 536                    | 327               |
> +================+================+=============+=========
> ====+========================+===================+
> | INTERLEAVED    | 8              | 0           | 4962        | 232                    | 439               |
> +================+================+=============+=========
> ====+========================+===================+
> | INTERLEAVED    | RANDOM         | 0           | 2998        | 373                    | 360
> |
> +================+================+=============+=========
> ====+========================+===================+
> 
> >
> > >
> > > Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> > > ---
> > >  lib/ip_frag/ip_frag_internal.c   | 224 +++++++++++++++++++++++++-----
> -
> > >  lib/ip_frag/ip_reassembly.h      |   6 +
> > >  lib/ip_frag/rte_ip_frag_common.c |  10 ++
> > >  3 files changed, 196 insertions(+), 44 deletions(-)
> > >
> > > diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c
> > index
> > > 7cbef647df..de78a0ed8f 100644
> > > --- a/lib/ip_frag/ip_frag_internal.c
> > > +++ b/lib/ip_frag/ip_frag_internal.c
> > > @@ -4,8 +4,9 @@
> > >
> > >  #include <stddef.h>
> > >
> > > -#include <rte_jhash.h>
> > >  #include <rte_hash_crc.h>
> > > +#include <rte_jhash.h>
> > > +#include <rte_vect.h>
> > >
> > >  #include "ip_frag_common.h"
> > >
> > > @@ -280,10 +281,166 @@ ip_frag_find(struct rte_ip_frag_tbl *tbl, struct
> > > rte_ip_frag_death_row *dr,
> > >  	return pkt;
> > >  }
> > >
> > > -struct ip_frag_pkt *
> > > -ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
> > > -	const struct ip_frag_key *key, uint64_t tms,
> > > -	struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
> > > +static inline void
> > > +ip_frag_dbg(struct rte_ip_frag_tbl *tbl, struct ip_frag_pkt *p,
> > > +	    uint32_t list_idx, uint32_t list_cnt) {
> > > +	RTE_SET_USED(tbl);
> > > +	RTE_SET_USED(list_idx);
> > > +	RTE_SET_USED(list_cnt);
> > > +	if (p->key.key_len == IPV4_KEYLEN)
> > > +		IP_FRAG_LOG(DEBUG,
> > > +			    "%s:%d:\n"
> > > +			    "tbl: %p, max_entries: %u, use_entries: %u\n"
> > > +			    "ipv4_frag_pkt line0: %p, index: %u from %u\n"
> > > +			    "key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
> > > +			    __func__, __LINE__, tbl, tbl->max_entries,
> > > +			    tbl->use_entries, p, list_idx, list_cnt,
> > > +			    p->key.src_dst[0], p->key.id, p->start);
> > > +	else
> > > +		IP_FRAG_LOG(DEBUG,
> > > +			    "%s:%d:\n"
> > > +			    "tbl: %p, max_entries: %u, use_entries: %u\n"
> > > +			    "ipv6_frag_pkt line0: %p, index: %u from %u\n"
> > > +			    "key: <" IPv6_KEY_BYTES_FMT
> > > +			    ", %#x>, start: %" PRIu64 "\n",
> > > +			    __func__, __LINE__, tbl, tbl->max_entries,
> > > +			    tbl->use_entries, p, list_idx, list_cnt,
> > > +			    IPv6_KEY_BYTES(p1[i].key.src_dst), p->key.id,
> > > +			    p->start);
> > > +}
> > > +
> > > +#if defined(RTE_ARCH_ARM64)
> > > +static inline struct ip_frag_pkt *
> > > +ip_frag_lookup_neon(struct rte_ip_frag_tbl *tbl, const struct
> ip_frag_key
> > > *key, uint64_t tms,
> > > +		    struct ip_frag_pkt **free, struct ip_frag_pkt **stale) {
> > > +	struct ip_frag_pkt *empty, *old;
> > > +	struct ip_frag_pkt *p1, *p2;
> > > +	uint32_t assoc, sig1, sig2;
> > > +	uint64_t max_cycles;
> > > +
> > > +	empty = NULL;
> > > +	old = NULL;
> > > +
> > > +	max_cycles = tbl->max_cycles;
> > > +	assoc = tbl->bucket_entries;
> > > +
> > > +	if (tbl->last != NULL && ip_frag_key_cmp(key, &tbl->last->key) == 0)
> > > +		return tbl->last;
> > > +
> > > +	/* different hashing methods for IPv4 and IPv6 */
> > > +	if (key->key_len == IPV4_KEYLEN)
> > > +		ipv4_frag_hash(key, &sig1, &sig2);
> > > +	else
> > > +		ipv6_frag_hash(key, &sig1, &sig2);
> > > +
> > > +	p1 = IP_FRAG_TBL_POS(tbl, sig1);
> > > +	p2 = IP_FRAG_TBL_POS(tbl, sig2);
> > > +
> > > +	uint64x2_t key0, key1, key2, key3;
> > > +	uint64_t vmask, zmask, ts_mask;
> > > +	uint64x2_t ts0, ts1;
> > > +	uint32x4_t nz_key;
> > > +	uint8_t idx;
> > > +	/* Bucket entries are always power of 2. */
> > > +	rte_prefetch0(&p1[0].key);
> > > +	rte_prefetch0(&p1[1].key);
> > > +	rte_prefetch0(&p2[0].key);
> > > +	rte_prefetch0(&p2[1].key);
> > > +
> > > +	while (assoc > 1) {
> > > +		if (assoc > 2) {
> > > +			rte_prefetch0(&p1[2].key);
> > > +			rte_prefetch0(&p1[3].key);
> > > +			rte_prefetch0(&p2[2].key);
> > > +			rte_prefetch0(&p2[3].key);
> > > +		}
> > > +		struct ip_frag_pkt *p[] = {&p1[0], &p2[0], &p1[1], &p2[1]};
> > > +		key0 = vld1q_u64(&p[0]->key.id_key_len);
> > > +		key1 = vld1q_u64(&p[1]->key.id_key_len);
> > > +		key2 = vld1q_u64(&p[2]->key.id_key_len);
> > > +		key3 = vld1q_u64(&p[3]->key.id_key_len);
> > > +
> > > +		nz_key =
> > > vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key0), 1),
> > nz_key, 0);
> > > +		nz_key =
> > > vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key1), 1),
> > nz_key, 1);
> > > +		nz_key =
> > > vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key2), 1),
> > nz_key, 2);
> > > +		nz_key =
> > > vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key3),
> > > +1), nz_key, 3);
> > > +

I think we can compare id part too since its already in the vector register, I will rewrite this part.

> > > +		nz_key = vceqzq_u32(nz_key);
> > > +		zmask =
> > > vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(nz_key, 16)), 0);
> > > +		vmask = ~zmask;
> > > +
> > > +		vmask &= 0x8000800080008000;
> > > +		for (; vmask > 0; vmask &= vmask - 1) {
> > > +			idx = __builtin_ctzll(vmask) >> 4;
> > > +			if (ip_frag_key_cmp(key, &p[idx]->key) == 0)
> > > +				return p[idx];
> > > +		}
> > > +
> > > +		vmask = ~zmask;
> > > +		if (zmask && empty == NULL) {
> > > +			zmask &= 0x8000800080008000;
> > > +			idx = __builtin_ctzll(zmask) >> 4;
> > > +			empty = p[idx];
> > > +		}
> > > +
> > > +		if (vmask && old == NULL) {
> > > +			const uint64x2_t max_cyc =
> > > vdupq_n_u64(max_cycles);
> > > +			const uint64x2_t cur_cyc = vdupq_n_u64(tms);
> > > +
> > > +			ts0 = vsetq_lane_u64(vgetq_lane_u64(key0, 1), ts0,
> > > 0);
> > > +			ts0 = vsetq_lane_u64(vgetq_lane_u64(key1, 1), ts0,
> > > 1);
> > > +			ts1 = vsetq_lane_u64(vgetq_lane_u64(key2, 1), ts1,
> > > 0);
> > > +			ts1 = vsetq_lane_u64(vgetq_lane_u64(key3, 1), ts1,
> > > 1);
> > > +
> > > +			ts0 = vcgtq_u64(cur_cyc, vaddq_u64(ts0, max_cyc));
> > > +			ts1 = vcgtq_u64(cur_cyc, vaddq_u64(ts1, max_cyc));
> > > +
> > > +			ts_mask =
> > > vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(
> > > +
> > > 	vuzp1q_u32(vreinterpretq_u32_u64(ts0),
> > > +
> > > vreinterpretq_u32_u64(ts1)),
> > > +							16)),
> > > +						0);
> > > +			vmask &= 0x8000800080008000;
> > > +			ts_mask &= vmask;
> > > +			if (ts_mask) {
> > > +				idx = __builtin_ctzll(ts_mask) >> 4;
> > > +				old = p[idx];
> > > +			}
> > > +		}
> > > +		p1 += 2;
> > > +		p2 += 2;
> > > +		assoc -= 4;

Should be -=2

> > > +	}
> > > +	while (assoc) {
> > > +		if (ip_frag_key_cmp(key, &p1->key) == 0)
> > > +			return p1;
> > > +		else if (ip_frag_key_is_empty(&p1->key))
> > > +			empty = (empty == NULL) ? p1 : empty;
> > > +		else if (max_cycles + p1->start < tms)
> > > +			old = (old == NULL) ? p1 : old;
> > > +
> > > +		if (ip_frag_key_cmp(key, &p2->key) == 0)
> > > +			return p2;
> > > +		else if (ip_frag_key_is_empty(&p2->key))
> > > +			empty = (empty == NULL) ? p2 : empty;
> > > +		else if (max_cycles + p2->start < tms)
> > > +			old = (old == NULL) ? p2 : old;
> > > +		p1++;
> > > +		p2++;
> > > +		assoc--;
> > > +	}
> > > +
> > > +	*free = empty;
> > > +	*stale = old;
> > > +	return NULL;
> > > +}
> > > +#endif
> > > +
> > > +static struct ip_frag_pkt *
> > > +ip_frag_lookup_scalar(struct rte_ip_frag_tbl *tbl, const struct
> > ip_frag_key
> > > *key, uint64_t tms,
> > > +		      struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
> > >  {
> > >  	struct ip_frag_pkt *p1, *p2;
> > >  	struct ip_frag_pkt *empty, *old;
> > > @@ -309,25 +466,7 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
> > >  	p2 = IP_FRAG_TBL_POS(tbl, sig2);
> > >
> > >  	for (i = 0; i != assoc; i++) {
> > > -		if (p1->key.key_len == IPV4_KEYLEN)
> > > -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> > > -					"tbl: %p, max_entries: %u,
> > > use_entries: %u\n"
> > > -					"ipv4_frag_pkt line0: %p, index: %u
> > > from %u\n"
> > > -			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
> > > -					__func__, __LINE__,
> > > -					tbl, tbl->max_entries, tbl-
> > >use_entries,
> > > -					p1, i, assoc,
> > > -			p1[i].key.src_dst[0], p1[i].key.id, p1[i].start);
> > > -		else
> > > -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> > > -					"tbl: %p, max_entries: %u,
> > > use_entries: %u\n"
> > > -					"ipv6_frag_pkt line0: %p, index: %u
> > > from %u\n"
> > > -			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %"
> > > PRIu64 "\n",
> > > -					__func__, __LINE__,
> > > -					tbl, tbl->max_entries, tbl-
> > >use_entries,
> > > -					p1, i, assoc,
> > > -			IPv6_KEY_BYTES(p1[i].key.src_dst), p1[i].key.id,
> > > p1[i].start);
> > > -
> > > +		ip_frag_dbg(tbl, &p1[i], i, assoc);
> > >  		if (ip_frag_key_cmp(key, &p1[i].key) == 0)
> > >  			return p1 + i;
> > >  		else if (ip_frag_key_is_empty(&p1[i].key))
> > > @@ -335,29 +474,11 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
> > >  		else if (max_cycles + p1[i].start < tms)
> > >  			old = (old == NULL) ? (p1 + i) : old;
> > >
> > > -		if (p2->key.key_len == IPV4_KEYLEN)
> > > -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> > > -					"tbl: %p, max_entries: %u,
> > > use_entries: %u\n"
> > > -					"ipv4_frag_pkt line1: %p, index: %u
> > > from %u\n"
> > > -			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
> > > -					__func__, __LINE__,
> > > -					tbl, tbl->max_entries, tbl-
> > >use_entries,
> > > -					p2, i, assoc,
> > > -			p2[i].key.src_dst[0], p2[i].key.id, p2[i].start);
> > > -		else
> > > -			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
> > > -					"tbl: %p, max_entries: %u,
> > > use_entries: %u\n"
> > > -					"ipv6_frag_pkt line1: %p, index: %u
> > > from %u\n"
> > > -			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %"
> > > PRIu64 "\n",
> > > -					__func__, __LINE__,
> > > -					tbl, tbl->max_entries, tbl-
> > >use_entries,
> > > -					p2, i, assoc,
> > > -			IPv6_KEY_BYTES(p2[i].key.src_dst), p2[i].key.id,
> > > p2[i].start);
> > > -
> > > +		ip_frag_dbg(tbl, &p2[i], i, assoc);
> > >  		if (ip_frag_key_cmp(key, &p2[i].key) == 0)
> > >  			return p2 + i;
> > >  		else if (ip_frag_key_is_empty(&p2[i].key))
> > > -			empty = (empty == NULL) ?( p2 + i) : empty;
> > > +			empty = (empty == NULL) ? (p2 + i) : empty;
> > >  		else if (max_cycles + p2[i].start < tms)
> > >  			old = (old == NULL) ? (p2 + i) : old;
> > >  	}
> > > @@ -366,3 +487,18 @@ ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
> > >  	*stale = old;
> > >  	return NULL;
> > >  }
> > > +
> > > +struct ip_frag_pkt *
> > > +ip_frag_lookup(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key
> *key,
> > > uint64_t tms,
> > > +	       struct ip_frag_pkt **free, struct ip_frag_pkt **stale) {
> > > +	switch (tbl->lookup_fn) {
> > > +#if defined(RTE_ARCH_ARM64)
> > > +	case REASSEMBLY_LOOKUP_NEON:
> > > +		return ip_frag_lookup_neon(tbl, key, tms, free, stale);
> > #endif
> > > +	case REASSEMBLY_LOOKUP_SCALAR:
> > > +	default:
> > > +		return ip_frag_lookup_scalar(tbl, key, tms, free, stale);
> > > +	}
> > > +}
> > > diff --git a/lib/ip_frag/ip_reassembly.h b/lib/ip_frag/ip_reassembly.h
> index
> > > ef9d8c0d75..049437ae32 100644
> > > --- a/lib/ip_frag/ip_reassembly.h
> > > +++ b/lib/ip_frag/ip_reassembly.h
> > > @@ -12,6 +12,11 @@
> > >
> > >  #include <rte_ip_frag.h>
> > >
> > > +enum ip_frag_lookup_func {
> > > +	REASSEMBLY_LOOKUP_SCALAR = 0,
> > > +	REASSEMBLY_LOOKUP_NEON,
> > > +};
> > > +
> > >  enum {
> > >  	IP_LAST_FRAG_IDX,    /* index of last fragment */
> > >  	IP_FIRST_FRAG_IDX,   /* index of first fragment */
> > > @@ -83,6 +88,7 @@ struct rte_ip_frag_tbl {
> > >  	struct ip_frag_pkt *last;     /* last used entry. */
> > >  	struct ip_pkt_list lru;       /* LRU list for table entries. */
> > >  	struct ip_frag_tbl_stat stat; /* statistics counters. */
> > > +	enum ip_frag_lookup_func lookup_fn;	/* hash table lookup
> > function.
> > > */
> > >  	__extension__ struct ip_frag_pkt pkt[]; /* hash table. */  };
> > >
> > > diff --git a/lib/ip_frag/rte_ip_frag_common.c
> > > b/lib/ip_frag/rte_ip_frag_common.c
> > > index c1de2e81b6..ef3c104e45 100644
> > > --- a/lib/ip_frag/rte_ip_frag_common.c
> > > +++ b/lib/ip_frag/rte_ip_frag_common.c
> > > @@ -5,7 +5,9 @@
> > >  #include <stddef.h>
> > >  #include <stdio.h>
> > >
> > > +#include <rte_cpuflags.h>
> > >  #include <rte_log.h>
> > > +#include <rte_vect.h>
> > >
> > >  #include "ip_frag_common.h"
> > >
> > > @@ -75,6 +77,14 @@ rte_ip_frag_table_create(uint32_t bucket_num,
> > > uint32_t bucket_entries,
> > >  	tbl->bucket_entries = bucket_entries;
> > >  	tbl->entry_mask = (tbl->nb_entries - 1) & ~(tbl->bucket_entries  - 1);
> > >
> > > +#if defined(RTE_ARCH_ARM64)
> > > +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) &&
> > > +	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
> > > +		tbl->lookup_fn = REASSEMBLY_LOOKUP_NEON;
> > > +	else
> > > +#endif
> > > +		tbl->lookup_fn = REASSEMBLY_LOOKUP_SCALAR;
> > > +
> > >  	TAILQ_INIT(&(tbl->lru));
> > >  	return tbl;
> > >  }
> > > --
> > > 2.25.1
  
Stephen Hemminger May 23, 2023, 10:30 p.m. UTC | #4
On Tue, 23 May 2023 20:09:20 +0530
<pbhagavatula@marvell.com> wrote:

> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> 
> Improve reassembly lookup performance by using NEON intrinsics for
> key validation.
> 
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
>  lib/ip_frag/ip_frag_internal.c   | 224 +++++++++++++++++++++++++------
>  lib/ip_frag/ip_reassembly.h      |   6 +
>  lib/ip_frag/rte_ip_frag_common.c |  10 ++
>  3 files changed, 196 insertions(+), 44 deletions(-)


Using a function pointer for the lookup has some downsides.
On Intel an indirect call is slower especially with SPECTRE mitigations.

The bigger issue is that indirect call will break usage from primary/secondary
process with ASLR. If primary sets up table and secondary uses it the function
will be in different places.
  
Pavan Nikhilesh Bhagavatula May 29, 2023, 1:17 p.m. UTC | #5
> On Tue, 23 May 2023 20:09:20 +0530
> <pbhagavatula@marvell.com> wrote:
> 
> > From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> >
> > Improve reassembly lookup performance by using NEON intrinsics for
> > key validation.
> >
> > Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> > ---
> >  lib/ip_frag/ip_frag_internal.c   | 224 +++++++++++++++++++++++++------
> >  lib/ip_frag/ip_reassembly.h      |   6 +
> >  lib/ip_frag/rte_ip_frag_common.c |  10 ++
> >  3 files changed, 196 insertions(+), 44 deletions(-)
> 
> 
> Using a function pointer for the lookup has some downsides.
> On Intel an indirect call is slower especially with SPECTRE mitigations.
> 

The patch doesn't use direct function pointers, it stores the function id and switches between them.
Function ID scheme doesn't break primary/secondary process scheme even with ASLR scheme.

> The bigger issue is that indirect call will break usage from primary/secondary
> process with ASLR. If primary sets up table and secondary uses it the function
> will be in different places.

I will be dropping this patch since the performance improvement with NEON is negligible, there is lot of bucket 
state that we unfortunately don't cache with the current implementation.
  

Patch

diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c
index 7cbef647df..de78a0ed8f 100644
--- a/lib/ip_frag/ip_frag_internal.c
+++ b/lib/ip_frag/ip_frag_internal.c
@@ -4,8 +4,9 @@ 
 
 #include <stddef.h>
 
-#include <rte_jhash.h>
 #include <rte_hash_crc.h>
+#include <rte_jhash.h>
+#include <rte_vect.h>
 
 #include "ip_frag_common.h"
 
@@ -280,10 +281,166 @@  ip_frag_find(struct rte_ip_frag_tbl *tbl, struct rte_ip_frag_death_row *dr,
 	return pkt;
 }
 
-struct ip_frag_pkt *
-ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
-	const struct ip_frag_key *key, uint64_t tms,
-	struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
+static inline void
+ip_frag_dbg(struct rte_ip_frag_tbl *tbl, struct ip_frag_pkt *p,
+	    uint32_t list_idx, uint32_t list_cnt)
+{
+	RTE_SET_USED(tbl);
+	RTE_SET_USED(list_idx);
+	RTE_SET_USED(list_cnt);
+	if (p->key.key_len == IPV4_KEYLEN)
+		IP_FRAG_LOG(DEBUG,
+			    "%s:%d:\n"
+			    "tbl: %p, max_entries: %u, use_entries: %u\n"
+			    "ipv4_frag_pkt line0: %p, index: %u from %u\n"
+			    "key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
+			    __func__, __LINE__, tbl, tbl->max_entries,
+			    tbl->use_entries, p, list_idx, list_cnt,
+			    p->key.src_dst[0], p->key.id, p->start);
+	else
+		IP_FRAG_LOG(DEBUG,
+			    "%s:%d:\n"
+			    "tbl: %p, max_entries: %u, use_entries: %u\n"
+			    "ipv6_frag_pkt line0: %p, index: %u from %u\n"
+			    "key: <" IPv6_KEY_BYTES_FMT
+			    ", %#x>, start: %" PRIu64 "\n",
+			    __func__, __LINE__, tbl, tbl->max_entries,
+			    tbl->use_entries, p, list_idx, list_cnt,
+			    IPv6_KEY_BYTES(p1[i].key.src_dst), p->key.id,
+			    p->start);
+}
+
+#if defined(RTE_ARCH_ARM64)
+static inline struct ip_frag_pkt *
+ip_frag_lookup_neon(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
+		    struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
+{
+	struct ip_frag_pkt *empty, *old;
+	struct ip_frag_pkt *p1, *p2;
+	uint32_t assoc, sig1, sig2;
+	uint64_t max_cycles;
+
+	empty = NULL;
+	old = NULL;
+
+	max_cycles = tbl->max_cycles;
+	assoc = tbl->bucket_entries;
+
+	if (tbl->last != NULL && ip_frag_key_cmp(key, &tbl->last->key) == 0)
+		return tbl->last;
+
+	/* different hashing methods for IPv4 and IPv6 */
+	if (key->key_len == IPV4_KEYLEN)
+		ipv4_frag_hash(key, &sig1, &sig2);
+	else
+		ipv6_frag_hash(key, &sig1, &sig2);
+
+	p1 = IP_FRAG_TBL_POS(tbl, sig1);
+	p2 = IP_FRAG_TBL_POS(tbl, sig2);
+
+	uint64x2_t key0, key1, key2, key3;
+	uint64_t vmask, zmask, ts_mask;
+	uint64x2_t ts0, ts1;
+	uint32x4_t nz_key;
+	uint8_t idx;
+	/* Bucket entries are always power of 2. */
+	rte_prefetch0(&p1[0].key);
+	rte_prefetch0(&p1[1].key);
+	rte_prefetch0(&p2[0].key);
+	rte_prefetch0(&p2[1].key);
+
+	while (assoc > 1) {
+		if (assoc > 2) {
+			rte_prefetch0(&p1[2].key);
+			rte_prefetch0(&p1[3].key);
+			rte_prefetch0(&p2[2].key);
+			rte_prefetch0(&p2[3].key);
+		}
+		struct ip_frag_pkt *p[] = {&p1[0], &p2[0], &p1[1], &p2[1]};
+		key0 = vld1q_u64(&p[0]->key.id_key_len);
+		key1 = vld1q_u64(&p[1]->key.id_key_len);
+		key2 = vld1q_u64(&p[2]->key.id_key_len);
+		key3 = vld1q_u64(&p[3]->key.id_key_len);
+
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key0), 1), nz_key, 0);
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key1), 1), nz_key, 1);
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key2), 1), nz_key, 2);
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key3), 1), nz_key, 3);
+
+		nz_key = vceqzq_u32(nz_key);
+		zmask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(nz_key, 16)), 0);
+		vmask = ~zmask;
+
+		vmask &= 0x8000800080008000;
+		for (; vmask > 0; vmask &= vmask - 1) {
+			idx = __builtin_ctzll(vmask) >> 4;
+			if (ip_frag_key_cmp(key, &p[idx]->key) == 0)
+				return p[idx];
+		}
+
+		vmask = ~zmask;
+		if (zmask && empty == NULL) {
+			zmask &= 0x8000800080008000;
+			idx = __builtin_ctzll(zmask) >> 4;
+			empty = p[idx];
+		}
+
+		if (vmask && old == NULL) {
+			const uint64x2_t max_cyc = vdupq_n_u64(max_cycles);
+			const uint64x2_t cur_cyc = vdupq_n_u64(tms);
+
+			ts0 = vsetq_lane_u64(vgetq_lane_u64(key0, 1), ts0, 0);
+			ts0 = vsetq_lane_u64(vgetq_lane_u64(key1, 1), ts0, 1);
+			ts1 = vsetq_lane_u64(vgetq_lane_u64(key2, 1), ts1, 0);
+			ts1 = vsetq_lane_u64(vgetq_lane_u64(key3, 1), ts1, 1);
+
+			ts0 = vcgtq_u64(cur_cyc, vaddq_u64(ts0, max_cyc));
+			ts1 = vcgtq_u64(cur_cyc, vaddq_u64(ts1, max_cyc));
+
+			ts_mask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(
+							vuzp1q_u32(vreinterpretq_u32_u64(ts0),
+								   vreinterpretq_u32_u64(ts1)),
+							16)),
+						0);
+			vmask &= 0x8000800080008000;
+			ts_mask &= vmask;
+			if (ts_mask) {
+				idx = __builtin_ctzll(ts_mask) >> 4;
+				old = p[idx];
+			}
+		}
+		p1 += 2;
+		p2 += 2;
+		assoc -= 4;
+	}
+	while (assoc) {
+		if (ip_frag_key_cmp(key, &p1->key) == 0)
+			return p1;
+		else if (ip_frag_key_is_empty(&p1->key))
+			empty = (empty == NULL) ? p1 : empty;
+		else if (max_cycles + p1->start < tms)
+			old = (old == NULL) ? p1 : old;
+
+		if (ip_frag_key_cmp(key, &p2->key) == 0)
+			return p2;
+		else if (ip_frag_key_is_empty(&p2->key))
+			empty = (empty == NULL) ? p2 : empty;
+		else if (max_cycles + p2->start < tms)
+			old = (old == NULL) ? p2 : old;
+		p1++;
+		p2++;
+		assoc--;
+	}
+
+	*free = empty;
+	*stale = old;
+	return NULL;
+}
+#endif
+
+static struct ip_frag_pkt *
+ip_frag_lookup_scalar(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
+		      struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
 {
 	struct ip_frag_pkt *p1, *p2;
 	struct ip_frag_pkt *empty, *old;
@@ -309,25 +466,7 @@  ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 	p2 = IP_FRAG_TBL_POS(tbl, sig2);
 
 	for (i = 0; i != assoc; i++) {
-		if (p1->key.key_len == IPV4_KEYLEN)
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv4_frag_pkt line0: %p, index: %u from %u\n"
-			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p1, i, assoc,
-			p1[i].key.src_dst[0], p1[i].key.id, p1[i].start);
-		else
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv6_frag_pkt line0: %p, index: %u from %u\n"
-			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p1, i, assoc,
-			IPv6_KEY_BYTES(p1[i].key.src_dst), p1[i].key.id, p1[i].start);
-
+		ip_frag_dbg(tbl, &p1[i], i, assoc);
 		if (ip_frag_key_cmp(key, &p1[i].key) == 0)
 			return p1 + i;
 		else if (ip_frag_key_is_empty(&p1[i].key))
@@ -335,29 +474,11 @@  ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 		else if (max_cycles + p1[i].start < tms)
 			old = (old == NULL) ? (p1 + i) : old;
 
-		if (p2->key.key_len == IPV4_KEYLEN)
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv4_frag_pkt line1: %p, index: %u from %u\n"
-			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p2, i, assoc,
-			p2[i].key.src_dst[0], p2[i].key.id, p2[i].start);
-		else
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv6_frag_pkt line1: %p, index: %u from %u\n"
-			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p2, i, assoc,
-			IPv6_KEY_BYTES(p2[i].key.src_dst), p2[i].key.id, p2[i].start);
-
+		ip_frag_dbg(tbl, &p2[i], i, assoc);
 		if (ip_frag_key_cmp(key, &p2[i].key) == 0)
 			return p2 + i;
 		else if (ip_frag_key_is_empty(&p2[i].key))
-			empty = (empty == NULL) ?( p2 + i) : empty;
+			empty = (empty == NULL) ? (p2 + i) : empty;
 		else if (max_cycles + p2[i].start < tms)
 			old = (old == NULL) ? (p2 + i) : old;
 	}
@@ -366,3 +487,18 @@  ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 	*stale = old;
 	return NULL;
 }
+
+struct ip_frag_pkt *
+ip_frag_lookup(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
+	       struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
+{
+	switch (tbl->lookup_fn) {
+#if defined(RTE_ARCH_ARM64)
+	case REASSEMBLY_LOOKUP_NEON:
+		return ip_frag_lookup_neon(tbl, key, tms, free, stale);
+#endif
+	case REASSEMBLY_LOOKUP_SCALAR:
+	default:
+		return ip_frag_lookup_scalar(tbl, key, tms, free, stale);
+	}
+}
diff --git a/lib/ip_frag/ip_reassembly.h b/lib/ip_frag/ip_reassembly.h
index ef9d8c0d75..049437ae32 100644
--- a/lib/ip_frag/ip_reassembly.h
+++ b/lib/ip_frag/ip_reassembly.h
@@ -12,6 +12,11 @@ 
 
 #include <rte_ip_frag.h>
 
+enum ip_frag_lookup_func {
+	REASSEMBLY_LOOKUP_SCALAR = 0,
+	REASSEMBLY_LOOKUP_NEON,
+};
+
 enum {
 	IP_LAST_FRAG_IDX,    /* index of last fragment */
 	IP_FIRST_FRAG_IDX,   /* index of first fragment */
@@ -83,6 +88,7 @@  struct rte_ip_frag_tbl {
 	struct ip_frag_pkt *last;     /* last used entry. */
 	struct ip_pkt_list lru;       /* LRU list for table entries. */
 	struct ip_frag_tbl_stat stat; /* statistics counters. */
+	enum ip_frag_lookup_func lookup_fn;	/* hash table lookup function. */
 	__extension__ struct ip_frag_pkt pkt[]; /* hash table. */
 };
 
diff --git a/lib/ip_frag/rte_ip_frag_common.c b/lib/ip_frag/rte_ip_frag_common.c
index c1de2e81b6..ef3c104e45 100644
--- a/lib/ip_frag/rte_ip_frag_common.c
+++ b/lib/ip_frag/rte_ip_frag_common.c
@@ -5,7 +5,9 @@ 
 #include <stddef.h>
 #include <stdio.h>
 
+#include <rte_cpuflags.h>
 #include <rte_log.h>
+#include <rte_vect.h>
 
 #include "ip_frag_common.h"
 
@@ -75,6 +77,14 @@  rte_ip_frag_table_create(uint32_t bucket_num, uint32_t bucket_entries,
 	tbl->bucket_entries = bucket_entries;
 	tbl->entry_mask = (tbl->nb_entries - 1) & ~(tbl->bucket_entries  - 1);
 
+#if defined(RTE_ARCH_ARM64)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
+		tbl->lookup_fn = REASSEMBLY_LOOKUP_NEON;
+	else
+#endif
+		tbl->lookup_fn = REASSEMBLY_LOOKUP_SCALAR;
+
 	TAILQ_INIT(&(tbl->lru));
 	return tbl;
 }