[2/3] ip_frag: improve reassembly lookup performance

Message ID 20230523125413.6324-2-pbhagavatula@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series [1/3] ip_frag: optimize key compare and hash generation |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Pavan Nikhilesh Bhagavatula May 23, 2023, 12:54 p.m. UTC
  From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Improve reassembly lookup performance by using NEON intrinsics for
key validation.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 lib/ip_frag/ip_frag_internal.c   | 224 +++++++++++++++++++++++++------
 lib/ip_frag/ip_reassembly.h      |   6 +
 lib/ip_frag/rte_ip_frag_common.c |  10 ++
 3 files changed, 196 insertions(+), 44 deletions(-)

--
2.39.1
  

Patch

diff --git a/lib/ip_frag/ip_frag_internal.c b/lib/ip_frag/ip_frag_internal.c
index 7cbef647df..de78a0ed8f 100644
--- a/lib/ip_frag/ip_frag_internal.c
+++ b/lib/ip_frag/ip_frag_internal.c
@@ -4,8 +4,9 @@ 

 #include <stddef.h>

-#include <rte_jhash.h>
 #include <rte_hash_crc.h>
+#include <rte_jhash.h>
+#include <rte_vect.h>

 #include "ip_frag_common.h"

@@ -280,10 +281,166 @@  ip_frag_find(struct rte_ip_frag_tbl *tbl, struct rte_ip_frag_death_row *dr,
 	return pkt;
 }

-struct ip_frag_pkt *
-ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
-	const struct ip_frag_key *key, uint64_t tms,
-	struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
+static inline void
+ip_frag_dbg(struct rte_ip_frag_tbl *tbl, struct ip_frag_pkt *p,
+	    uint32_t list_idx, uint32_t list_cnt)
+{
+	RTE_SET_USED(tbl);
+	RTE_SET_USED(list_idx);
+	RTE_SET_USED(list_cnt);
+	if (p->key.key_len == IPV4_KEYLEN)
+		IP_FRAG_LOG(DEBUG,
+			    "%s:%d:\n"
+			    "tbl: %p, max_entries: %u, use_entries: %u\n"
+			    "ipv4_frag_pkt line0: %p, index: %u from %u\n"
+			    "key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
+			    __func__, __LINE__, tbl, tbl->max_entries,
+			    tbl->use_entries, p, list_idx, list_cnt,
+			    p->key.src_dst[0], p->key.id, p->start);
+	else
+		IP_FRAG_LOG(DEBUG,
+			    "%s:%d:\n"
+			    "tbl: %p, max_entries: %u, use_entries: %u\n"
+			    "ipv6_frag_pkt line0: %p, index: %u from %u\n"
+			    "key: <" IPv6_KEY_BYTES_FMT
+			    ", %#x>, start: %" PRIu64 "\n",
+			    __func__, __LINE__, tbl, tbl->max_entries,
+			    tbl->use_entries, p, list_idx, list_cnt,
+			    IPv6_KEY_BYTES(p1[i].key.src_dst), p->key.id,
+			    p->start);
+}
+
+#if defined(RTE_ARCH_ARM64)
+static inline struct ip_frag_pkt *
+ip_frag_lookup_neon(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
+		    struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
+{
+	struct ip_frag_pkt *empty, *old;
+	struct ip_frag_pkt *p1, *p2;
+	uint32_t assoc, sig1, sig2;
+	uint64_t max_cycles;
+
+	empty = NULL;
+	old = NULL;
+
+	max_cycles = tbl->max_cycles;
+	assoc = tbl->bucket_entries;
+
+	if (tbl->last != NULL && ip_frag_key_cmp(key, &tbl->last->key) == 0)
+		return tbl->last;
+
+	/* different hashing methods for IPv4 and IPv6 */
+	if (key->key_len == IPV4_KEYLEN)
+		ipv4_frag_hash(key, &sig1, &sig2);
+	else
+		ipv6_frag_hash(key, &sig1, &sig2);
+
+	p1 = IP_FRAG_TBL_POS(tbl, sig1);
+	p2 = IP_FRAG_TBL_POS(tbl, sig2);
+
+	uint64x2_t key0, key1, key2, key3;
+	uint64_t vmask, zmask, ts_mask;
+	uint64x2_t ts0, ts1;
+	uint32x4_t nz_key;
+	uint8_t idx;
+	/* Bucket entries are always power of 2. */
+	rte_prefetch0(&p1[0].key);
+	rte_prefetch0(&p1[1].key);
+	rte_prefetch0(&p2[0].key);
+	rte_prefetch0(&p2[1].key);
+
+	while (assoc > 1) {
+		if (assoc > 2) {
+			rte_prefetch0(&p1[2].key);
+			rte_prefetch0(&p1[3].key);
+			rte_prefetch0(&p2[2].key);
+			rte_prefetch0(&p2[3].key);
+		}
+		struct ip_frag_pkt *p[] = {&p1[0], &p2[0], &p1[1], &p2[1]};
+		key0 = vld1q_u64(&p[0]->key.id_key_len);
+		key1 = vld1q_u64(&p[1]->key.id_key_len);
+		key2 = vld1q_u64(&p[2]->key.id_key_len);
+		key3 = vld1q_u64(&p[3]->key.id_key_len);
+
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key0), 1), nz_key, 0);
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key1), 1), nz_key, 1);
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key2), 1), nz_key, 2);
+		nz_key = vsetq_lane_u32(vgetq_lane_u32(vreinterpretq_u32_u64(key3), 1), nz_key, 3);
+
+		nz_key = vceqzq_u32(nz_key);
+		zmask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(nz_key, 16)), 0);
+		vmask = ~zmask;
+
+		vmask &= 0x8000800080008000;
+		for (; vmask > 0; vmask &= vmask - 1) {
+			idx = __builtin_ctzll(vmask) >> 4;
+			if (ip_frag_key_cmp(key, &p[idx]->key) == 0)
+				return p[idx];
+		}
+
+		vmask = ~zmask;
+		if (zmask && empty == NULL) {
+			zmask &= 0x8000800080008000;
+			idx = __builtin_ctzll(zmask) >> 4;
+			empty = p[idx];
+		}
+
+		if (vmask && old == NULL) {
+			const uint64x2_t max_cyc = vdupq_n_u64(max_cycles);
+			const uint64x2_t cur_cyc = vdupq_n_u64(tms);
+
+			ts0 = vsetq_lane_u64(vgetq_lane_u64(key0, 1), ts0, 0);
+			ts0 = vsetq_lane_u64(vgetq_lane_u64(key1, 1), ts0, 1);
+			ts1 = vsetq_lane_u64(vgetq_lane_u64(key2, 1), ts1, 0);
+			ts1 = vsetq_lane_u64(vgetq_lane_u64(key3, 1), ts1, 1);
+
+			ts0 = vcgtq_u64(cur_cyc, vaddq_u64(ts0, max_cyc));
+			ts1 = vcgtq_u64(cur_cyc, vaddq_u64(ts1, max_cyc));
+
+			ts_mask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(
+							vuzp1q_u32(vreinterpretq_u32_u64(ts0),
+								   vreinterpretq_u32_u64(ts1)),
+							16)),
+						0);
+			vmask &= 0x8000800080008000;
+			ts_mask &= vmask;
+			if (ts_mask) {
+				idx = __builtin_ctzll(ts_mask) >> 4;
+				old = p[idx];
+			}
+		}
+		p1 += 2;
+		p2 += 2;
+		assoc -= 4;
+	}
+	while (assoc) {
+		if (ip_frag_key_cmp(key, &p1->key) == 0)
+			return p1;
+		else if (ip_frag_key_is_empty(&p1->key))
+			empty = (empty == NULL) ? p1 : empty;
+		else if (max_cycles + p1->start < tms)
+			old = (old == NULL) ? p1 : old;
+
+		if (ip_frag_key_cmp(key, &p2->key) == 0)
+			return p2;
+		else if (ip_frag_key_is_empty(&p2->key))
+			empty = (empty == NULL) ? p2 : empty;
+		else if (max_cycles + p2->start < tms)
+			old = (old == NULL) ? p2 : old;
+		p1++;
+		p2++;
+		assoc--;
+	}
+
+	*free = empty;
+	*stale = old;
+	return NULL;
+}
+#endif
+
+static struct ip_frag_pkt *
+ip_frag_lookup_scalar(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
+		      struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
 {
 	struct ip_frag_pkt *p1, *p2;
 	struct ip_frag_pkt *empty, *old;
@@ -309,25 +466,7 @@  ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 	p2 = IP_FRAG_TBL_POS(tbl, sig2);

 	for (i = 0; i != assoc; i++) {
-		if (p1->key.key_len == IPV4_KEYLEN)
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv4_frag_pkt line0: %p, index: %u from %u\n"
-			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p1, i, assoc,
-			p1[i].key.src_dst[0], p1[i].key.id, p1[i].start);
-		else
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv6_frag_pkt line0: %p, index: %u from %u\n"
-			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p1, i, assoc,
-			IPv6_KEY_BYTES(p1[i].key.src_dst), p1[i].key.id, p1[i].start);
-
+		ip_frag_dbg(tbl, &p1[i], i, assoc);
 		if (ip_frag_key_cmp(key, &p1[i].key) == 0)
 			return p1 + i;
 		else if (ip_frag_key_is_empty(&p1[i].key))
@@ -335,29 +474,11 @@  ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 		else if (max_cycles + p1[i].start < tms)
 			old = (old == NULL) ? (p1 + i) : old;

-		if (p2->key.key_len == IPV4_KEYLEN)
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv4_frag_pkt line1: %p, index: %u from %u\n"
-			"key: <%" PRIx64 ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p2, i, assoc,
-			p2[i].key.src_dst[0], p2[i].key.id, p2[i].start);
-		else
-			IP_FRAG_LOG(DEBUG, "%s:%d:\n"
-					"tbl: %p, max_entries: %u, use_entries: %u\n"
-					"ipv6_frag_pkt line1: %p, index: %u from %u\n"
-			"key: <" IPv6_KEY_BYTES_FMT ", %#x>, start: %" PRIu64 "\n",
-					__func__, __LINE__,
-					tbl, tbl->max_entries, tbl->use_entries,
-					p2, i, assoc,
-			IPv6_KEY_BYTES(p2[i].key.src_dst), p2[i].key.id, p2[i].start);
-
+		ip_frag_dbg(tbl, &p2[i], i, assoc);
 		if (ip_frag_key_cmp(key, &p2[i].key) == 0)
 			return p2 + i;
 		else if (ip_frag_key_is_empty(&p2[i].key))
-			empty = (empty == NULL) ?( p2 + i) : empty;
+			empty = (empty == NULL) ? (p2 + i) : empty;
 		else if (max_cycles + p2[i].start < tms)
 			old = (old == NULL) ? (p2 + i) : old;
 	}
@@ -366,3 +487,18 @@  ip_frag_lookup(struct rte_ip_frag_tbl *tbl,
 	*stale = old;
 	return NULL;
 }
+
+struct ip_frag_pkt *
+ip_frag_lookup(struct rte_ip_frag_tbl *tbl, const struct ip_frag_key *key, uint64_t tms,
+	       struct ip_frag_pkt **free, struct ip_frag_pkt **stale)
+{
+	switch (tbl->lookup_fn) {
+#if defined(RTE_ARCH_ARM64)
+	case REASSEMBLY_LOOKUP_NEON:
+		return ip_frag_lookup_neon(tbl, key, tms, free, stale);
+#endif
+	case REASSEMBLY_LOOKUP_SCALAR:
+	default:
+		return ip_frag_lookup_scalar(tbl, key, tms, free, stale);
+	}
+}
diff --git a/lib/ip_frag/ip_reassembly.h b/lib/ip_frag/ip_reassembly.h
index ef9d8c0d75..049437ae32 100644
--- a/lib/ip_frag/ip_reassembly.h
+++ b/lib/ip_frag/ip_reassembly.h
@@ -12,6 +12,11 @@ 

 #include <rte_ip_frag.h>

+enum ip_frag_lookup_func {
+	REASSEMBLY_LOOKUP_SCALAR = 0,
+	REASSEMBLY_LOOKUP_NEON,
+};
+
 enum {
 	IP_LAST_FRAG_IDX,    /* index of last fragment */
 	IP_FIRST_FRAG_IDX,   /* index of first fragment */
@@ -83,6 +88,7 @@  struct rte_ip_frag_tbl {
 	struct ip_frag_pkt *last;     /* last used entry. */
 	struct ip_pkt_list lru;       /* LRU list for table entries. */
 	struct ip_frag_tbl_stat stat; /* statistics counters. */
+	enum ip_frag_lookup_func lookup_fn;	/* hash table lookup function. */
 	__extension__ struct ip_frag_pkt pkt[]; /* hash table. */
 };

diff --git a/lib/ip_frag/rte_ip_frag_common.c b/lib/ip_frag/rte_ip_frag_common.c
index c1de2e81b6..ef3c104e45 100644
--- a/lib/ip_frag/rte_ip_frag_common.c
+++ b/lib/ip_frag/rte_ip_frag_common.c
@@ -5,7 +5,9 @@ 
 #include <stddef.h>
 #include <stdio.h>

+#include <rte_cpuflags.h>
 #include <rte_log.h>
+#include <rte_vect.h>

 #include "ip_frag_common.h"

@@ -75,6 +77,14 @@  rte_ip_frag_table_create(uint32_t bucket_num, uint32_t bucket_entries,
 	tbl->bucket_entries = bucket_entries;
 	tbl->entry_mask = (tbl->nb_entries - 1) & ~(tbl->bucket_entries  - 1);

+#if defined(RTE_ARCH_ARM64)
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_NEON) &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128)
+		tbl->lookup_fn = REASSEMBLY_LOOKUP_NEON;
+	else
+#endif
+		tbl->lookup_fn = REASSEMBLY_LOOKUP_SCALAR;
+
 	TAILQ_INIT(&(tbl->lru));
 	return tbl;
 }