From patchwork Mon Jan 12 19:16:10 2015 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Ananyev, Konstantin" X-Patchwork-Id: 2253 Return-Path: X-Original-To: patchwork@dpdk.org Delivered-To: patchwork@dpdk.org Received: from [92.243.14.124] (localhost [IPv6:::1]) by dpdk.org (Postfix) with ESMTP id 624BF5A84; Mon, 12 Jan 2015 20:16:59 +0100 (CET) Received: from mga02.intel.com (mga02.intel.com [134.134.136.20]) by dpdk.org (Postfix) with ESMTP id 449495A7C for ; Mon, 12 Jan 2015 20:16:38 +0100 (CET) Received: from fmsmga001.fm.intel.com ([10.253.24.23]) by orsmga101.jf.intel.com with ESMTP; 12 Jan 2015 11:16:36 -0800 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.07,745,1413270000"; d="scan'208";a="649962529" Received: from irvmail001.ir.intel.com ([163.33.26.43]) by fmsmga001.fm.intel.com with ESMTP; 12 Jan 2015 11:16:35 -0800 Received: from sivswdev02.ir.intel.com (sivswdev02.ir.intel.com [10.237.217.46]) by irvmail001.ir.intel.com (8.14.3/8.13.6/MailSET/Hub) with ESMTP id t0CJGZ4v008625; Mon, 12 Jan 2015 19:16:35 GMT Received: from sivswdev02.ir.intel.com (localhost [127.0.0.1]) by sivswdev02.ir.intel.com with ESMTP id t0CJGZZh017242; Mon, 12 Jan 2015 19:16:35 GMT Received: (from kananye1@localhost) by sivswdev02.ir.intel.com with id t0CJGZM0017238; Mon, 12 Jan 2015 19:16:35 GMT From: Konstantin Ananyev To: dev@dpdk.org Date: Mon, 12 Jan 2015 19:16:10 +0000 Message-Id: <1421090181-17150-7-git-send-email-konstantin.ananyev@intel.com> X-Mailer: git-send-email 1.7.4.1 In-Reply-To: <1421090181-17150-1-git-send-email-konstantin.ananyev@intel.com> References: <1421090181-17150-1-git-send-email-konstantin.ananyev@intel.com> Subject: [dpdk-dev] [PATCH v2 06/17] librte_acl: introduce DFA nodes compression (group64) for identical entries. X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: patches and discussions about DPDK List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" Introduced division of whole 256 child transition enties into 4 sub-groups (64 kids per group). So 2 groups within the same node with identical children, can use one set of transition entries. That allows to compact some DFA nodes and get space savings in the RT table, without any negative performance impact. From what I've seen an average space savings: ~20%. Signed-off-by: Konstantin Ananyev --- lib/librte_acl/acl.h | 12 ++- lib/librte_acl/acl_gen.c | 195 ++++++++++++++++++++++++++++------------ lib/librte_acl/acl_run_scalar.c | 38 ++++---- lib/librte_acl/acl_run_sse.c | 99 ++++++-------------- 4 files changed, 196 insertions(+), 148 deletions(-) diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h index 102fa51..3f6ac79 100644 --- a/lib/librte_acl/acl.h +++ b/lib/librte_acl/acl.h @@ -47,6 +47,11 @@ extern"C" { #define RTE_ACL_DFA_MAX UINT8_MAX #define RTE_ACL_DFA_SIZE (UINT8_MAX + 1) +#define RTE_ACL_DFA_GR64_SIZE 64 +#define RTE_ACL_DFA_GR64_NUM (RTE_ACL_DFA_SIZE / RTE_ACL_DFA_GR64_SIZE) +#define RTE_ACL_DFA_GR64_BIT \ + (CHAR_BIT * sizeof(uint32_t) / RTE_ACL_DFA_GR64_NUM) + typedef int bits_t; #define RTE_ACL_BIT_SET_SIZE ((UINT8_MAX + 1) / (sizeof(bits_t) * CHAR_BIT)) @@ -100,8 +105,11 @@ struct rte_acl_node { /* number of ranges (transitions w/ consecutive bits) */ int32_t id; struct rte_acl_match_results *mrt; /* only valid when match_flag != 0 */ - char transitions[RTE_ACL_QUAD_SIZE]; - /* boundaries for ranged node */ + union { + char transitions[RTE_ACL_QUAD_SIZE]; + /* boundaries for ranged node */ + uint8_t dfa_gr64[RTE_ACL_DFA_GR64_NUM]; + }; struct rte_acl_node *next; /* free list link or pointer to duplicate node during merge */ struct rte_acl_node *prev; diff --git a/lib/librte_acl/acl_gen.c b/lib/librte_acl/acl_gen.c index b1f766b..c9b7839 100644 --- a/lib/librte_acl/acl_gen.c +++ b/lib/librte_acl/acl_gen.c @@ -43,13 +43,14 @@ } while (0) struct acl_node_counters { - int match; - int match_used; - int single; - int quad; - int quad_vectors; - int dfa; - int smallest_match; + int32_t match; + int32_t match_used; + int32_t single; + int32_t quad; + int32_t quad_vectors; + int32_t dfa; + int32_t dfa_gr64; + int32_t smallest_match; }; struct rte_acl_indices { @@ -61,24 +62,118 @@ struct rte_acl_indices { static void acl_gen_log_stats(const struct rte_acl_ctx *ctx, - const struct acl_node_counters *counts) + const struct acl_node_counters *counts, + const struct rte_acl_indices *indices) { RTE_LOG(DEBUG, ACL, "Gen phase for ACL \"%s\":\n" "runtime memory footprint on socket %d:\n" "single nodes/bytes used: %d/%zu\n" - "quad nodes/bytes used: %d/%zu\n" - "DFA nodes/bytes used: %d/%zu\n" + "quad nodes/vectors/bytes used: %d/%d/%zu\n" + "DFA nodes/group64/bytes used: %d/%d/%zu\n" "match nodes/bytes used: %d/%zu\n" "total: %zu bytes\n", ctx->name, ctx->socket_id, counts->single, counts->single * sizeof(uint64_t), - counts->quad, counts->quad_vectors * sizeof(uint64_t), - counts->dfa, counts->dfa * RTE_ACL_DFA_SIZE * sizeof(uint64_t), + counts->quad, counts->quad_vectors, + (indices->quad_index - indices->dfa_index) * sizeof(uint64_t), + counts->dfa, counts->dfa_gr64, + indices->dfa_index * sizeof(uint64_t), counts->match, counts->match * sizeof(struct rte_acl_match_results), ctx->mem_sz); } +static uint64_t +acl_dfa_gen_idx(const struct rte_acl_node *node, uint32_t index) +{ + uint64_t idx; + uint32_t i; + + idx = 0; + for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) { + RTE_ACL_VERIFY(node->dfa_gr64[i] < RTE_ACL_DFA_GR64_NUM); + RTE_ACL_VERIFY(node->dfa_gr64[i] < node->fanout); + idx |= (i - node->dfa_gr64[i]) << + (6 + RTE_ACL_DFA_GR64_BIT * i); + } + + return idx << (CHAR_BIT * sizeof(index)) | index | node->node_type; +} + +static void +acl_dfa_fill_gr64(const struct rte_acl_node *node, + const uint64_t src[RTE_ACL_DFA_SIZE], uint64_t dst[RTE_ACL_DFA_SIZE]) +{ + uint32_t i; + + for (i = 0; i != RTE_DIM(node->dfa_gr64); i++) { + memcpy(dst + node->dfa_gr64[i] * RTE_ACL_DFA_GR64_SIZE, + src + i * RTE_ACL_DFA_GR64_SIZE, + RTE_ACL_DFA_GR64_SIZE * sizeof(dst[0])); + } +} + +static uint32_t +acl_dfa_count_gr64(const uint64_t array_ptr[RTE_ACL_DFA_SIZE], + uint8_t gr64[RTE_ACL_DFA_GR64_NUM]) +{ + uint32_t i, j, k; + + k = 0; + for (i = 0; i != RTE_ACL_DFA_GR64_NUM; i++) { + gr64[i] = i; + for (j = 0; j != i; j++) { + if (memcmp(array_ptr + i * RTE_ACL_DFA_GR64_SIZE, + array_ptr + j * RTE_ACL_DFA_GR64_SIZE, + RTE_ACL_DFA_GR64_SIZE * + sizeof(array_ptr[0])) == 0) + break; + } + gr64[i] = (j != i) ? gr64[j] : k++; + } + + return k; +} + +static uint32_t +acl_node_fill_dfa(const struct rte_acl_node *node, + uint64_t dfa[RTE_ACL_DFA_SIZE], uint64_t no_match, int32_t resolved) +{ + uint32_t n, x; + uint32_t ranges, last_bit; + struct rte_acl_node *child; + struct rte_acl_bitset *bits; + + ranges = 0; + last_bit = 0; + + for (n = 0; n < RTE_ACL_DFA_SIZE; n++) + dfa[n] = no_match; + + for (x = 0; x < node->num_ptrs; x++) { + + child = node->ptrs[x].ptr; + if (child == NULL) + continue; + + bits = &node->ptrs[x].values; + for (n = 0; n < RTE_ACL_DFA_SIZE; n++) { + + if (bits->bits[n / (sizeof(bits_t) * CHAR_BIT)] & + (1 << (n % (sizeof(bits_t) * CHAR_BIT)))) { + + dfa[n] = resolved ? child->node_index : x; + ranges += (last_bit == 0); + last_bit = 1; + } else { + last_bit = 0; + } + } + } + + return ranges; +} + /* * Counts the number of groups of sequential bits that are * either 0 or 1, as specified by the zero_one parameter. This is used to @@ -150,10 +245,11 @@ acl_count_fanout(struct rte_acl_node *node) */ static int acl_count_trie_types(struct acl_node_counters *counts, - struct rte_acl_node *node, int match, int force_dfa) + struct rte_acl_node *node, uint64_t no_match, int match, int force_dfa) { uint32_t n; int num_ptrs; + uint64_t dfa[RTE_ACL_DFA_SIZE]; /* skip if this node has been counted */ if (node->node_type != (uint32_t)RTE_ACL_NODE_UNDEFINED) @@ -186,6 +282,16 @@ acl_count_trie_types(struct acl_node_counters *counts, } else { counts->dfa++; node->node_type = RTE_ACL_NODE_DFA; + if (force_dfa != 0) { + /* always expand to a max number of nodes. */ + for (n = 0; n != RTE_DIM(node->dfa_gr64); n++) + node->dfa_gr64[n] = n; + node->fanout = n; + } else { + acl_node_fill_dfa(node, dfa, no_match, 0); + node->fanout = acl_dfa_count_gr64(dfa, node->dfa_gr64); + } + counts->dfa_gr64 += node->fanout; } /* @@ -194,7 +300,7 @@ acl_count_trie_types(struct acl_node_counters *counts, for (n = 0; n < node->num_ptrs; n++) { if (node->ptrs[n].ptr != NULL) match = acl_count_trie_types(counts, node->ptrs[n].ptr, - match, 0); + no_match, match, 0); } return match; @@ -204,38 +310,11 @@ static void acl_add_ptrs(struct rte_acl_node *node, uint64_t *node_array, uint64_t no_match, int resolved) { - uint32_t n, x; - int m, ranges, last_bit; - struct rte_acl_node *child; - struct rte_acl_bitset *bits; + uint32_t x; + int32_t m; uint64_t *node_a, index, dfa[RTE_ACL_DFA_SIZE]; - ranges = 0; - last_bit = 0; - - for (n = 0; n < RTE_DIM(dfa); n++) - dfa[n] = no_match; - - for (x = 0; x < node->num_ptrs; x++) { - - child = node->ptrs[x].ptr; - if (child == NULL) - continue; - - bits = &node->ptrs[x].values; - for (n = 0; n < RTE_DIM(dfa); n++) { - - if (bits->bits[n / (sizeof(bits_t) * CHAR_BIT)] & - (1 << (n % (sizeof(bits_t) * CHAR_BIT)))) { - - dfa[n] = resolved ? child->node_index : x; - ranges += (last_bit == 0); - last_bit = 1; - } else { - last_bit = 0; - } - } - } + acl_node_fill_dfa(node, dfa, no_match, resolved); /* * Rather than going from 0 to 256, the range count and @@ -272,8 +351,7 @@ acl_add_ptrs(struct rte_acl_node *node, uint64_t *node_array, uint64_t no_match, RTE_ACL_VERIFY(m <= RTE_ACL_QUAD_SIZE); } else if (node->node_type == RTE_ACL_NODE_DFA && resolved) { - for (n = 0; n < RTE_DIM(dfa); n++) - node_array[n] = dfa[n]; + acl_dfa_fill_gr64(node, dfa, node_array); } } @@ -286,7 +364,7 @@ static void acl_gen_node(struct rte_acl_node *node, uint64_t *node_array, uint64_t no_match, struct rte_acl_indices *index, int num_categories) { - uint32_t n, *qtrp; + uint32_t n, sz, *qtrp; uint64_t *array_ptr; struct rte_acl_match_results *match; @@ -297,10 +375,11 @@ acl_gen_node(struct rte_acl_node *node, uint64_t *node_array, switch (node->node_type) { case RTE_ACL_NODE_DFA: - node->node_index = index->dfa_index | node->node_type; array_ptr = &node_array[index->dfa_index]; - index->dfa_index += RTE_ACL_DFA_SIZE; - for (n = 0; n < RTE_ACL_DFA_SIZE; n++) + node->node_index = acl_dfa_gen_idx(node, index->dfa_index); + sz = node->fanout * RTE_ACL_DFA_GR64_SIZE; + index->dfa_index += sz; + for (n = 0; n < sz; n++) array_ptr[n] = no_match; break; case RTE_ACL_NODE_SINGLE: @@ -312,7 +391,7 @@ acl_gen_node(struct rte_acl_node *node, uint64_t *node_array, break; case RTE_ACL_NODE_QRANGE: array_ptr = &node_array[index->quad_index]; - acl_add_ptrs(node, array_ptr, no_match, 0); + acl_add_ptrs(node, array_ptr, no_match, 0); qtrp = (uint32_t *)node->transitions; node->node_index = qtrp[0]; node->node_index <<= sizeof(index->quad_index) * CHAR_BIT; @@ -368,7 +447,7 @@ static int acl_calc_counts_indices(struct acl_node_counters *counts, struct rte_acl_indices *indices, struct rte_acl_trie *trie, struct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries, - int match_num) + int match_num, uint64_t no_match) { uint32_t n; @@ -379,13 +458,13 @@ acl_calc_counts_indices(struct acl_node_counters *counts, for (n = 0; n < num_tries; n++) { counts->smallest_match = INT32_MAX; match_num = acl_count_trie_types(counts, node_bld_trie[n].trie, - match_num, 1); + no_match, match_num, 1); trie[n].smallest = counts->smallest_match; } indices->dfa_index = RTE_ACL_DFA_SIZE + 1; indices->quad_index = indices->dfa_index + - counts->dfa * RTE_ACL_DFA_SIZE; + counts->dfa_gr64 * RTE_ACL_DFA_GR64_SIZE; indices->single_index = indices->quad_index + counts->quad_vectors; indices->match_index = indices->single_index + counts->single + 1; indices->match_index = RTE_ALIGN(indices->match_index, @@ -410,9 +489,11 @@ rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie, struct acl_node_counters counts; struct rte_acl_indices indices; + no_match = RTE_ACL_NODE_MATCH; + /* Fill counts and indices arrays from the nodes. */ match_num = acl_calc_counts_indices(&counts, &indices, trie, - node_bld_trie, num_tries, match_num); + node_bld_trie, num_tries, match_num, no_match); /* Allocate runtime memory (align to cache boundary) */ total_size = RTE_ALIGN(data_index_sz, RTE_CACHE_LINE_SIZE) + @@ -440,11 +521,11 @@ rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie, */ node_array[RTE_ACL_DFA_SIZE] = RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE; - no_match = RTE_ACL_NODE_MATCH; for (n = 0; n < RTE_ACL_DFA_SIZE; n++) node_array[n] = no_match; + /* NOMATCH result at index 0 */ match = ((struct rte_acl_match_results *)(node_array + match_index)); memset(match, 0, sizeof(*match)); @@ -470,6 +551,6 @@ rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie, ctx->trans_table = node_array; memcpy(ctx->trie, trie, sizeof(ctx->trie)); - acl_gen_log_stats(ctx, &counts); + acl_gen_log_stats(ctx, &counts, &indices); return 0; } diff --git a/lib/librte_acl/acl_run_scalar.c b/lib/librte_acl/acl_run_scalar.c index 43c8fc3..40691ce 100644 --- a/lib/librte_acl/acl_run_scalar.c +++ b/lib/librte_acl/acl_run_scalar.c @@ -94,15 +94,6 @@ resolve_priority_scalar(uint64_t transition, int n, } } -/* - * When processing the transition, rather than using if/else - * construct, the offset is calculated for DFA and QRANGE and - * then conditionally added to the address based on node type. - * This is done to avoid branch mis-predictions. Since the - * offset is rather simple calculation it is more efficient - * to do the calculation and do a condition move rather than - * a conditional branch to determine which calculation to do. - */ static inline uint32_t scan_forward(uint32_t input, uint32_t max) { @@ -117,18 +108,27 @@ scalar_transition(const uint64_t *trans_table, uint64_t transition, /* break transition into component parts */ ranges = transition >> (sizeof(index) * CHAR_BIT); - - /* calc address for a QRANGE node */ - c = input * SCALAR_QRANGE_MULT; - a = ranges | SCALAR_QRANGE_MIN; index = transition & ~RTE_ACL_NODE_INDEX; - a -= (c & SCALAR_QRANGE_MASK); - b = c & SCALAR_QRANGE_MIN; addr = transition ^ index; - a &= SCALAR_QRANGE_MIN; - a ^= (ranges ^ b) & (a ^ b); - x = scan_forward(a, 32) >> 3; - addr += (index == RTE_ACL_NODE_DFA) ? input : x; + + if (index != RTE_ACL_NODE_DFA) { + /* calc address for a QRANGE/SINGLE node */ + c = (uint32_t)input * SCALAR_QRANGE_MULT; + a = ranges | SCALAR_QRANGE_MIN; + a -= (c & SCALAR_QRANGE_MASK); + b = c & SCALAR_QRANGE_MIN; + a &= SCALAR_QRANGE_MIN; + a ^= (ranges ^ b) & (a ^ b); + x = scan_forward(a, 32) >> 3; + } else { + /* calc address for a DFA node */ + x = ranges >> (input / + RTE_ACL_DFA_GR64_SIZE * RTE_ACL_DFA_GR64_BIT); + x &= UINT8_MAX; + x = input - x; + } + + addr += x; /* pickup next transition */ transition = *(trans_table + addr); diff --git a/lib/librte_acl/acl_run_sse.c b/lib/librte_acl/acl_run_sse.c index 69a9d77..576c92b 100644 --- a/lib/librte_acl/acl_run_sse.c +++ b/lib/librte_acl/acl_run_sse.c @@ -40,24 +40,6 @@ enum { SHUFFLE32_SWAP64 = 0x4e, }; -static const rte_xmm_t mm_type_quad_range = { - .u32 = { - RTE_ACL_NODE_QRANGE, - RTE_ACL_NODE_QRANGE, - RTE_ACL_NODE_QRANGE, - RTE_ACL_NODE_QRANGE, - }, -}; - -static const rte_xmm_t mm_type_quad_range64 = { - .u32 = { - RTE_ACL_NODE_QRANGE, - RTE_ACL_NODE_QRANGE, - 0, - 0, - }, -}; - static const rte_xmm_t mm_shuffle_input = { .u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c}, }; @@ -70,14 +52,6 @@ static const rte_xmm_t mm_ones_16 = { .u16 = {1, 1, 1, 1, 1, 1, 1, 1}, }; -static const rte_xmm_t mm_bytes = { - .u32 = {UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX}, -}; - -static const rte_xmm_t mm_bytes64 = { - .u32 = {UINT8_MAX, UINT8_MAX, 0, 0}, -}; - static const rte_xmm_t mm_match_mask = { .u32 = { RTE_ACL_NODE_MATCH, @@ -236,10 +210,14 @@ acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms, */ static inline xmm_t acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input, - xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range, - xmm_t *indices1, xmm_t *indices2) + xmm_t ones_16, xmm_t indices1, xmm_t indices2) { - xmm_t addr, node_types, temp; + xmm_t addr, node_types, range, temp; + xmm_t dfa_msk, dfa_ofs, quad_ofs; + xmm_t in, r, t; + + const xmm_t range_base = _mm_set_epi32(0xffffff0c, 0xffffff08, + 0xffffff04, 0xffffff00); /* * Note that no transition is done for a match @@ -248,10 +226,13 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input, */ /* Shuffle low 32 into temp and high 32 into indices2 */ - temp = (xmm_t)MM_SHUFFLEPS((__m128)*indices1, (__m128)*indices2, - 0x88); - *indices2 = (xmm_t)MM_SHUFFLEPS((__m128)*indices1, - (__m128)*indices2, 0xdd); + temp = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0x88); + range = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0xdd); + + t = MM_XOR(index_mask, index_mask); + + /* shuffle input byte to all 4 positions of 32 bit value */ + in = MM_SHUFFLE8(next_input, shuffle_input); /* Calc node type and node addr */ node_types = MM_ANDNOT(index_mask, temp); @@ -262,17 +243,15 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input, */ /* mask for DFA type (0) nodes */ - temp = MM_CMPEQ32(node_types, MM_XOR(node_types, node_types)); + dfa_msk = MM_CMPEQ32(node_types, t); - /* add input byte to DFA position */ - temp = MM_AND(temp, bytes); - temp = MM_AND(temp, next_input); - addr = MM_ADD32(addr, temp); + r = _mm_srli_epi32(in, 30); + r = _mm_add_epi8(r, range_base); - /* - * Calc addr for Range nodes -> range_index + range(input) - */ - node_types = MM_CMPEQ32(node_types, type_quad_range); + t = _mm_srli_epi32(in, 24); + r = _mm_shuffle_epi8(range, r); + + dfa_ofs = _mm_sub_epi32(t, r); /* * Calculate number of range boundaries that are less than the @@ -282,11 +261,8 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input, * input byte. */ - /* shuffle input byte to all 4 positions of 32 bit value */ - temp = MM_SHUFFLE8(next_input, shuffle_input); - /* check ranges */ - temp = MM_CMPGT8(temp, *indices2); + temp = MM_CMPGT8(in, range); /* convert -1 to 1 (bytes greater than input byte */ temp = MM_SIGN8(temp, temp); @@ -295,10 +271,10 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input, temp = MM_MADD8(temp, temp); /* horizontal add pairs of words into dwords */ - temp = MM_MADD16(temp, ones_16); + quad_ofs = MM_MADD16(temp, ones_16); /* mask to range type nodes */ - temp = MM_AND(temp, node_types); + temp = _mm_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk); /* add index into node position */ return MM_ADD32(addr, temp); @@ -309,8 +285,8 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input, */ static inline xmm_t transition4(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input, - xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range, - const uint64_t *trans, xmm_t *indices1, xmm_t *indices2) + xmm_t ones_16, const uint64_t *trans, + xmm_t *indices1, xmm_t *indices2) { xmm_t addr; uint64_t trans0, trans2; @@ -318,7 +294,7 @@ transition4(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input, /* Calculate the address (array index) for all 4 transitions. */ addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16, - bytes, type_quad_range, indices1, indices2); + *indices1, *indices2); /* Gather 64 bit transitions and pack back into 2 registers. */ @@ -408,42 +384,34 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data, input0 = transition4(mm_index_mask.m, input0, mm_shuffle_input.m, mm_ones_16.m, - mm_bytes.m, mm_type_quad_range.m, flows.trans, &indices1, &indices2); input1 = transition4(mm_index_mask.m, input1, mm_shuffle_input.m, mm_ones_16.m, - mm_bytes.m, mm_type_quad_range.m, flows.trans, &indices3, &indices4); input0 = transition4(mm_index_mask.m, input0, mm_shuffle_input.m, mm_ones_16.m, - mm_bytes.m, mm_type_quad_range.m, flows.trans, &indices1, &indices2); input1 = transition4(mm_index_mask.m, input1, mm_shuffle_input.m, mm_ones_16.m, - mm_bytes.m, mm_type_quad_range.m, flows.trans, &indices3, &indices4); input0 = transition4(mm_index_mask.m, input0, mm_shuffle_input.m, mm_ones_16.m, - mm_bytes.m, mm_type_quad_range.m, flows.trans, &indices1, &indices2); input1 = transition4(mm_index_mask.m, input1, mm_shuffle_input.m, mm_ones_16.m, - mm_bytes.m, mm_type_quad_range.m, flows.trans, &indices3, &indices4); input0 = transition4(mm_index_mask.m, input0, mm_shuffle_input.m, mm_ones_16.m, - mm_bytes.m, mm_type_quad_range.m, flows.trans, &indices1, &indices2); input1 = transition4(mm_index_mask.m, input1, mm_shuffle_input.m, mm_ones_16.m, - mm_bytes.m, mm_type_quad_range.m, flows.trans, &indices3, &indices4); /* Check for any matches. */ @@ -496,22 +464,18 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data, /* Process the 4 bytes of input on each stream. */ input = transition4(mm_index_mask.m, input, mm_shuffle_input.m, mm_ones_16.m, - mm_bytes.m, mm_type_quad_range.m, flows.trans, &indices1, &indices2); input = transition4(mm_index_mask.m, input, mm_shuffle_input.m, mm_ones_16.m, - mm_bytes.m, mm_type_quad_range.m, flows.trans, &indices1, &indices2); input = transition4(mm_index_mask.m, input, mm_shuffle_input.m, mm_ones_16.m, - mm_bytes.m, mm_type_quad_range.m, flows.trans, &indices1, &indices2); input = transition4(mm_index_mask.m, input, mm_shuffle_input.m, mm_ones_16.m, - mm_bytes.m, mm_type_quad_range.m, flows.trans, &indices1, &indices2); /* Check for any matches. */ @@ -524,8 +488,7 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data, static inline xmm_t transition2(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input, - xmm_t ones_16, xmm_t bytes, xmm_t type_quad_range, - const uint64_t *trans, xmm_t *indices1) + xmm_t ones_16, const uint64_t *trans, xmm_t *indices1) { uint64_t t; xmm_t addr, indices2; @@ -533,7 +496,7 @@ transition2(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input, indices2 = MM_XOR(ones_16, ones_16); addr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16, - bytes, type_quad_range, indices1, &indices2); + *indices1, indices2); /* Gather 64 bit transitions and pack 2 per register. */ @@ -583,22 +546,18 @@ search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data, input = transition2(mm_index_mask64.m, input, mm_shuffle_input64.m, mm_ones_16.m, - mm_bytes64.m, mm_type_quad_range64.m, flows.trans, &indices); input = transition2(mm_index_mask64.m, input, mm_shuffle_input64.m, mm_ones_16.m, - mm_bytes64.m, mm_type_quad_range64.m, flows.trans, &indices); input = transition2(mm_index_mask64.m, input, mm_shuffle_input64.m, mm_ones_16.m, - mm_bytes64.m, mm_type_quad_range64.m, flows.trans, &indices); input = transition2(mm_index_mask64.m, input, mm_shuffle_input64.m, mm_ones_16.m, - mm_bytes64.m, mm_type_quad_range64.m, flows.trans, &indices); /* Check for any matches. */