get:
Show a patch.

patch:
Update a patch.

put:
Update a patch.

GET /api/patches/2402/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 2402,
    "url": "https://patches.dpdk.org/api/patches/2402/?format=api",
    "web_url": "https://patches.dpdk.org/project/dpdk/patch/1421779267-18492-7-git-send-email-konstantin.ananyev@intel.com/",
    "project": {
        "id": 1,
        "url": "https://patches.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<1421779267-18492-7-git-send-email-konstantin.ananyev@intel.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/1421779267-18492-7-git-send-email-konstantin.ananyev@intel.com",
    "date": "2015-01-20T18:40:55",
    "name": "[dpdk-dev,v3,06/18] librte_acl: introduce DFA nodes compression (group64) for identical entries.",
    "commit_ref": null,
    "pull_url": null,
    "state": "accepted",
    "archived": true,
    "hash": "c8cf10682f6abbb60b93ab15ba220a7aa2c90139",
    "submitter": {
        "id": 33,
        "url": "https://patches.dpdk.org/api/people/33/?format=api",
        "name": "Ananyev, Konstantin",
        "email": "konstantin.ananyev@intel.com"
    },
    "delegate": null,
    "mbox": "https://patches.dpdk.org/project/dpdk/patch/1421779267-18492-7-git-send-email-konstantin.ananyev@intel.com/mbox/",
    "series": [],
    "comments": "https://patches.dpdk.org/api/patches/2402/comments/",
    "check": "pending",
    "checks": "https://patches.dpdk.org/api/patches/2402/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@dpdk.org",
        "Delivered-To": "patchwork@dpdk.org",
        "Received": [
            "from [92.243.14.124] (localhost [IPv6:::1])\n\tby dpdk.org (Postfix) with ESMTP id B37E55AB0;\n\tTue, 20 Jan 2015 19:41:33 +0100 (CET)",
            "from mga14.intel.com (mga14.intel.com [192.55.52.115])\n\tby dpdk.org (Postfix) with ESMTP id AC47D5A96\n\tfor <dev@dpdk.org>; Tue, 20 Jan 2015 19:41:23 +0100 (CET)",
            "from fmsmga003.fm.intel.com ([10.253.24.29])\n\tby fmsmga103.fm.intel.com with ESMTP; 20 Jan 2015 10:35:42 -0800",
            "from irvmail001.ir.intel.com ([163.33.26.43])\n\tby FMSMGA003.fm.intel.com with ESMTP; 20 Jan 2015 10:28:04 -0800",
            "from sivswdev02.ir.intel.com (sivswdev02.ir.intel.com\n\t[10.237.217.46])\n\tby irvmail001.ir.intel.com (8.14.3/8.13.6/MailSET/Hub) with ESMTP id\n\tt0KIfKac029490; Tue, 20 Jan 2015 18:41:20 GMT",
            "from sivswdev02.ir.intel.com (localhost [127.0.0.1])\n\tby sivswdev02.ir.intel.com with ESMTP id t0KIfKOT018827;\n\tTue, 20 Jan 2015 18:41:20 GMT",
            "(from kananye1@localhost)\n\tby sivswdev02.ir.intel.com with  id t0KIfK6H018823;\n\tTue, 20 Jan 2015 18:41:20 GMT"
        ],
        "X-ExtLoop1": "1",
        "X-IronPort-AV": "E=Sophos;i=\"4.97,862,1389772800\"; d=\"scan'208\";a=\"442827078\"",
        "From": "Konstantin Ananyev <konstantin.ananyev@intel.com>",
        "To": "dev@dpdk.org",
        "Date": "Tue, 20 Jan 2015 18:40:55 +0000",
        "Message-Id": "<1421779267-18492-7-git-send-email-konstantin.ananyev@intel.com>",
        "X-Mailer": "git-send-email 1.7.4.1",
        "In-Reply-To": "<1421779267-18492-1-git-send-email-konstantin.ananyev@intel.com>",
        "References": "<1421779267-18492-1-git-send-email-konstantin.ananyev@intel.com>",
        "Subject": "[dpdk-dev] [PATCH v3 06/18] librte_acl: introduce DFA nodes\n\tcompression (group64) for identical entries.",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.15",
        "Precedence": "list",
        "List-Id": "patches and discussions about DPDK <dev.dpdk.org>",
        "List-Unsubscribe": "<http://dpdk.org/ml/options/dev>,\n\t<mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://dpdk.org/ml/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<http://dpdk.org/ml/listinfo/dev>,\n\t<mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org",
        "Sender": "\"dev\" <dev-bounces@dpdk.org>"
    },
    "content": "Introduced division of whole 256 child transition enties\ninto 4 sub-groups (64 kids per group).\nSo 2 groups within the same node with identical children,\ncan use one set of transition entries.\nThat allows to compact some DFA nodes and get space savings in the RT table,\nwithout any negative performance impact.\nFrom what I've seen an average space savings: ~20%.\n\nSigned-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>\n---\n lib/librte_acl/acl.h            |  12 ++-\n lib/librte_acl/acl_gen.c        | 195 ++++++++++++++++++++++++++++------------\n lib/librte_acl/acl_run_scalar.c |  38 ++++----\n lib/librte_acl/acl_run_sse.c    |  99 ++++++--------------\n 4 files changed, 196 insertions(+), 148 deletions(-)",
    "diff": "diff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h\nindex 102fa51..3f6ac79 100644\n--- a/lib/librte_acl/acl.h\n+++ b/lib/librte_acl/acl.h\n@@ -47,6 +47,11 @@ extern\"C\" {\n #define RTE_ACL_DFA_MAX\t\tUINT8_MAX\n #define RTE_ACL_DFA_SIZE\t(UINT8_MAX + 1)\n \n+#define\tRTE_ACL_DFA_GR64_SIZE\t64\n+#define\tRTE_ACL_DFA_GR64_NUM\t(RTE_ACL_DFA_SIZE / RTE_ACL_DFA_GR64_SIZE)\n+#define\tRTE_ACL_DFA_GR64_BIT\t\\\n+\t(CHAR_BIT * sizeof(uint32_t) / RTE_ACL_DFA_GR64_NUM)\n+\n typedef int bits_t;\n \n #define\tRTE_ACL_BIT_SET_SIZE\t((UINT8_MAX + 1) / (sizeof(bits_t) * CHAR_BIT))\n@@ -100,8 +105,11 @@ struct rte_acl_node {\n \t/* number of ranges (transitions w/ consecutive bits) */\n \tint32_t                 id;\n \tstruct rte_acl_match_results *mrt; /* only valid when match_flag != 0 */\n-\tchar                         transitions[RTE_ACL_QUAD_SIZE];\n-\t/* boundaries for ranged node */\n+\tunion {\n+\t\tchar            transitions[RTE_ACL_QUAD_SIZE];\n+\t\t/* boundaries for ranged node */\n+\t\tuint8_t         dfa_gr64[RTE_ACL_DFA_GR64_NUM];\n+\t};\n \tstruct rte_acl_node     *next;\n \t/* free list link or pointer to duplicate node during merge */\n \tstruct rte_acl_node     *prev;\ndiff --git a/lib/librte_acl/acl_gen.c b/lib/librte_acl/acl_gen.c\nindex b1f766b..c9b7839 100644\n--- a/lib/librte_acl/acl_gen.c\n+++ b/lib/librte_acl/acl_gen.c\n@@ -43,13 +43,14 @@\n } while (0)\n \n struct acl_node_counters {\n-\tint                match;\n-\tint                match_used;\n-\tint                single;\n-\tint                quad;\n-\tint                quad_vectors;\n-\tint                dfa;\n-\tint                smallest_match;\n+\tint32_t match;\n+\tint32_t match_used;\n+\tint32_t single;\n+\tint32_t quad;\n+\tint32_t quad_vectors;\n+\tint32_t dfa;\n+\tint32_t dfa_gr64;\n+\tint32_t smallest_match;\n };\n \n struct rte_acl_indices {\n@@ -61,24 +62,118 @@ struct rte_acl_indices {\n \n static void\n acl_gen_log_stats(const struct rte_acl_ctx *ctx,\n-\tconst struct acl_node_counters *counts)\n+\tconst struct acl_node_counters *counts,\n+\tconst struct rte_acl_indices *indices)\n {\n \tRTE_LOG(DEBUG, ACL, \"Gen phase for ACL \\\"%s\\\":\\n\"\n \t\t\"runtime memory footprint on socket %d:\\n\"\n \t\t\"single nodes/bytes used: %d/%zu\\n\"\n-\t\t\"quad nodes/bytes used: %d/%zu\\n\"\n-\t\t\"DFA nodes/bytes used: %d/%zu\\n\"\n+\t\t\"quad nodes/vectors/bytes used: %d/%d/%zu\\n\"\n+\t\t\"DFA nodes/group64/bytes used: %d/%d/%zu\\n\"\n \t\t\"match nodes/bytes used: %d/%zu\\n\"\n \t\t\"total: %zu bytes\\n\",\n \t\tctx->name, ctx->socket_id,\n \t\tcounts->single, counts->single * sizeof(uint64_t),\n-\t\tcounts->quad, counts->quad_vectors * sizeof(uint64_t),\n-\t\tcounts->dfa, counts->dfa * RTE_ACL_DFA_SIZE * sizeof(uint64_t),\n+\t\tcounts->quad, counts->quad_vectors,\n+\t\t(indices->quad_index - indices->dfa_index) * sizeof(uint64_t),\n+\t\tcounts->dfa, counts->dfa_gr64,\n+\t\tindices->dfa_index * sizeof(uint64_t),\n \t\tcounts->match,\n \t\tcounts->match * sizeof(struct rte_acl_match_results),\n \t\tctx->mem_sz);\n }\n \n+static uint64_t\n+acl_dfa_gen_idx(const struct rte_acl_node *node, uint32_t index)\n+{\n+\tuint64_t idx;\n+\tuint32_t i;\n+\n+\tidx = 0;\n+\tfor (i = 0; i != RTE_DIM(node->dfa_gr64); i++) {\n+\t\tRTE_ACL_VERIFY(node->dfa_gr64[i] < RTE_ACL_DFA_GR64_NUM);\n+\t\tRTE_ACL_VERIFY(node->dfa_gr64[i] < node->fanout);\n+\t\tidx |= (i - node->dfa_gr64[i]) <<\n+\t\t\t(6 + RTE_ACL_DFA_GR64_BIT * i);\n+\t}\n+\n+\treturn idx << (CHAR_BIT * sizeof(index)) | index | node->node_type;\n+}\n+\n+static void\n+acl_dfa_fill_gr64(const struct rte_acl_node *node,\n+\tconst uint64_t src[RTE_ACL_DFA_SIZE], uint64_t dst[RTE_ACL_DFA_SIZE])\n+{\n+\tuint32_t i;\n+\n+\tfor (i = 0; i != RTE_DIM(node->dfa_gr64); i++) {\n+\t\tmemcpy(dst + node->dfa_gr64[i] * RTE_ACL_DFA_GR64_SIZE,\n+\t\t\tsrc + i * RTE_ACL_DFA_GR64_SIZE,\n+\t\t\tRTE_ACL_DFA_GR64_SIZE * sizeof(dst[0]));\n+\t}\n+}\n+\n+static uint32_t\n+acl_dfa_count_gr64(const uint64_t array_ptr[RTE_ACL_DFA_SIZE],\n+\tuint8_t gr64[RTE_ACL_DFA_GR64_NUM])\n+{\n+\tuint32_t i, j, k;\n+\n+\tk = 0;\n+\tfor (i = 0; i != RTE_ACL_DFA_GR64_NUM; i++) {\n+\t\tgr64[i] = i;\n+\t\tfor (j = 0; j != i; j++) {\n+\t\t\tif (memcmp(array_ptr + i * RTE_ACL_DFA_GR64_SIZE,\n+\t\t\t\t\tarray_ptr + j * RTE_ACL_DFA_GR64_SIZE,\n+\t\t\t\t\tRTE_ACL_DFA_GR64_SIZE *\n+\t\t\t\t\tsizeof(array_ptr[0])) == 0)\n+\t\t\t\tbreak;\n+\t\t}\n+\t\tgr64[i] = (j != i) ? gr64[j] : k++;\n+\t}\n+\n+\treturn k;\n+}\n+\n+static uint32_t\n+acl_node_fill_dfa(const struct rte_acl_node *node,\n+\tuint64_t dfa[RTE_ACL_DFA_SIZE], uint64_t no_match, int32_t resolved)\n+{\n+\tuint32_t n, x;\n+\tuint32_t ranges, last_bit;\n+\tstruct rte_acl_node *child;\n+\tstruct rte_acl_bitset *bits;\n+\n+\tranges = 0;\n+\tlast_bit = 0;\n+\n+\tfor (n = 0; n < RTE_ACL_DFA_SIZE; n++)\n+\t\tdfa[n] = no_match;\n+\n+\tfor (x = 0; x < node->num_ptrs; x++) {\n+\n+\t\tchild = node->ptrs[x].ptr;\n+\t\tif (child == NULL)\n+\t\t\tcontinue;\n+\n+\t\tbits = &node->ptrs[x].values;\n+\t\tfor (n = 0; n < RTE_ACL_DFA_SIZE; n++) {\n+\n+\t\t\tif (bits->bits[n / (sizeof(bits_t) * CHAR_BIT)] &\n+\t\t\t\t(1 << (n % (sizeof(bits_t) * CHAR_BIT)))) {\n+\n+\t\t\t\tdfa[n] = resolved ? child->node_index : x;\n+\t\t\t\tranges += (last_bit == 0);\n+\t\t\t\tlast_bit = 1;\n+\t\t\t} else {\n+\t\t\t\tlast_bit = 0;\n+\t\t\t}\n+\t\t}\n+\t}\n+\n+\treturn ranges;\n+}\n+\n /*\n *  Counts the number of groups of sequential bits that are\n *  either 0 or 1, as specified by the zero_one parameter. This is used to\n@@ -150,10 +245,11 @@ acl_count_fanout(struct rte_acl_node *node)\n  */\n static int\n acl_count_trie_types(struct acl_node_counters *counts,\n-\tstruct rte_acl_node *node, int match, int force_dfa)\n+\tstruct rte_acl_node *node, uint64_t no_match, int match, int force_dfa)\n {\n \tuint32_t n;\n \tint num_ptrs;\n+\tuint64_t dfa[RTE_ACL_DFA_SIZE];\n \n \t/* skip if this node has been counted */\n \tif (node->node_type != (uint32_t)RTE_ACL_NODE_UNDEFINED)\n@@ -186,6 +282,16 @@ acl_count_trie_types(struct acl_node_counters *counts,\n \t} else {\n \t\tcounts->dfa++;\n \t\tnode->node_type = RTE_ACL_NODE_DFA;\n+\t\tif (force_dfa != 0) {\n+\t\t\t/* always expand to a max number of nodes. */\n+\t\t\tfor (n = 0; n != RTE_DIM(node->dfa_gr64); n++)\n+\t\t\t\tnode->dfa_gr64[n] = n;\n+\t\t\tnode->fanout = n;\n+\t\t} else {\n+\t\t\tacl_node_fill_dfa(node, dfa, no_match, 0);\n+\t\t\tnode->fanout = acl_dfa_count_gr64(dfa, node->dfa_gr64);\n+\t\t}\n+\t\tcounts->dfa_gr64 += node->fanout;\n \t}\n \n \t/*\n@@ -194,7 +300,7 @@ acl_count_trie_types(struct acl_node_counters *counts,\n \tfor (n = 0; n < node->num_ptrs; n++) {\n \t\tif (node->ptrs[n].ptr != NULL)\n \t\t\tmatch = acl_count_trie_types(counts, node->ptrs[n].ptr,\n-\t\t\t\tmatch, 0);\n+\t\t\t\tno_match, match, 0);\n \t}\n \n \treturn match;\n@@ -204,38 +310,11 @@ static void\n acl_add_ptrs(struct rte_acl_node *node, uint64_t *node_array, uint64_t no_match,\n \tint resolved)\n {\n-\tuint32_t n, x;\n-\tint m, ranges, last_bit;\n-\tstruct rte_acl_node *child;\n-\tstruct rte_acl_bitset *bits;\n+\tuint32_t x;\n+\tint32_t m;\n \tuint64_t *node_a, index, dfa[RTE_ACL_DFA_SIZE];\n \n-\tranges = 0;\n-\tlast_bit = 0;\n-\n-\tfor (n = 0; n < RTE_DIM(dfa); n++)\n-\t\tdfa[n] = no_match;\n-\n-\tfor (x = 0; x < node->num_ptrs; x++) {\n-\n-\t\tchild = node->ptrs[x].ptr;\n-\t\tif (child == NULL)\n-\t\t\tcontinue;\n-\n-\t\tbits = &node->ptrs[x].values;\n-\t\tfor (n = 0; n < RTE_DIM(dfa); n++) {\n-\n-\t\t\tif (bits->bits[n / (sizeof(bits_t) * CHAR_BIT)] &\n-\t\t\t\t(1 << (n % (sizeof(bits_t) * CHAR_BIT)))) {\n-\n-\t\t\t\tdfa[n] = resolved ? child->node_index : x;\n-\t\t\t\tranges += (last_bit == 0);\n-\t\t\t\tlast_bit = 1;\n-\t\t\t} else {\n-\t\t\t\tlast_bit = 0;\n-\t\t\t}\n-\t\t}\n-\t}\n+\tacl_node_fill_dfa(node, dfa, no_match, resolved);\n \n \t/*\n \t * Rather than going from 0 to 256, the range count and\n@@ -272,8 +351,7 @@ acl_add_ptrs(struct rte_acl_node *node, uint64_t *node_array, uint64_t no_match,\n \t\tRTE_ACL_VERIFY(m <= RTE_ACL_QUAD_SIZE);\n \n \t} else if (node->node_type == RTE_ACL_NODE_DFA && resolved) {\n-\t\tfor (n = 0; n < RTE_DIM(dfa); n++)\n-\t\t\tnode_array[n] = dfa[n];\n+\t\tacl_dfa_fill_gr64(node, dfa, node_array);\n \t}\n }\n \n@@ -286,7 +364,7 @@ static void\n acl_gen_node(struct rte_acl_node *node, uint64_t *node_array,\n \tuint64_t no_match, struct rte_acl_indices *index, int num_categories)\n {\n-\tuint32_t n, *qtrp;\n+\tuint32_t n, sz, *qtrp;\n \tuint64_t *array_ptr;\n \tstruct rte_acl_match_results *match;\n \n@@ -297,10 +375,11 @@ acl_gen_node(struct rte_acl_node *node, uint64_t *node_array,\n \n \tswitch (node->node_type) {\n \tcase RTE_ACL_NODE_DFA:\n-\t\tnode->node_index = index->dfa_index | node->node_type;\n \t\tarray_ptr = &node_array[index->dfa_index];\n-\t\tindex->dfa_index += RTE_ACL_DFA_SIZE;\n-\t\tfor (n = 0; n < RTE_ACL_DFA_SIZE; n++)\n+\t\tnode->node_index = acl_dfa_gen_idx(node, index->dfa_index);\n+\t\tsz = node->fanout * RTE_ACL_DFA_GR64_SIZE;\n+\t\tindex->dfa_index += sz;\n+\t\tfor (n = 0; n < sz; n++)\n \t\t\tarray_ptr[n] = no_match;\n \t\tbreak;\n \tcase RTE_ACL_NODE_SINGLE:\n@@ -312,7 +391,7 @@ acl_gen_node(struct rte_acl_node *node, uint64_t *node_array,\n \t\tbreak;\n \tcase RTE_ACL_NODE_QRANGE:\n \t\tarray_ptr = &node_array[index->quad_index];\n-\t\tacl_add_ptrs(node, array_ptr, no_match,  0);\n+\t\tacl_add_ptrs(node, array_ptr, no_match, 0);\n \t\tqtrp = (uint32_t *)node->transitions;\n \t\tnode->node_index = qtrp[0];\n \t\tnode->node_index <<= sizeof(index->quad_index) * CHAR_BIT;\n@@ -368,7 +447,7 @@ static int\n acl_calc_counts_indices(struct acl_node_counters *counts,\n \tstruct rte_acl_indices *indices, struct rte_acl_trie *trie,\n \tstruct rte_acl_bld_trie *node_bld_trie, uint32_t num_tries,\n-\tint match_num)\n+\tint match_num, uint64_t no_match)\n {\n \tuint32_t n;\n \n@@ -379,13 +458,13 @@ acl_calc_counts_indices(struct acl_node_counters *counts,\n \tfor (n = 0; n < num_tries; n++) {\n \t\tcounts->smallest_match = INT32_MAX;\n \t\tmatch_num = acl_count_trie_types(counts, node_bld_trie[n].trie,\n-\t\t\tmatch_num, 1);\n+\t\t\tno_match, match_num, 1);\n \t\ttrie[n].smallest = counts->smallest_match;\n \t}\n \n \tindices->dfa_index = RTE_ACL_DFA_SIZE + 1;\n \tindices->quad_index = indices->dfa_index +\n-\t\tcounts->dfa * RTE_ACL_DFA_SIZE;\n+\t\tcounts->dfa_gr64 * RTE_ACL_DFA_GR64_SIZE;\n \tindices->single_index = indices->quad_index + counts->quad_vectors;\n \tindices->match_index = indices->single_index + counts->single + 1;\n \tindices->match_index = RTE_ALIGN(indices->match_index,\n@@ -410,9 +489,11 @@ rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie,\n \tstruct acl_node_counters counts;\n \tstruct rte_acl_indices indices;\n \n+\tno_match = RTE_ACL_NODE_MATCH;\n+\n \t/* Fill counts and indices arrays from the nodes. */\n \tmatch_num = acl_calc_counts_indices(&counts, &indices, trie,\n-\t\tnode_bld_trie, num_tries, match_num);\n+\t\tnode_bld_trie, num_tries, match_num, no_match);\n \n \t/* Allocate runtime memory (align to cache boundary) */\n \ttotal_size = RTE_ALIGN(data_index_sz, RTE_CACHE_LINE_SIZE) +\n@@ -440,11 +521,11 @@ rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie,\n \t */\n \n \tnode_array[RTE_ACL_DFA_SIZE] = RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE;\n-\tno_match = RTE_ACL_NODE_MATCH;\n \n \tfor (n = 0; n < RTE_ACL_DFA_SIZE; n++)\n \t\tnode_array[n] = no_match;\n \n+\t/* NOMATCH result at index 0 */\n \tmatch = ((struct rte_acl_match_results *)(node_array + match_index));\n \tmemset(match, 0, sizeof(*match));\n \n@@ -470,6 +551,6 @@ rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie,\n \tctx->trans_table = node_array;\n \tmemcpy(ctx->trie, trie, sizeof(ctx->trie));\n \n-\tacl_gen_log_stats(ctx, &counts);\n+\tacl_gen_log_stats(ctx, &counts, &indices);\n \treturn 0;\n }\ndiff --git a/lib/librte_acl/acl_run_scalar.c b/lib/librte_acl/acl_run_scalar.c\nindex 43c8fc3..40691ce 100644\n--- a/lib/librte_acl/acl_run_scalar.c\n+++ b/lib/librte_acl/acl_run_scalar.c\n@@ -94,15 +94,6 @@ resolve_priority_scalar(uint64_t transition, int n,\n \t}\n }\n \n-/*\n- * When processing the transition, rather than using if/else\n- * construct, the offset is calculated for DFA and QRANGE and\n- * then conditionally added to the address based on node type.\n- * This is done to avoid branch mis-predictions. Since the\n- * offset is rather simple calculation it is more efficient\n- * to do the calculation and do a condition move rather than\n- * a conditional branch to determine which calculation to do.\n- */\n static inline uint32_t\n scan_forward(uint32_t input, uint32_t max)\n {\n@@ -117,18 +108,27 @@ scalar_transition(const uint64_t *trans_table, uint64_t transition,\n \n \t/* break transition into component parts */\n \tranges = transition >> (sizeof(index) * CHAR_BIT);\n-\n-\t/* calc address for a QRANGE node */\n-\tc = input * SCALAR_QRANGE_MULT;\n-\ta = ranges | SCALAR_QRANGE_MIN;\n \tindex = transition & ~RTE_ACL_NODE_INDEX;\n-\ta -= (c & SCALAR_QRANGE_MASK);\n-\tb = c & SCALAR_QRANGE_MIN;\n \taddr = transition ^ index;\n-\ta &= SCALAR_QRANGE_MIN;\n-\ta ^= (ranges ^ b) & (a ^ b);\n-\tx = scan_forward(a, 32) >> 3;\n-\taddr += (index == RTE_ACL_NODE_DFA) ? input : x;\n+\n+\tif (index != RTE_ACL_NODE_DFA) {\n+\t\t/* calc address for a QRANGE/SINGLE node */\n+\t\tc = (uint32_t)input * SCALAR_QRANGE_MULT;\n+\t\ta = ranges | SCALAR_QRANGE_MIN;\n+\t\ta -= (c & SCALAR_QRANGE_MASK);\n+\t\tb = c & SCALAR_QRANGE_MIN;\n+\t\ta &= SCALAR_QRANGE_MIN;\n+\t\ta ^= (ranges ^ b) & (a ^ b);\n+\t\tx = scan_forward(a, 32) >> 3;\n+\t} else {\n+\t\t/* calc address for a DFA node */\n+\t\tx = ranges >> (input /\n+\t\t\tRTE_ACL_DFA_GR64_SIZE * RTE_ACL_DFA_GR64_BIT);\n+\t\tx &= UINT8_MAX;\n+\t\tx = input - x;\n+\t}\n+\n+\taddr += x;\n \n \t/* pickup next transition */\n \ttransition = *(trans_table + addr);\ndiff --git a/lib/librte_acl/acl_run_sse.c b/lib/librte_acl/acl_run_sse.c\nindex 69a9d77..576c92b 100644\n--- a/lib/librte_acl/acl_run_sse.c\n+++ b/lib/librte_acl/acl_run_sse.c\n@@ -40,24 +40,6 @@ enum {\n \tSHUFFLE32_SWAP64 = 0x4e,\n };\n \n-static const rte_xmm_t mm_type_quad_range = {\n-\t.u32 = {\n-\t\tRTE_ACL_NODE_QRANGE,\n-\t\tRTE_ACL_NODE_QRANGE,\n-\t\tRTE_ACL_NODE_QRANGE,\n-\t\tRTE_ACL_NODE_QRANGE,\n-\t},\n-};\n-\n-static const rte_xmm_t mm_type_quad_range64 = {\n-\t.u32 = {\n-\t\tRTE_ACL_NODE_QRANGE,\n-\t\tRTE_ACL_NODE_QRANGE,\n-\t\t0,\n-\t\t0,\n-\t},\n-};\n-\n static const rte_xmm_t mm_shuffle_input = {\n \t.u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c},\n };\n@@ -70,14 +52,6 @@ static const rte_xmm_t mm_ones_16 = {\n \t.u16 = {1, 1, 1, 1, 1, 1, 1, 1},\n };\n \n-static const rte_xmm_t mm_bytes = {\n-\t.u32 = {UINT8_MAX, UINT8_MAX, UINT8_MAX, UINT8_MAX},\n-};\n-\n-static const rte_xmm_t mm_bytes64 = {\n-\t.u32 = {UINT8_MAX, UINT8_MAX, 0, 0},\n-};\n-\n static const rte_xmm_t mm_match_mask = {\n \t.u32 = {\n \t\tRTE_ACL_NODE_MATCH,\n@@ -236,10 +210,14 @@ acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,\n  */\n static inline xmm_t\n acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,\n-\txmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,\n-\txmm_t *indices1, xmm_t *indices2)\n+\txmm_t ones_16, xmm_t indices1, xmm_t indices2)\n {\n-\txmm_t addr, node_types, temp;\n+\txmm_t addr, node_types, range, temp;\n+\txmm_t dfa_msk, dfa_ofs, quad_ofs;\n+\txmm_t in, r, t;\n+\n+\tconst xmm_t range_base = _mm_set_epi32(0xffffff0c, 0xffffff08,\n+\t\t0xffffff04, 0xffffff00);\n \n \t/*\n \t * Note that no transition is done for a match\n@@ -248,10 +226,13 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,\n \t */\n \n \t/* Shuffle low 32 into temp and high 32 into indices2 */\n-\ttemp = (xmm_t)MM_SHUFFLEPS((__m128)*indices1, (__m128)*indices2,\n-\t\t0x88);\n-\t*indices2 = (xmm_t)MM_SHUFFLEPS((__m128)*indices1,\n-\t\t(__m128)*indices2, 0xdd);\n+\ttemp = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0x88);\n+\trange = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0xdd);\n+\n+\tt = MM_XOR(index_mask, index_mask);\n+\n+\t/* shuffle input byte to all 4 positions of 32 bit value */\n+\tin = MM_SHUFFLE8(next_input, shuffle_input);\n \n \t/* Calc node type and node addr */\n \tnode_types = MM_ANDNOT(index_mask, temp);\n@@ -262,17 +243,15 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,\n \t */\n \n \t/* mask for DFA type (0) nodes */\n-\ttemp = MM_CMPEQ32(node_types, MM_XOR(node_types, node_types));\n+\tdfa_msk = MM_CMPEQ32(node_types, t);\n \n-\t/* add input byte to DFA position */\n-\ttemp = MM_AND(temp, bytes);\n-\ttemp = MM_AND(temp, next_input);\n-\taddr = MM_ADD32(addr, temp);\n+\tr = _mm_srli_epi32(in, 30);\n+\tr = _mm_add_epi8(r, range_base);\n \n-\t/*\n-\t * Calc addr for Range nodes -> range_index + range(input)\n-\t */\n-\tnode_types = MM_CMPEQ32(node_types, type_quad_range);\n+\tt = _mm_srli_epi32(in, 24);\n+\tr = _mm_shuffle_epi8(range, r);\n+\n+\tdfa_ofs = _mm_sub_epi32(t, r);\n \n \t/*\n \t * Calculate number of range boundaries that are less than the\n@@ -282,11 +261,8 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,\n \t * input byte.\n \t */\n \n-\t/* shuffle input byte to all 4 positions of 32 bit value */\n-\ttemp = MM_SHUFFLE8(next_input, shuffle_input);\n-\n \t/* check ranges */\n-\ttemp = MM_CMPGT8(temp, *indices2);\n+\ttemp = MM_CMPGT8(in, range);\n \n \t/* convert -1 to 1 (bytes greater than input byte */\n \ttemp = MM_SIGN8(temp, temp);\n@@ -295,10 +271,10 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,\n \ttemp = MM_MADD8(temp, temp);\n \n \t/* horizontal add pairs of words into dwords */\n-\ttemp = MM_MADD16(temp, ones_16);\n+\tquad_ofs = MM_MADD16(temp, ones_16);\n \n \t/* mask to range type nodes */\n-\ttemp = MM_AND(temp, node_types);\n+\ttemp = _mm_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);\n \n \t/* add index into node position */\n \treturn MM_ADD32(addr, temp);\n@@ -309,8 +285,8 @@ acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,\n  */\n static inline xmm_t\n transition4(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,\n-\txmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,\n-\tconst uint64_t *trans, xmm_t *indices1, xmm_t *indices2)\n+\txmm_t ones_16, const uint64_t *trans,\n+\txmm_t *indices1, xmm_t *indices2)\n {\n \txmm_t addr;\n \tuint64_t trans0, trans2;\n@@ -318,7 +294,7 @@ transition4(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,\n \t /* Calculate the address (array index) for all 4 transitions. */\n \n \taddr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,\n-\t\tbytes, type_quad_range, indices1, indices2);\n+\t\t*indices1, *indices2);\n \n \t /* Gather 64 bit transitions and pack back into 2 registers. */\n \n@@ -408,42 +384,34 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,\n \n \t\tinput0 = transition4(mm_index_mask.m, input0,\n \t\t\tmm_shuffle_input.m, mm_ones_16.m,\n-\t\t\tmm_bytes.m, mm_type_quad_range.m,\n \t\t\tflows.trans, &indices1, &indices2);\n \n \t\tinput1 = transition4(mm_index_mask.m, input1,\n \t\t\tmm_shuffle_input.m, mm_ones_16.m,\n-\t\t\tmm_bytes.m, mm_type_quad_range.m,\n \t\t\tflows.trans, &indices3, &indices4);\n \n \t\tinput0 = transition4(mm_index_mask.m, input0,\n \t\t\tmm_shuffle_input.m, mm_ones_16.m,\n-\t\t\tmm_bytes.m, mm_type_quad_range.m,\n \t\t\tflows.trans, &indices1, &indices2);\n \n \t\tinput1 = transition4(mm_index_mask.m, input1,\n \t\t\tmm_shuffle_input.m, mm_ones_16.m,\n-\t\t\tmm_bytes.m, mm_type_quad_range.m,\n \t\t\tflows.trans, &indices3, &indices4);\n \n \t\tinput0 = transition4(mm_index_mask.m, input0,\n \t\t\tmm_shuffle_input.m, mm_ones_16.m,\n-\t\t\tmm_bytes.m, mm_type_quad_range.m,\n \t\t\tflows.trans, &indices1, &indices2);\n \n \t\tinput1 = transition4(mm_index_mask.m, input1,\n \t\t\tmm_shuffle_input.m, mm_ones_16.m,\n-\t\t\tmm_bytes.m, mm_type_quad_range.m,\n \t\t\tflows.trans, &indices3, &indices4);\n \n \t\tinput0 = transition4(mm_index_mask.m, input0,\n \t\t\tmm_shuffle_input.m, mm_ones_16.m,\n-\t\t\tmm_bytes.m, mm_type_quad_range.m,\n \t\t\tflows.trans, &indices1, &indices2);\n \n \t\tinput1 = transition4(mm_index_mask.m, input1,\n \t\t\tmm_shuffle_input.m, mm_ones_16.m,\n-\t\t\tmm_bytes.m, mm_type_quad_range.m,\n \t\t\tflows.trans, &indices3, &indices4);\n \n \t\t /* Check for any matches. */\n@@ -496,22 +464,18 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,\n \t\t/* Process the 4 bytes of input on each stream. */\n \t\tinput = transition4(mm_index_mask.m, input,\n \t\t\tmm_shuffle_input.m, mm_ones_16.m,\n-\t\t\tmm_bytes.m, mm_type_quad_range.m,\n \t\t\tflows.trans, &indices1, &indices2);\n \n \t\t input = transition4(mm_index_mask.m, input,\n \t\t\tmm_shuffle_input.m, mm_ones_16.m,\n-\t\t\tmm_bytes.m, mm_type_quad_range.m,\n \t\t\tflows.trans, &indices1, &indices2);\n \n \t\t input = transition4(mm_index_mask.m, input,\n \t\t\tmm_shuffle_input.m, mm_ones_16.m,\n-\t\t\tmm_bytes.m, mm_type_quad_range.m,\n \t\t\tflows.trans, &indices1, &indices2);\n \n \t\t input = transition4(mm_index_mask.m, input,\n \t\t\tmm_shuffle_input.m, mm_ones_16.m,\n-\t\t\tmm_bytes.m, mm_type_quad_range.m,\n \t\t\tflows.trans, &indices1, &indices2);\n \n \t\t/* Check for any matches. */\n@@ -524,8 +488,7 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,\n \n static inline xmm_t\n transition2(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,\n-\txmm_t ones_16, xmm_t bytes, xmm_t type_quad_range,\n-\tconst uint64_t *trans, xmm_t *indices1)\n+\txmm_t ones_16, const uint64_t *trans, xmm_t *indices1)\n {\n \tuint64_t t;\n \txmm_t addr, indices2;\n@@ -533,7 +496,7 @@ transition2(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,\n \tindices2 = MM_XOR(ones_16, ones_16);\n \n \taddr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,\n-\t\tbytes, type_quad_range, indices1, &indices2);\n+\t\t*indices1, indices2);\n \n \t/* Gather 64 bit transitions and pack 2 per register. */\n \n@@ -583,22 +546,18 @@ search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data,\n \n \t\tinput = transition2(mm_index_mask64.m, input,\n \t\t\tmm_shuffle_input64.m, mm_ones_16.m,\n-\t\t\tmm_bytes64.m, mm_type_quad_range64.m,\n \t\t\tflows.trans, &indices);\n \n \t\tinput = transition2(mm_index_mask64.m, input,\n \t\t\tmm_shuffle_input64.m, mm_ones_16.m,\n-\t\t\tmm_bytes64.m, mm_type_quad_range64.m,\n \t\t\tflows.trans, &indices);\n \n \t\tinput = transition2(mm_index_mask64.m, input,\n \t\t\tmm_shuffle_input64.m, mm_ones_16.m,\n-\t\t\tmm_bytes64.m, mm_type_quad_range64.m,\n \t\t\tflows.trans, &indices);\n \n \t\tinput = transition2(mm_index_mask64.m, input,\n \t\t\tmm_shuffle_input64.m, mm_ones_16.m,\n-\t\t\tmm_bytes64.m, mm_type_quad_range64.m,\n \t\t\tflows.trans, &indices);\n \n \t\t/* Check for any matches. */\n",
    "prefixes": [
        "dpdk-dev",
        "v3",
        "06/18"
    ]
}