get:
Show a patch.

patch:
Partially update a patch; only the fields supplied in the request are changed.

put:
Update a patch; all writable fields are replaced.
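
As an illustration only, here is a minimal Python sketch of exercising these methods with the third-party requests library. The token value is a hypothetical placeholder; write access via PUT/PATCH is assumed to require an API token with maintainer rights on the project, while GET needs no authentication for public projects:

    import requests

    API = "https://patches.dpdk.org/api/patches/2408/"
    TOKEN = "0123deadbeef"  # hypothetical placeholder, not a real token

    # GET: show the patch.
    patch = requests.get(API).json()
    print(patch["name"], "->", patch["state"])

    # PATCH: partial update; send only the fields that should change.
    resp = requests.patch(
        API,
        headers={"Authorization": "Token " + TOKEN},
        json={"state": "accepted"},
    )
    resp.raise_for_status()

The exchange below shows the shape of the JSON such a GET request returns.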

GET /api/patches/2408/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 2408,
    "url": "https://patches.dpdk.org/api/patches/2408/?format=api",
    "web_url": "https://patches.dpdk.org/project/dpdk/patch/1421779267-18492-12-git-send-email-konstantin.ananyev@intel.com/",
    "project": {
        "id": 1,
        "url": "https://patches.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<1421779267-18492-12-git-send-email-konstantin.ananyev@intel.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/1421779267-18492-12-git-send-email-konstantin.ananyev@intel.com",
    "date": "2015-01-20T18:41:00",
    "name": "[dpdk-dev,v3,11/18] librte_acl: add AVX2 as new rte_acl_classify() method",
    "commit_ref": null,
    "pull_url": null,
    "state": "accepted",
    "archived": true,
    "hash": "97fca9f5f494ca5447a67a773e100b5e6e2651ad",
    "submitter": {
        "id": 33,
        "url": "https://patches.dpdk.org/api/people/33/?format=api",
        "name": "Ananyev, Konstantin",
        "email": "konstantin.ananyev@intel.com"
    },
    "delegate": null,
    "mbox": "https://patches.dpdk.org/project/dpdk/patch/1421779267-18492-12-git-send-email-konstantin.ananyev@intel.com/mbox/",
    "series": [],
    "comments": "https://patches.dpdk.org/api/patches/2408/comments/",
    "check": "pending",
    "checks": "https://patches.dpdk.org/api/patches/2408/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@dpdk.org",
        "Delivered-To": "patchwork@dpdk.org",
        "Received": [
            "from [92.243.14.124] (localhost [IPv6:::1])\n\tby dpdk.org (Postfix) with ESMTP id 8B5115ABE;\n\tTue, 20 Jan 2015 19:41:49 +0100 (CET)",
            "from mga02.intel.com (mga02.intel.com [134.134.136.20])\n\tby dpdk.org (Postfix) with ESMTP id 603075A9D\n\tfor <dev@dpdk.org>; Tue, 20 Jan 2015 19:41:24 +0100 (CET)",
            "from fmsmga001.fm.intel.com ([10.253.24.23])\n\tby orsmga101.jf.intel.com with ESMTP; 20 Jan 2015 10:41:23 -0800",
            "from irvmail001.ir.intel.com ([163.33.26.43])\n\tby fmsmga001.fm.intel.com with ESMTP; 20 Jan 2015 10:41:21 -0800",
            "from sivswdev02.ir.intel.com (sivswdev02.ir.intel.com\n\t[10.237.217.46])\n\tby irvmail001.ir.intel.com (8.14.3/8.13.6/MailSET/Hub) with ESMTP id\n\tt0KIfK6f029507; Tue, 20 Jan 2015 18:41:20 GMT",
            "from sivswdev02.ir.intel.com (localhost [127.0.0.1])\n\tby sivswdev02.ir.intel.com with ESMTP id t0KIfK2m018862;\n\tTue, 20 Jan 2015 18:41:20 GMT",
            "(from kananye1@localhost)\n\tby sivswdev02.ir.intel.com with  id t0KIfKgL018858;\n\tTue, 20 Jan 2015 18:41:20 GMT"
        ],
        "X-ExtLoop1": "1",
        "X-IronPort-AV": "E=Sophos;i=\"5.09,435,1418112000\"; d=\"scan'208\";a=\"653850319\"",
        "From": "Konstantin Ananyev <konstantin.ananyev@intel.com>",
        "To": "dev@dpdk.org",
        "Date": "Tue, 20 Jan 2015 18:41:00 +0000",
        "Message-Id": "<1421779267-18492-12-git-send-email-konstantin.ananyev@intel.com>",
        "X-Mailer": "git-send-email 1.7.4.1",
        "In-Reply-To": "<1421779267-18492-1-git-send-email-konstantin.ananyev@intel.com>",
        "References": "<1421779267-18492-1-git-send-email-konstantin.ananyev@intel.com>",
        "Subject": "[dpdk-dev] [PATCH v3 11/18] librte_acl: add AVX2 as new\n\trte_acl_classify() method",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.15",
        "Precedence": "list",
        "List-Id": "patches and discussions about DPDK <dev.dpdk.org>",
        "List-Unsubscribe": "<http://dpdk.org/ml/options/dev>,\n\t<mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://dpdk.org/ml/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<http://dpdk.org/ml/listinfo/dev>,\n\t<mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org",
        "Sender": "\"dev\" <dev-bounces@dpdk.org>"
    },
    "content": "v2 changes:\nWhen build with the compilers that don't support AVX2 instructions,\nmake rte_acl_classify_avx2() do nothing and return an error.\nRemove unneeded 'ifdef __AVX2__' in acl_run_avx2.*.\n\nIntroduce new classify() method that uses AVX2 instructions.\nFrom my measurements:\nOn HSW boards when processing >= 16 packets per call,\nAVX2 method outperforms it's SSE counterpart by 10-25%,\n(depending on the ruleset).\nAt runtime, if librte_acl was build with the compiler that supports AVX2,\nthis method is selected as default one on HW that supports AVX2.\n\nSigned-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>\n---\n lib/librte_acl/Makefile       |  18 ++\n lib/librte_acl/acl.h          |   4 +\n lib/librte_acl/acl_run.h      |   2 +-\n lib/librte_acl/acl_run_avx2.c |  54 +++++\n lib/librte_acl/acl_run_avx2.h | 301 +++++++++++++++++++++++\n lib/librte_acl/acl_run_sse.c  | 537 +-----------------------------------------\n lib/librte_acl/acl_run_sse.h  | 533 +++++++++++++++++++++++++++++++++++++++++\n lib/librte_acl/rte_acl.c      |  27 +++\n lib/librte_acl/rte_acl.h      |   2 +\n 9 files changed, 941 insertions(+), 537 deletions(-)\n create mode 100644 lib/librte_acl/acl_run_avx2.c\n create mode 100644 lib/librte_acl/acl_run_avx2.h\n create mode 100644 lib/librte_acl/acl_run_sse.h",
    "diff": "diff --git a/lib/librte_acl/Makefile b/lib/librte_acl/Makefile\nindex 65e566d..6b74dc9 100644\n--- a/lib/librte_acl/Makefile\n+++ b/lib/librte_acl/Makefile\n@@ -48,6 +48,24 @@ SRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_sse.c\n \n CFLAGS_acl_run_sse.o += -msse4.1\n \n+#\n+# If the compiler supports AVX2 instructions,\n+# then add support for AVX2 classify method.\n+#\n+\n+CC_AVX2_SUPPORT=$(shell $(CC) -march=core-avx2 -dM -E - </dev/null 2>&1 | \\\n+grep -q AVX2 && echo 1)\n+\n+ifeq ($(CC_AVX2_SUPPORT), 1)\n+\tSRCS-$(CONFIG_RTE_LIBRTE_ACL) += acl_run_avx2.c\n+\tCFLAGS_rte_acl.o += -DCC_AVX2_SUPPORT\n+\tifeq ($(CC), icc)\n+\tCFLAGS_acl_run_avx2.o += -march=core-avx2\n+\telse\n+\tCFLAGS_acl_run_avx2.o += -mavx2\n+\tendif\n+endif\n+\n # install this header file\n SYMLINK-$(CONFIG_RTE_LIBRTE_ACL)-include := rte_acl_osdep.h\n SYMLINK-$(CONFIG_RTE_LIBRTE_ACL)-include += rte_acl.h\ndiff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h\nindex 96bb318..d33d7ad 100644\n--- a/lib/librte_acl/acl.h\n+++ b/lib/librte_acl/acl.h\n@@ -196,6 +196,10 @@ int\n rte_acl_classify_sse(const struct rte_acl_ctx *ctx, const uint8_t **data,\n \tuint32_t *results, uint32_t num, uint32_t categories);\n \n+int\n+rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data,\n+\tuint32_t *results, uint32_t num, uint32_t categories);\n+\n #ifdef __cplusplus\n }\n #endif /* __cplusplus */\ndiff --git a/lib/librte_acl/acl_run.h b/lib/librte_acl/acl_run.h\nindex 4c843c1..850bc81 100644\n--- a/lib/librte_acl/acl_run.h\n+++ b/lib/librte_acl/acl_run.h\n@@ -35,9 +35,9 @@\n #define\t_ACL_RUN_H_\n \n #include <rte_acl.h>\n-#include \"acl_vect.h\"\n #include \"acl.h\"\n \n+#define MAX_SEARCHES_AVX16\t16\n #define MAX_SEARCHES_SSE8\t8\n #define MAX_SEARCHES_SSE4\t4\n #define MAX_SEARCHES_SSE2\t2\ndiff --git a/lib/librte_acl/acl_run_avx2.c b/lib/librte_acl/acl_run_avx2.c\nnew file mode 100644\nindex 0000000..0a42f72\n--- /dev/null\n+++ b/lib/librte_acl/acl_run_avx2.c\n@@ -0,0 +1,54 @@\n+/*-\n+ *   BSD LICENSE\n+ *\n+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.\n+ *   All rights reserved.\n+ *\n+ *   Redistribution and use in source and binary forms, with or without\n+ *   modification, are permitted provided that the following conditions\n+ *   are met:\n+ *\n+ *     * Redistributions of source code must retain the above copyright\n+ *       notice, this list of conditions and the following disclaimer.\n+ *     * Redistributions in binary form must reproduce the above copyright\n+ *       notice, this list of conditions and the following disclaimer in\n+ *       the documentation and/or other materials provided with the\n+ *       distribution.\n+ *     * Neither the name of Intel Corporation nor the names of its\n+ *       contributors may be used to endorse or promote products derived\n+ *       from this software without specific prior written permission.\n+ *\n+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n+ *   \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\n+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n+ */\n+\n+\n+#include \"acl_run_avx2.h\"\n+\n+/*\n+ * Note, that to be able to use AVX2 classify method,\n+ * both compiler and target cpu have to support AVX2 instructions.\n+ */\n+int\n+rte_acl_classify_avx2(const struct rte_acl_ctx *ctx, const uint8_t **data,\n+\tuint32_t *results, uint32_t num, uint32_t categories)\n+{\n+\tif (likely(num >= MAX_SEARCHES_AVX16))\n+\t\treturn search_avx2x16(ctx, data, results, num, categories);\n+\telse if (num >= MAX_SEARCHES_SSE8)\n+\t\treturn search_sse_8(ctx, data, results, num, categories);\n+\telse if (num >= MAX_SEARCHES_SSE4)\n+\t\treturn search_sse_4(ctx, data, results, num, categories);\n+\telse\n+\t\treturn search_sse_2(ctx, data, results, num,\n+\t\t\tcategories);\n+}\ndiff --git a/lib/librte_acl/acl_run_avx2.h b/lib/librte_acl/acl_run_avx2.h\nnew file mode 100644\nindex 0000000..1688c50\n--- /dev/null\n+++ b/lib/librte_acl/acl_run_avx2.h\n@@ -0,0 +1,301 @@\n+/*-\n+ *   BSD LICENSE\n+ *\n+ *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.\n+ *   All rights reserved.\n+ *\n+ *   Redistribution and use in source and binary forms, with or without\n+ *   modification, are permitted provided that the following conditions\n+ *   are met:\n+ *\n+ *     * Redistributions of source code must retain the above copyright\n+ *       notice, this list of conditions and the following disclaimer.\n+ *     * Redistributions in binary form must reproduce the above copyright\n+ *       notice, this list of conditions and the following disclaimer in\n+ *       the documentation and/or other materials provided with the\n+ *       distribution.\n+ *     * Neither the name of Intel Corporation nor the names of its\n+ *       contributors may be used to endorse or promote products derived\n+ *       from this software without specific prior written permission.\n+ *\n+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n+ *   \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT\n+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n+ */\n+\n+#include \"acl_run_sse.h\"\n+\n+static const rte_ymm_t ymm_match_mask = {\n+\t.u32 = {\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t},\n+};\n+\n+static const rte_ymm_t ymm_index_mask = {\n+\t.u32 = {\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t},\n+};\n+\n+static const rte_ymm_t ymm_shuffle_input = {\n+\t.u32 = {\n+\t\t0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,\n+\t\t0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,\n+\t},\n+};\n+\n+static const rte_ymm_t ymm_ones_16 = {\n+\t.u16 = {\n+\t\t1, 1, 1, 1, 1, 1, 1, 1,\n+\t\t1, 1, 1, 1, 1, 1, 1, 1,\n+\t},\n+};\n+\n+static inline __attribute__((always_inline)) ymm_t\n+calc_addr_avx2(ymm_t index_mask, ymm_t next_input, ymm_t shuffle_input,\n+\tymm_t ones_16, ymm_t tr_lo, ymm_t tr_hi)\n+{\n+\tymm_t in, node_type, r, t;\n+\tymm_t dfa_msk, dfa_ofs, quad_ofs;\n+\tymm_t addr;\n+\n+\tconst ymm_t range_base = _mm256_set_epi32(\n+\t\t0xffffff0c, 0xffffff08, 0xffffff04, 0xffffff00,\n+\t\t0xffffff0c, 0xffffff08, 0xffffff04, 0xffffff00);\n+\n+\tt = _mm256_xor_si256(index_mask, index_mask);\n+\tin = _mm256_shuffle_epi8(next_input, shuffle_input);\n+\n+\t/* Calc node type and node addr */\n+\tnode_type = _mm256_andnot_si256(index_mask, tr_lo);\n+\taddr = _mm256_and_si256(index_mask, tr_lo);\n+\n+\t/* DFA calculations. */\n+\n+\tdfa_msk = _mm256_cmpeq_epi32(node_type, t);\n+\n+\tr = _mm256_srli_epi32(in, 30);\n+\tr = _mm256_add_epi8(r, range_base);\n+\n+\tt = _mm256_srli_epi32(in, 24);\n+\tr = _mm256_shuffle_epi8(tr_hi, r);\n+\n+\tdfa_ofs = _mm256_sub_epi32(t, r);\n+\n+\t/* QUAD/SINGLE caluclations. */\n+\n+\tt = _mm256_cmpgt_epi8(in, tr_hi);\n+\tt = _mm256_sign_epi8(t, t);\n+\tt = _mm256_maddubs_epi16(t, t);\n+\tquad_ofs = _mm256_madd_epi16(t, ones_16);\n+\n+\t/* blend DFA and QUAD/SINGLE. */\n+\tt = _mm256_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);\n+\n+\taddr = _mm256_add_epi32(addr, t);\n+\treturn addr;\n+}\n+\n+static inline __attribute__((always_inline)) ymm_t\n+transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)\n+{\n+\tconst int32_t *tr;\n+\tymm_t addr;\n+\n+\ttr = (const int32_t *)(uintptr_t)trans;\n+\n+\taddr = calc_addr_avx2(ymm_index_mask.y, next_input, ymm_shuffle_input.y,\n+\t\tymm_ones_16.y, *tr_lo, *tr_hi);\n+\n+\t/* load lower 32 bits of 8 transactions at once. */\n+\t*tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0]));\n+\n+\tnext_input = _mm256_srli_epi32(next_input, CHAR_BIT);\n+\n+\t/* load high 32 bits of 8 transactions at once. 
*/\n+\t*tr_hi = _mm256_i32gather_epi32(tr + 1, addr, sizeof(trans[0]));\n+\n+\treturn next_input;\n+}\n+\n+static inline void\n+acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,\n+\tstruct parms *parms, struct acl_flow_data *flows, uint32_t slot,\n+\tymm_t matches, ymm_t *tr_lo, ymm_t *tr_hi)\n+{\n+\tymm_t t0, t1;\n+\tymm_t lo, hi;\n+\txmm_t l0, l1;\n+\tuint32_t i;\n+\tuint64_t tr[MAX_SEARCHES_SSE8];\n+\n+\tl1 = _mm256_extracti128_si256(*tr_lo, 1);\n+\tl0 = _mm256_castsi256_si128(*tr_lo);\n+\n+\tfor (i = 0; i != RTE_DIM(tr) / 2; i++) {\n+\t\ttr[i] = (uint32_t)_mm_cvtsi128_si32(l0);\n+\t\ttr[i + 4] = (uint32_t)_mm_cvtsi128_si32(l1);\n+\n+\t\tl0 = _mm_srli_si128(l0, sizeof(uint32_t));\n+\t\tl1 = _mm_srli_si128(l1, sizeof(uint32_t));\n+\n+\t\ttr[i] = acl_match_check(tr[i], slot + i,\n+\t\t\tctx, parms, flows, resolve_priority_sse);\n+\t\ttr[i + 4] = acl_match_check(tr[i + 4], slot + i + 4,\n+\t\t\tctx, parms, flows, resolve_priority_sse);\n+\t}\n+\n+\tt0 = _mm256_set_epi64x(tr[5], tr[4], tr[1], tr[0]);\n+\tt1 = _mm256_set_epi64x(tr[7], tr[6], tr[3], tr[2]);\n+\n+\tlo = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);\n+\thi = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);\n+\n+\t*tr_lo = _mm256_blendv_epi8(*tr_lo, lo, matches);\n+\t*tr_hi = _mm256_blendv_epi8(*tr_hi, hi, matches);\n+}\n+\n+static inline void\n+acl_match_check_avx2x8(const struct rte_acl_ctx *ctx, struct parms *parms,\n+\tstruct acl_flow_data *flows, uint32_t slot,\n+\tymm_t *tr_lo, ymm_t *tr_hi, ymm_t match_mask)\n+{\n+\tuint32_t msk;\n+\tymm_t matches, temp;\n+\n+\t/* test for match node */\n+\ttemp = _mm256_and_si256(match_mask, *tr_lo);\n+\tmatches = _mm256_cmpeq_epi32(temp, match_mask);\n+\tmsk = _mm256_movemask_epi8(matches);\n+\n+\twhile (msk != 0) {\n+\n+\t\tacl_process_matches_avx2x8(ctx, parms, flows, slot,\n+\t\t\tmatches, tr_lo, tr_hi);\n+\t\ttemp = _mm256_and_si256(match_mask, *tr_lo);\n+\t\tmatches = _mm256_cmpeq_epi32(temp, match_mask);\n+\t\tmsk = _mm256_movemask_epi8(matches);\n+\t}\n+}\n+\n+static inline int\n+search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data,\n+\tuint32_t *results, uint32_t total_packets, uint32_t categories)\n+{\n+\tuint32_t n;\n+\tstruct acl_flow_data flows;\n+\tuint64_t index_array[MAX_SEARCHES_AVX16];\n+\tstruct completion cmplt[MAX_SEARCHES_AVX16];\n+\tstruct parms parms[MAX_SEARCHES_AVX16];\n+\tymm_t input[2], tr_lo[2], tr_hi[2];\n+\tymm_t t0, t1;\n+\n+\tacl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,\n+\t\ttotal_packets, categories, ctx->trans_table);\n+\n+\tfor (n = 0; n < RTE_DIM(cmplt); n++) {\n+\t\tcmplt[n].count = 0;\n+\t\tindex_array[n] = acl_start_next_trie(&flows, parms, n, ctx);\n+\t}\n+\n+\tt0 = _mm256_set_epi64x(index_array[5], index_array[4],\n+\t\tindex_array[1], index_array[0]);\n+\tt1 = _mm256_set_epi64x(index_array[7], index_array[6],\n+\t\tindex_array[3], index_array[2]);\n+\n+\ttr_lo[0] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);\n+\ttr_hi[0] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);\n+\n+\tt0 = _mm256_set_epi64x(index_array[13], index_array[12],\n+\t\tindex_array[9], index_array[8]);\n+\tt1 = _mm256_set_epi64x(index_array[15], index_array[14],\n+\t\tindex_array[11], index_array[10]);\n+\n+\ttr_lo[1] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);\n+\ttr_hi[1] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);\n+\n+\t /* Check for any matches. 
*/\n+\tacl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0],\n+\t\tymm_match_mask.y);\n+\tacl_match_check_avx2x8(ctx, parms, &flows, 8, &tr_lo[1], &tr_hi[1],\n+\t\tymm_match_mask.y);\n+\n+\twhile (flows.started > 0) {\n+\n+\t\tuint32_t in[MAX_SEARCHES_SSE8];\n+\n+\t\t/* Gather 4 bytes of input data for first 8 flows. */\n+\t\tin[0] = GET_NEXT_4BYTES(parms, 0);\n+\t\tin[4] = GET_NEXT_4BYTES(parms, 4);\n+\t\tin[1] = GET_NEXT_4BYTES(parms, 1);\n+\t\tin[5] = GET_NEXT_4BYTES(parms, 5);\n+\t\tin[2] = GET_NEXT_4BYTES(parms, 2);\n+\t\tin[6] = GET_NEXT_4BYTES(parms, 6);\n+\t\tin[3] = GET_NEXT_4BYTES(parms, 3);\n+\t\tin[7] = GET_NEXT_4BYTES(parms, 7);\n+\t\tinput[0] = _mm256_set_epi32(in[7], in[6], in[5], in[4],\n+\t\t\tin[3], in[2], in[1], in[0]);\n+\n+\t\t/* Gather 4 bytes of input data for last 8 flows. */\n+\t\tin[0] = GET_NEXT_4BYTES(parms, 8);\n+\t\tin[4] = GET_NEXT_4BYTES(parms, 12);\n+\t\tin[1] = GET_NEXT_4BYTES(parms, 9);\n+\t\tin[5] = GET_NEXT_4BYTES(parms, 13);\n+\t\tin[2] = GET_NEXT_4BYTES(parms, 10);\n+\t\tin[6] = GET_NEXT_4BYTES(parms, 14);\n+\t\tin[3] = GET_NEXT_4BYTES(parms, 11);\n+\t\tin[7] = GET_NEXT_4BYTES(parms, 15);\n+\t\tinput[1] = _mm256_set_epi32(in[7], in[6], in[5], in[4],\n+\t\t\tin[3], in[2], in[1], in[0]);\n+\n+\t\tinput[0] = transition8(input[0], flows.trans,\n+\t\t\t&tr_lo[0], &tr_hi[0]);\n+\t\tinput[1] = transition8(input[1], flows.trans,\n+\t\t\t&tr_lo[1], &tr_hi[1]);\n+\n+\t\tinput[0] = transition8(input[0], flows.trans,\n+\t\t\t&tr_lo[0], &tr_hi[0]);\n+\t\tinput[1] = transition8(input[1], flows.trans,\n+\t\t\t&tr_lo[1], &tr_hi[1]);\n+\n+\t\tinput[0] = transition8(input[0], flows.trans,\n+\t\t\t&tr_lo[0], &tr_hi[0]);\n+\t\tinput[1] = transition8(input[1], flows.trans,\n+\t\t\t&tr_lo[1], &tr_hi[1]);\n+\n+\t\tinput[0] = transition8(input[0], flows.trans,\n+\t\t\t&tr_lo[0], &tr_hi[0]);\n+\t\tinput[1] = transition8(input[1], flows.trans,\n+\t\t\t&tr_lo[1], &tr_hi[1]);\n+\n+\t\t /* Check for any matches. 
*/\n+\t\tacl_match_check_avx2x8(ctx, parms, &flows, 0,\n+\t\t\t&tr_lo[0], &tr_hi[0], ymm_match_mask.y);\n+\t\tacl_match_check_avx2x8(ctx, parms, &flows, 8,\n+\t\t\t&tr_lo[1], &tr_hi[1], ymm_match_mask.y);\n+\t}\n+\n+\treturn 0;\n+}\ndiff --git a/lib/librte_acl/acl_run_sse.c b/lib/librte_acl/acl_run_sse.c\nindex 4605b58..77b32b3 100644\n--- a/lib/librte_acl/acl_run_sse.c\n+++ b/lib/librte_acl/acl_run_sse.c\n@@ -31,542 +31,7 @@\n  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n  */\n \n-#include \"acl_run.h\"\n-\n-enum {\n-\tSHUFFLE32_SLOT1 = 0xe5,\n-\tSHUFFLE32_SLOT2 = 0xe6,\n-\tSHUFFLE32_SLOT3 = 0xe7,\n-\tSHUFFLE32_SWAP64 = 0x4e,\n-};\n-\n-static const rte_xmm_t mm_shuffle_input = {\n-\t.u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c},\n-};\n-\n-static const rte_xmm_t mm_shuffle_input64 = {\n-\t.u32 = {0x00000000, 0x04040404, 0x80808080, 0x80808080},\n-};\n-\n-static const rte_xmm_t mm_ones_16 = {\n-\t.u16 = {1, 1, 1, 1, 1, 1, 1, 1},\n-};\n-\n-static const rte_xmm_t mm_match_mask = {\n-\t.u32 = {\n-\t\tRTE_ACL_NODE_MATCH,\n-\t\tRTE_ACL_NODE_MATCH,\n-\t\tRTE_ACL_NODE_MATCH,\n-\t\tRTE_ACL_NODE_MATCH,\n-\t},\n-};\n-\n-static const rte_xmm_t mm_match_mask64 = {\n-\t.u32 = {\n-\t\tRTE_ACL_NODE_MATCH,\n-\t\t0,\n-\t\tRTE_ACL_NODE_MATCH,\n-\t\t0,\n-\t},\n-};\n-\n-static const rte_xmm_t mm_index_mask = {\n-\t.u32 = {\n-\t\tRTE_ACL_NODE_INDEX,\n-\t\tRTE_ACL_NODE_INDEX,\n-\t\tRTE_ACL_NODE_INDEX,\n-\t\tRTE_ACL_NODE_INDEX,\n-\t},\n-};\n-\n-static const rte_xmm_t mm_index_mask64 = {\n-\t.u32 = {\n-\t\tRTE_ACL_NODE_INDEX,\n-\t\tRTE_ACL_NODE_INDEX,\n-\t\t0,\n-\t\t0,\n-\t},\n-};\n-\n-\n-/*\n- * Resolve priority for multiple results (sse version).\n- * This consists comparing the priority of the current traversal with the\n- * running set of results for the packet.\n- * For each result, keep a running array of the result (rule number) and\n- * its priority for each category.\n- */\n-static inline void\n-resolve_priority_sse(uint64_t transition, int n, const struct rte_acl_ctx *ctx,\n-\tstruct parms *parms, const struct rte_acl_match_results *p,\n-\tuint32_t categories)\n-{\n-\tuint32_t x;\n-\txmm_t results, priority, results1, priority1, selector;\n-\txmm_t *saved_results, *saved_priority;\n-\n-\tfor (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) {\n-\n-\t\tsaved_results = (xmm_t *)(&parms[n].cmplt->results[x]);\n-\t\tsaved_priority =\n-\t\t\t(xmm_t *)(&parms[n].cmplt->priority[x]);\n-\n-\t\t/* get results and priorities for completed trie */\n-\t\tresults = MM_LOADU((const xmm_t *)&p[transition].results[x]);\n-\t\tpriority = MM_LOADU((const xmm_t *)&p[transition].priority[x]);\n-\n-\t\t/* if this is not the first completed trie */\n-\t\tif (parms[n].cmplt->count != ctx->num_tries) {\n-\n-\t\t\t/* get running best results and their priorities */\n-\t\t\tresults1 = MM_LOADU(saved_results);\n-\t\t\tpriority1 = MM_LOADU(saved_priority);\n-\n-\t\t\t/* select results that are highest priority */\n-\t\t\tselector = MM_CMPGT32(priority1, priority);\n-\t\t\tresults = MM_BLENDV8(results, results1, selector);\n-\t\t\tpriority = MM_BLENDV8(priority, priority1, selector);\n-\t\t}\n-\n-\t\t/* save running best results and their priorities */\n-\t\tMM_STOREU(saved_results, results);\n-\t\tMM_STOREU(saved_priority, priority);\n-\t}\n-}\n-\n-/*\n- * Extract transitions from an XMM register and check for any matches\n- */\n-static void\n-acl_process_matches(xmm_t *indices, int slot, const struct rte_acl_ctx *ctx,\n-\tstruct parms *parms, struct acl_flow_data 
*flows)\n-{\n-\tuint64_t transition1, transition2;\n-\n-\t/* extract transition from low 64 bits. */\n-\ttransition1 = MM_CVT64(*indices);\n-\n-\t/* extract transition from high 64 bits. */\n-\t*indices = MM_SHUFFLE32(*indices, SHUFFLE32_SWAP64);\n-\ttransition2 = MM_CVT64(*indices);\n-\n-\ttransition1 = acl_match_check(transition1, slot, ctx,\n-\t\tparms, flows, resolve_priority_sse);\n-\ttransition2 = acl_match_check(transition2, slot + 1, ctx,\n-\t\tparms, flows, resolve_priority_sse);\n-\n-\t/* update indices with new transitions. */\n-\t*indices = MM_SET64(transition2, transition1);\n-}\n-\n-/*\n- * Check for a match in 2 transitions (contained in SSE register)\n- */\n-static inline void\n-acl_match_check_x2(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,\n-\tstruct acl_flow_data *flows, xmm_t *indices, xmm_t match_mask)\n-{\n-\txmm_t temp;\n-\n-\ttemp = MM_AND(match_mask, *indices);\n-\twhile (!MM_TESTZ(temp, temp)) {\n-\t\tacl_process_matches(indices, slot, ctx, parms, flows);\n-\t\ttemp = MM_AND(match_mask, *indices);\n-\t}\n-}\n-\n-/*\n- * Check for any match in 4 transitions (contained in 2 SSE registers)\n- */\n-static inline void\n-acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,\n-\tstruct acl_flow_data *flows, xmm_t *indices1, xmm_t *indices2,\n-\txmm_t match_mask)\n-{\n-\txmm_t temp;\n-\n-\t/* put low 32 bits of each transition into one register */\n-\ttemp = (xmm_t)MM_SHUFFLEPS((__m128)*indices1, (__m128)*indices2,\n-\t\t0x88);\n-\t/* test for match node */\n-\ttemp = MM_AND(match_mask, temp);\n-\n-\twhile (!MM_TESTZ(temp, temp)) {\n-\t\tacl_process_matches(indices1, slot, ctx, parms, flows);\n-\t\tacl_process_matches(indices2, slot + 2, ctx, parms, flows);\n-\n-\t\ttemp = (xmm_t)MM_SHUFFLEPS((__m128)*indices1,\n-\t\t\t\t\t(__m128)*indices2,\n-\t\t\t\t\t0x88);\n-\t\ttemp = MM_AND(match_mask, temp);\n-\t}\n-}\n-\n-/*\n- * Calculate the address of the next transition for\n- * all types of nodes. Note that only DFA nodes and range\n- * nodes actually transition to another node. Match\n- * nodes don't move.\n- */\n-static inline xmm_t\n-acl_calc_addr(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,\n-\txmm_t ones_16, xmm_t indices1, xmm_t indices2)\n-{\n-\txmm_t addr, node_types, range, temp;\n-\txmm_t dfa_msk, dfa_ofs, quad_ofs;\n-\txmm_t in, r, t;\n-\n-\tconst xmm_t range_base = _mm_set_epi32(0xffffff0c, 0xffffff08,\n-\t\t0xffffff04, 0xffffff00);\n-\n-\t/*\n-\t * Note that no transition is done for a match\n-\t * node and therefore a stream freezes when\n-\t * it reaches a match.\n-\t */\n-\n-\t/* Shuffle low 32 into temp and high 32 into indices2 */\n-\ttemp = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0x88);\n-\trange = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0xdd);\n-\n-\tt = MM_XOR(index_mask, index_mask);\n-\n-\t/* shuffle input byte to all 4 positions of 32 bit value */\n-\tin = MM_SHUFFLE8(next_input, shuffle_input);\n-\n-\t/* Calc node type and node addr */\n-\tnode_types = MM_ANDNOT(index_mask, temp);\n-\taddr = MM_AND(index_mask, temp);\n-\n-\t/*\n-\t * Calc addr for DFAs - addr = dfa_index + input_byte\n-\t */\n-\n-\t/* mask for DFA type (0) nodes */\n-\tdfa_msk = MM_CMPEQ32(node_types, t);\n-\n-\tr = _mm_srli_epi32(in, 30);\n-\tr = _mm_add_epi8(r, range_base);\n-\n-\tt = _mm_srli_epi32(in, 24);\n-\tr = _mm_shuffle_epi8(range, r);\n-\n-\tdfa_ofs = _mm_sub_epi32(t, r);\n-\n-\t/*\n-\t * Calculate number of range boundaries that are less than the\n-\t * input value. 
Range boundaries for each node are in signed 8 bit,\n-\t * ordered from -128 to 127 in the indices2 register.\n-\t * This is effectively a popcnt of bytes that are greater than the\n-\t * input byte.\n-\t */\n-\n-\t/* check ranges */\n-\ttemp = MM_CMPGT8(in, range);\n-\n-\t/* convert -1 to 1 (bytes greater than input byte */\n-\ttemp = MM_SIGN8(temp, temp);\n-\n-\t/* horizontal add pairs of bytes into words */\n-\ttemp = MM_MADD8(temp, temp);\n-\n-\t/* horizontal add pairs of words into dwords */\n-\tquad_ofs = MM_MADD16(temp, ones_16);\n-\n-\t/* mask to range type nodes */\n-\ttemp = _mm_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);\n-\n-\t/* add index into node position */\n-\treturn MM_ADD32(addr, temp);\n-}\n-\n-/*\n- * Process 4 transitions (in 2 SIMD registers) in parallel\n- */\n-static inline xmm_t\n-transition4(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,\n-\txmm_t ones_16, const uint64_t *trans,\n-\txmm_t *indices1, xmm_t *indices2)\n-{\n-\txmm_t addr;\n-\tuint64_t trans0, trans2;\n-\n-\t /* Calculate the address (array index) for all 4 transitions. */\n-\n-\taddr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,\n-\t\t*indices1, *indices2);\n-\n-\t /* Gather 64 bit transitions and pack back into 2 registers. */\n-\n-\ttrans0 = trans[MM_CVT32(addr)];\n-\n-\t/* get slot 2 */\n-\n-\t/* {x0, x1, x2, x3} -> {x2, x1, x2, x3} */\n-\taddr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT2);\n-\ttrans2 = trans[MM_CVT32(addr)];\n-\n-\t/* get slot 1 */\n-\n-\t/* {x2, x1, x2, x3} -> {x1, x1, x2, x3} */\n-\taddr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1);\n-\t*indices1 = MM_SET64(trans[MM_CVT32(addr)], trans0);\n-\n-\t/* get slot 3 */\n-\n-\t/* {x1, x1, x2, x3} -> {x3, x1, x2, x3} */\n-\taddr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT3);\n-\t*indices2 = MM_SET64(trans[MM_CVT32(addr)], trans2);\n-\n-\treturn MM_SRL32(next_input, 8);\n-}\n-\n-/*\n- * Execute trie traversal with 8 traversals in parallel\n- */\n-static inline int\n-search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,\n-\tuint32_t *results, uint32_t total_packets, uint32_t categories)\n-{\n-\tint n;\n-\tstruct acl_flow_data flows;\n-\tuint64_t index_array[MAX_SEARCHES_SSE8];\n-\tstruct completion cmplt[MAX_SEARCHES_SSE8];\n-\tstruct parms parms[MAX_SEARCHES_SSE8];\n-\txmm_t input0, input1;\n-\txmm_t indices1, indices2, indices3, indices4;\n-\n-\tacl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,\n-\t\ttotal_packets, categories, ctx->trans_table);\n-\n-\tfor (n = 0; n < MAX_SEARCHES_SSE8; n++) {\n-\t\tcmplt[n].count = 0;\n-\t\tindex_array[n] = acl_start_next_trie(&flows, parms, n, ctx);\n-\t}\n-\n-\t/*\n-\t * indices1 contains index_array[0,1]\n-\t * indices2 contains index_array[2,3]\n-\t * indices3 contains index_array[4,5]\n-\t * indices4 contains index_array[6,7]\n-\t */\n-\n-\tindices1 = MM_LOADU((xmm_t *) &index_array[0]);\n-\tindices2 = MM_LOADU((xmm_t *) &index_array[2]);\n-\n-\tindices3 = MM_LOADU((xmm_t *) &index_array[4]);\n-\tindices4 = MM_LOADU((xmm_t *) &index_array[6]);\n-\n-\t /* Check for any matches. */\n-\tacl_match_check_x4(0, ctx, parms, &flows,\n-\t\t&indices1, &indices2, mm_match_mask.x);\n-\tacl_match_check_x4(4, ctx, parms, &flows,\n-\t\t&indices3, &indices4, mm_match_mask.x);\n-\n-\twhile (flows.started > 0) {\n-\n-\t\t/* Gather 4 bytes of input data for each stream. 
*/\n-\t\tinput0 = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 0),\n-\t\t\t0);\n-\t\tinput1 = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 4),\n-\t\t\t0);\n-\n-\t\tinput0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 1), 1);\n-\t\tinput1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 5), 1);\n-\n-\t\tinput0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 2), 2);\n-\t\tinput1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 6), 2);\n-\n-\t\tinput0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 3), 3);\n-\t\tinput1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 7), 3);\n-\n-\t\t /* Process the 4 bytes of input on each stream. */\n-\n-\t\tinput0 = transition4(mm_index_mask.x, input0,\n-\t\t\tmm_shuffle_input.x, mm_ones_16.x,\n-\t\t\tflows.trans, &indices1, &indices2);\n-\n-\t\tinput1 = transition4(mm_index_mask.x, input1,\n-\t\t\tmm_shuffle_input.x, mm_ones_16.x,\n-\t\t\tflows.trans, &indices3, &indices4);\n-\n-\t\tinput0 = transition4(mm_index_mask.x, input0,\n-\t\t\tmm_shuffle_input.x, mm_ones_16.x,\n-\t\t\tflows.trans, &indices1, &indices2);\n-\n-\t\tinput1 = transition4(mm_index_mask.x, input1,\n-\t\t\tmm_shuffle_input.x, mm_ones_16.x,\n-\t\t\tflows.trans, &indices3, &indices4);\n-\n-\t\tinput0 = transition4(mm_index_mask.x, input0,\n-\t\t\tmm_shuffle_input.x, mm_ones_16.x,\n-\t\t\tflows.trans, &indices1, &indices2);\n-\n-\t\tinput1 = transition4(mm_index_mask.x, input1,\n-\t\t\tmm_shuffle_input.x, mm_ones_16.x,\n-\t\t\tflows.trans, &indices3, &indices4);\n-\n-\t\tinput0 = transition4(mm_index_mask.x, input0,\n-\t\t\tmm_shuffle_input.x, mm_ones_16.x,\n-\t\t\tflows.trans, &indices1, &indices2);\n-\n-\t\tinput1 = transition4(mm_index_mask.x, input1,\n-\t\t\tmm_shuffle_input.x, mm_ones_16.x,\n-\t\t\tflows.trans, &indices3, &indices4);\n-\n-\t\t /* Check for any matches. */\n-\t\tacl_match_check_x4(0, ctx, parms, &flows,\n-\t\t\t&indices1, &indices2, mm_match_mask.x);\n-\t\tacl_match_check_x4(4, ctx, parms, &flows,\n-\t\t\t&indices3, &indices4, mm_match_mask.x);\n-\t}\n-\n-\treturn 0;\n-}\n-\n-/*\n- * Execute trie traversal with 4 traversals in parallel\n- */\n-static inline int\n-search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,\n-\t uint32_t *results, int total_packets, uint32_t categories)\n-{\n-\tint n;\n-\tstruct acl_flow_data flows;\n-\tuint64_t index_array[MAX_SEARCHES_SSE4];\n-\tstruct completion cmplt[MAX_SEARCHES_SSE4];\n-\tstruct parms parms[MAX_SEARCHES_SSE4];\n-\txmm_t input, indices1, indices2;\n-\n-\tacl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,\n-\t\ttotal_packets, categories, ctx->trans_table);\n-\n-\tfor (n = 0; n < MAX_SEARCHES_SSE4; n++) {\n-\t\tcmplt[n].count = 0;\n-\t\tindex_array[n] = acl_start_next_trie(&flows, parms, n, ctx);\n-\t}\n-\n-\tindices1 = MM_LOADU((xmm_t *) &index_array[0]);\n-\tindices2 = MM_LOADU((xmm_t *) &index_array[2]);\n-\n-\t/* Check for any matches. */\n-\tacl_match_check_x4(0, ctx, parms, &flows,\n-\t\t&indices1, &indices2, mm_match_mask.x);\n-\n-\twhile (flows.started > 0) {\n-\n-\t\t/* Gather 4 bytes of input data for each stream. */\n-\t\tinput = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 0), 0);\n-\t\tinput = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);\n-\t\tinput = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 2), 2);\n-\t\tinput = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 3), 3);\n-\n-\t\t/* Process the 4 bytes of input on each stream. 
*/\n-\t\tinput = transition4(mm_index_mask.x, input,\n-\t\t\tmm_shuffle_input.x, mm_ones_16.x,\n-\t\t\tflows.trans, &indices1, &indices2);\n-\n-\t\t input = transition4(mm_index_mask.x, input,\n-\t\t\tmm_shuffle_input.x, mm_ones_16.x,\n-\t\t\tflows.trans, &indices1, &indices2);\n-\n-\t\t input = transition4(mm_index_mask.x, input,\n-\t\t\tmm_shuffle_input.x, mm_ones_16.x,\n-\t\t\tflows.trans, &indices1, &indices2);\n-\n-\t\t input = transition4(mm_index_mask.x, input,\n-\t\t\tmm_shuffle_input.x, mm_ones_16.x,\n-\t\t\tflows.trans, &indices1, &indices2);\n-\n-\t\t/* Check for any matches. */\n-\t\tacl_match_check_x4(0, ctx, parms, &flows,\n-\t\t\t&indices1, &indices2, mm_match_mask.x);\n-\t}\n-\n-\treturn 0;\n-}\n-\n-static inline xmm_t\n-transition2(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,\n-\txmm_t ones_16, const uint64_t *trans, xmm_t *indices1)\n-{\n-\tuint64_t t;\n-\txmm_t addr, indices2;\n-\n-\tindices2 = MM_XOR(ones_16, ones_16);\n-\n-\taddr = acl_calc_addr(index_mask, next_input, shuffle_input, ones_16,\n-\t\t*indices1, indices2);\n-\n-\t/* Gather 64 bit transitions and pack 2 per register. */\n-\n-\tt = trans[MM_CVT32(addr)];\n-\n-\t/* get slot 1 */\n-\taddr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1);\n-\t*indices1 = MM_SET64(trans[MM_CVT32(addr)], t);\n-\n-\treturn MM_SRL32(next_input, 8);\n-}\n-\n-/*\n- * Execute trie traversal with 2 traversals in parallel.\n- */\n-static inline int\n-search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data,\n-\tuint32_t *results, uint32_t total_packets, uint32_t categories)\n-{\n-\tint n;\n-\tstruct acl_flow_data flows;\n-\tuint64_t index_array[MAX_SEARCHES_SSE2];\n-\tstruct completion cmplt[MAX_SEARCHES_SSE2];\n-\tstruct parms parms[MAX_SEARCHES_SSE2];\n-\txmm_t input, indices;\n-\n-\tacl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,\n-\t\ttotal_packets, categories, ctx->trans_table);\n-\n-\tfor (n = 0; n < MAX_SEARCHES_SSE2; n++) {\n-\t\tcmplt[n].count = 0;\n-\t\tindex_array[n] = acl_start_next_trie(&flows, parms, n, ctx);\n-\t}\n-\n-\tindices = MM_LOADU((xmm_t *) &index_array[0]);\n-\n-\t/* Check for any matches. */\n-\tacl_match_check_x2(0, ctx, parms, &flows, &indices, mm_match_mask64.x);\n-\n-\twhile (flows.started > 0) {\n-\n-\t\t/* Gather 4 bytes of input data for each stream. */\n-\t\tinput = MM_INSERT32(mm_ones_16.x, GET_NEXT_4BYTES(parms, 0), 0);\n-\t\tinput = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);\n-\n-\t\t/* Process the 4 bytes of input on each stream. */\n-\n-\t\tinput = transition2(mm_index_mask64.x, input,\n-\t\t\tmm_shuffle_input64.x, mm_ones_16.x,\n-\t\t\tflows.trans, &indices);\n-\n-\t\tinput = transition2(mm_index_mask64.x, input,\n-\t\t\tmm_shuffle_input64.x, mm_ones_16.x,\n-\t\t\tflows.trans, &indices);\n-\n-\t\tinput = transition2(mm_index_mask64.x, input,\n-\t\t\tmm_shuffle_input64.x, mm_ones_16.x,\n-\t\t\tflows.trans, &indices);\n-\n-\t\tinput = transition2(mm_index_mask64.x, input,\n-\t\t\tmm_shuffle_input64.x, mm_ones_16.x,\n-\t\t\tflows.trans, &indices);\n-\n-\t\t/* Check for any matches. */\n-\t\tacl_match_check_x2(0, ctx, parms, &flows, &indices,\n-\t\t\tmm_match_mask64.x);\n-\t}\n-\n-\treturn 0;\n-}\n+#include \"acl_run_sse.h\"\n \n int\n rte_acl_classify_sse(const struct rte_acl_ctx *ctx, const uint8_t **data,\ndiff --git a/lib/librte_acl/acl_run_sse.h b/lib/librte_acl/acl_run_sse.h\nnew file mode 100644\nindex 0000000..e33e16b\n--- /dev/null\n+++ b/lib/librte_acl/acl_run_sse.h\n@@ -0,0 +1,533 @@\n+/*-\n+ *   BSD LICENSE\n+ *\n+ *   Copyright(c) 2010-2014 Intel Corporation. 
All rights reserved.\n+ *   All rights reserved.\n+ *\n+ *   Redistribution and use in source and binary forms, with or without\n+ *   modification, are permitted provided that the following conditions\n+ *   are met:\n+ *\n+ *     * Redistributions of source code must retain the above copyright\n+ *       notice, this list of conditions and the following disclaimer.\n+ *     * Redistributions in binary form must reproduce the above copyright\n+ *       notice, this list of conditions and the following disclaimer in\n+ *       the documentation and/or other materials provided with the\n+ *       distribution.\n+ *     * Neither the name of Intel Corporation nor the names of its\n+ *       contributors may be used to endorse or promote products derived\n+ *       from this software without specific prior written permission.\n+ *\n+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n+ *   \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\n+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n+ */\n+\n+#include \"acl_run.h\"\n+#include \"acl_vect.h\"\n+\n+enum {\n+\tSHUFFLE32_SLOT1 = 0xe5,\n+\tSHUFFLE32_SLOT2 = 0xe6,\n+\tSHUFFLE32_SLOT3 = 0xe7,\n+\tSHUFFLE32_SWAP64 = 0x4e,\n+};\n+\n+static const rte_xmm_t xmm_shuffle_input = {\n+\t.u32 = {0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c},\n+};\n+\n+static const rte_xmm_t xmm_shuffle_input64 = {\n+\t.u32 = {0x00000000, 0x04040404, 0x80808080, 0x80808080},\n+};\n+\n+static const rte_xmm_t xmm_ones_16 = {\n+\t.u16 = {1, 1, 1, 1, 1, 1, 1, 1},\n+};\n+\n+static const rte_xmm_t xmm_match_mask = {\n+\t.u32 = {\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t},\n+};\n+\n+static const rte_xmm_t xmm_match_mask64 = {\n+\t.u32 = {\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\t0,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\t0,\n+\t},\n+};\n+\n+static const rte_xmm_t xmm_index_mask = {\n+\t.u32 = {\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t},\n+};\n+\n+static const rte_xmm_t xmm_index_mask64 = {\n+\t.u32 = {\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\t0,\n+\t\t0,\n+\t},\n+};\n+\n+\n+/*\n+ * Resolve priority for multiple results (sse version).\n+ * This consists comparing the priority of the current traversal with the\n+ * running set of results for the packet.\n+ * For each result, keep a running array of the result (rule number) and\n+ * its priority for each category.\n+ */\n+static inline void\n+resolve_priority_sse(uint64_t transition, int n, const struct rte_acl_ctx *ctx,\n+\tstruct parms *parms, const struct rte_acl_match_results *p,\n+\tuint32_t categories)\n+{\n+\tuint32_t x;\n+\txmm_t results, priority, results1, priority1, selector;\n+\txmm_t *saved_results, *saved_priority;\n+\n+\tfor (x = 0; x < categories; x += RTE_ACL_RESULTS_MULTIPLIER) {\n+\n+\t\tsaved_results = (xmm_t 
*)(&parms[n].cmplt->results[x]);\n+\t\tsaved_priority =\n+\t\t\t(xmm_t *)(&parms[n].cmplt->priority[x]);\n+\n+\t\t/* get results and priorities for completed trie */\n+\t\tresults = MM_LOADU((const xmm_t *)&p[transition].results[x]);\n+\t\tpriority = MM_LOADU((const xmm_t *)&p[transition].priority[x]);\n+\n+\t\t/* if this is not the first completed trie */\n+\t\tif (parms[n].cmplt->count != ctx->num_tries) {\n+\n+\t\t\t/* get running best results and their priorities */\n+\t\t\tresults1 = MM_LOADU(saved_results);\n+\t\t\tpriority1 = MM_LOADU(saved_priority);\n+\n+\t\t\t/* select results that are highest priority */\n+\t\t\tselector = MM_CMPGT32(priority1, priority);\n+\t\t\tresults = MM_BLENDV8(results, results1, selector);\n+\t\t\tpriority = MM_BLENDV8(priority, priority1, selector);\n+\t\t}\n+\n+\t\t/* save running best results and their priorities */\n+\t\tMM_STOREU(saved_results, results);\n+\t\tMM_STOREU(saved_priority, priority);\n+\t}\n+}\n+\n+/*\n+ * Extract transitions from an XMM register and check for any matches\n+ */\n+static void\n+acl_process_matches(xmm_t *indices, int slot, const struct rte_acl_ctx *ctx,\n+\tstruct parms *parms, struct acl_flow_data *flows)\n+{\n+\tuint64_t transition1, transition2;\n+\n+\t/* extract transition from low 64 bits. */\n+\ttransition1 = MM_CVT64(*indices);\n+\n+\t/* extract transition from high 64 bits. */\n+\t*indices = MM_SHUFFLE32(*indices, SHUFFLE32_SWAP64);\n+\ttransition2 = MM_CVT64(*indices);\n+\n+\ttransition1 = acl_match_check(transition1, slot, ctx,\n+\t\tparms, flows, resolve_priority_sse);\n+\ttransition2 = acl_match_check(transition2, slot + 1, ctx,\n+\t\tparms, flows, resolve_priority_sse);\n+\n+\t/* update indices with new transitions. */\n+\t*indices = MM_SET64(transition2, transition1);\n+}\n+\n+/*\n+ * Check for a match in 2 transitions (contained in SSE register)\n+ */\n+static inline __attribute__((always_inline)) void\n+acl_match_check_x2(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,\n+\tstruct acl_flow_data *flows, xmm_t *indices, xmm_t match_mask)\n+{\n+\txmm_t temp;\n+\n+\ttemp = MM_AND(match_mask, *indices);\n+\twhile (!MM_TESTZ(temp, temp)) {\n+\t\tacl_process_matches(indices, slot, ctx, parms, flows);\n+\t\ttemp = MM_AND(match_mask, *indices);\n+\t}\n+}\n+\n+/*\n+ * Check for any match in 4 transitions (contained in 2 SSE registers)\n+ */\n+static inline __attribute__((always_inline)) void\n+acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,\n+\tstruct acl_flow_data *flows, xmm_t *indices1, xmm_t *indices2,\n+\txmm_t match_mask)\n+{\n+\txmm_t temp;\n+\n+\t/* put low 32 bits of each transition into one register */\n+\ttemp = (xmm_t)MM_SHUFFLEPS((__m128)*indices1, (__m128)*indices2,\n+\t\t0x88);\n+\t/* test for match node */\n+\ttemp = MM_AND(match_mask, temp);\n+\n+\twhile (!MM_TESTZ(temp, temp)) {\n+\t\tacl_process_matches(indices1, slot, ctx, parms, flows);\n+\t\tacl_process_matches(indices2, slot + 2, ctx, parms, flows);\n+\n+\t\ttemp = (xmm_t)MM_SHUFFLEPS((__m128)*indices1,\n+\t\t\t\t\t(__m128)*indices2,\n+\t\t\t\t\t0x88);\n+\t\ttemp = MM_AND(match_mask, temp);\n+\t}\n+}\n+\n+/*\n+ * Calculate the address of the next transition for\n+ * all types of nodes. Note that only DFA nodes and range\n+ * nodes actually transition to another node. 
Match\n+ * nodes don't move.\n+ */\n+static inline __attribute__((always_inline)) xmm_t\n+calc_addr_sse(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,\n+\txmm_t ones_16, xmm_t indices1, xmm_t indices2)\n+{\n+\txmm_t addr, node_types, range, temp;\n+\txmm_t dfa_msk, dfa_ofs, quad_ofs;\n+\txmm_t in, r, t;\n+\n+\tconst xmm_t range_base = _mm_set_epi32(0xffffff0c, 0xffffff08,\n+\t\t0xffffff04, 0xffffff00);\n+\n+\t/*\n+\t * Note that no transition is done for a match\n+\t * node and therefore a stream freezes when\n+\t * it reaches a match.\n+\t */\n+\n+\t/* Shuffle low 32 into temp and high 32 into indices2 */\n+\ttemp = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0x88);\n+\trange = (xmm_t)MM_SHUFFLEPS((__m128)indices1, (__m128)indices2, 0xdd);\n+\n+\tt = MM_XOR(index_mask, index_mask);\n+\n+\t/* shuffle input byte to all 4 positions of 32 bit value */\n+\tin = MM_SHUFFLE8(next_input, shuffle_input);\n+\n+\t/* Calc node type and node addr */\n+\tnode_types = MM_ANDNOT(index_mask, temp);\n+\taddr = MM_AND(index_mask, temp);\n+\n+\t/*\n+\t * Calc addr for DFAs - addr = dfa_index + input_byte\n+\t */\n+\n+\t/* mask for DFA type (0) nodes */\n+\tdfa_msk = MM_CMPEQ32(node_types, t);\n+\n+\tr = _mm_srli_epi32(in, 30);\n+\tr = _mm_add_epi8(r, range_base);\n+\n+\tt = _mm_srli_epi32(in, 24);\n+\tr = _mm_shuffle_epi8(range, r);\n+\n+\tdfa_ofs = _mm_sub_epi32(t, r);\n+\n+\t/*\n+\t * Calculate number of range boundaries that are less than the\n+\t * input value. Range boundaries for each node are in signed 8 bit,\n+\t * ordered from -128 to 127 in the indices2 register.\n+\t * This is effectively a popcnt of bytes that are greater than the\n+\t * input byte.\n+\t */\n+\n+\t/* check ranges */\n+\ttemp = MM_CMPGT8(in, range);\n+\n+\t/* convert -1 to 1 (bytes greater than input byte */\n+\ttemp = MM_SIGN8(temp, temp);\n+\n+\t/* horizontal add pairs of bytes into words */\n+\ttemp = MM_MADD8(temp, temp);\n+\n+\t/* horizontal add pairs of words into dwords */\n+\tquad_ofs = MM_MADD16(temp, ones_16);\n+\n+\t/* mask to range type nodes */\n+\ttemp = _mm_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);\n+\n+\t/* add index into node position */\n+\treturn MM_ADD32(addr, temp);\n+}\n+\n+/*\n+ * Process 4 transitions (in 2 SIMD registers) in parallel\n+ */\n+static inline __attribute__((always_inline)) xmm_t\n+transition4(xmm_t next_input, const uint64_t *trans,\n+\txmm_t *indices1, xmm_t *indices2)\n+{\n+\txmm_t addr;\n+\tuint64_t trans0, trans2;\n+\n+\t /* Calculate the address (array index) for all 4 transitions. */\n+\n+\taddr = calc_addr_sse(xmm_index_mask.x, next_input, xmm_shuffle_input.x,\n+\t\txmm_ones_16.x, *indices1, *indices2);\n+\n+\t /* Gather 64 bit transitions and pack back into 2 registers. 
*/\n+\n+\ttrans0 = trans[MM_CVT32(addr)];\n+\n+\t/* get slot 2 */\n+\n+\t/* {x0, x1, x2, x3} -> {x2, x1, x2, x3} */\n+\taddr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT2);\n+\ttrans2 = trans[MM_CVT32(addr)];\n+\n+\t/* get slot 1 */\n+\n+\t/* {x2, x1, x2, x3} -> {x1, x1, x2, x3} */\n+\taddr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1);\n+\t*indices1 = MM_SET64(trans[MM_CVT32(addr)], trans0);\n+\n+\t/* get slot 3 */\n+\n+\t/* {x1, x1, x2, x3} -> {x3, x1, x2, x3} */\n+\taddr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT3);\n+\t*indices2 = MM_SET64(trans[MM_CVT32(addr)], trans2);\n+\n+\treturn MM_SRL32(next_input, CHAR_BIT);\n+}\n+\n+/*\n+ * Execute trie traversal with 8 traversals in parallel\n+ */\n+static inline int\n+search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,\n+\tuint32_t *results, uint32_t total_packets, uint32_t categories)\n+{\n+\tint n;\n+\tstruct acl_flow_data flows;\n+\tuint64_t index_array[MAX_SEARCHES_SSE8];\n+\tstruct completion cmplt[MAX_SEARCHES_SSE8];\n+\tstruct parms parms[MAX_SEARCHES_SSE8];\n+\txmm_t input0, input1;\n+\txmm_t indices1, indices2, indices3, indices4;\n+\n+\tacl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,\n+\t\ttotal_packets, categories, ctx->trans_table);\n+\n+\tfor (n = 0; n < MAX_SEARCHES_SSE8; n++) {\n+\t\tcmplt[n].count = 0;\n+\t\tindex_array[n] = acl_start_next_trie(&flows, parms, n, ctx);\n+\t}\n+\n+\t/*\n+\t * indices1 contains index_array[0,1]\n+\t * indices2 contains index_array[2,3]\n+\t * indices3 contains index_array[4,5]\n+\t * indices4 contains index_array[6,7]\n+\t */\n+\n+\tindices1 = MM_LOADU((xmm_t *) &index_array[0]);\n+\tindices2 = MM_LOADU((xmm_t *) &index_array[2]);\n+\n+\tindices3 = MM_LOADU((xmm_t *) &index_array[4]);\n+\tindices4 = MM_LOADU((xmm_t *) &index_array[6]);\n+\n+\t /* Check for any matches. */\n+\tacl_match_check_x4(0, ctx, parms, &flows,\n+\t\t&indices1, &indices2, xmm_match_mask.x);\n+\tacl_match_check_x4(4, ctx, parms, &flows,\n+\t\t&indices3, &indices4, xmm_match_mask.x);\n+\n+\twhile (flows.started > 0) {\n+\n+\t\t/* Gather 4 bytes of input data for each stream. */\n+\t\tinput0 = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0));\n+\t\tinput1 = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 4));\n+\n+\t\tinput0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 1), 1);\n+\t\tinput1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 5), 1);\n+\n+\t\tinput0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 2), 2);\n+\t\tinput1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 6), 2);\n+\n+\t\tinput0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 3), 3);\n+\t\tinput1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 7), 3);\n+\n+\t\t /* Process the 4 bytes of input on each stream. */\n+\n+\t\tinput0 = transition4(input0, flows.trans,\n+\t\t\t&indices1, &indices2);\n+\t\tinput1 = transition4(input1, flows.trans,\n+\t\t\t&indices3, &indices4);\n+\n+\t\tinput0 = transition4(input0, flows.trans,\n+\t\t\t&indices1, &indices2);\n+\t\tinput1 = transition4(input1, flows.trans,\n+\t\t\t&indices3, &indices4);\n+\n+\t\tinput0 = transition4(input0, flows.trans,\n+\t\t\t&indices1, &indices2);\n+\t\tinput1 = transition4(input1, flows.trans,\n+\t\t\t&indices3, &indices4);\n+\n+\t\tinput0 = transition4(input0, flows.trans,\n+\t\t\t&indices1, &indices2);\n+\t\tinput1 = transition4(input1, flows.trans,\n+\t\t\t&indices3, &indices4);\n+\n+\t\t /* Check for any matches. 
*/\n+\t\tacl_match_check_x4(0, ctx, parms, &flows,\n+\t\t\t&indices1, &indices2, xmm_match_mask.x);\n+\t\tacl_match_check_x4(4, ctx, parms, &flows,\n+\t\t\t&indices3, &indices4, xmm_match_mask.x);\n+\t}\n+\n+\treturn 0;\n+}\n+\n+/*\n+ * Execute trie traversal with 4 traversals in parallel\n+ */\n+static inline int\n+search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,\n+\t uint32_t *results, int total_packets, uint32_t categories)\n+{\n+\tint n;\n+\tstruct acl_flow_data flows;\n+\tuint64_t index_array[MAX_SEARCHES_SSE4];\n+\tstruct completion cmplt[MAX_SEARCHES_SSE4];\n+\tstruct parms parms[MAX_SEARCHES_SSE4];\n+\txmm_t input, indices1, indices2;\n+\n+\tacl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,\n+\t\ttotal_packets, categories, ctx->trans_table);\n+\n+\tfor (n = 0; n < MAX_SEARCHES_SSE4; n++) {\n+\t\tcmplt[n].count = 0;\n+\t\tindex_array[n] = acl_start_next_trie(&flows, parms, n, ctx);\n+\t}\n+\n+\tindices1 = MM_LOADU((xmm_t *) &index_array[0]);\n+\tindices2 = MM_LOADU((xmm_t *) &index_array[2]);\n+\n+\t/* Check for any matches. */\n+\tacl_match_check_x4(0, ctx, parms, &flows,\n+\t\t&indices1, &indices2, xmm_match_mask.x);\n+\n+\twhile (flows.started > 0) {\n+\n+\t\t/* Gather 4 bytes of input data for each stream. */\n+\t\tinput = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0));\n+\t\tinput = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);\n+\t\tinput = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 2), 2);\n+\t\tinput = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 3), 3);\n+\n+\t\t/* Process the 4 bytes of input on each stream. */\n+\t\tinput = transition4(input, flows.trans, &indices1, &indices2);\n+\t\tinput = transition4(input, flows.trans, &indices1, &indices2);\n+\t\tinput = transition4(input, flows.trans, &indices1, &indices2);\n+\t\tinput = transition4(input, flows.trans, &indices1, &indices2);\n+\n+\t\t/* Check for any matches. */\n+\t\tacl_match_check_x4(0, ctx, parms, &flows,\n+\t\t\t&indices1, &indices2, xmm_match_mask.x);\n+\t}\n+\n+\treturn 0;\n+}\n+\n+static inline __attribute__((always_inline)) xmm_t\n+transition2(xmm_t next_input, const uint64_t *trans, xmm_t *indices1)\n+{\n+\tuint64_t t;\n+\txmm_t addr, indices2;\n+\n+\tindices2 = _mm_setzero_si128();\n+\n+\taddr = calc_addr_sse(xmm_index_mask.x, next_input, xmm_shuffle_input.x,\n+\t\txmm_ones_16.x, *indices1, indices2);\n+\n+\t/* Gather 64 bit transitions and pack 2 per register. */\n+\n+\tt = trans[MM_CVT32(addr)];\n+\n+\t/* get slot 1 */\n+\taddr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1);\n+\t*indices1 = MM_SET64(trans[MM_CVT32(addr)], t);\n+\n+\treturn MM_SRL32(next_input, CHAR_BIT);\n+}\n+\n+/*\n+ * Execute trie traversal with 2 traversals in parallel.\n+ */\n+static inline int\n+search_sse_2(const struct rte_acl_ctx *ctx, const uint8_t **data,\n+\tuint32_t *results, uint32_t total_packets, uint32_t categories)\n+{\n+\tint n;\n+\tstruct acl_flow_data flows;\n+\tuint64_t index_array[MAX_SEARCHES_SSE2];\n+\tstruct completion cmplt[MAX_SEARCHES_SSE2];\n+\tstruct parms parms[MAX_SEARCHES_SSE2];\n+\txmm_t input, indices;\n+\n+\tacl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,\n+\t\ttotal_packets, categories, ctx->trans_table);\n+\n+\tfor (n = 0; n < MAX_SEARCHES_SSE2; n++) {\n+\t\tcmplt[n].count = 0;\n+\t\tindex_array[n] = acl_start_next_trie(&flows, parms, n, ctx);\n+\t}\n+\n+\tindices = MM_LOADU((xmm_t *) &index_array[0]);\n+\n+\t/* Check for any matches. 
*/\n+\tacl_match_check_x2(0, ctx, parms, &flows, &indices,\n+\t\txmm_match_mask64.x);\n+\n+\twhile (flows.started > 0) {\n+\n+\t\t/* Gather 4 bytes of input data for each stream. */\n+\t\tinput = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0));\n+\t\tinput = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);\n+\n+\t\t/* Process the 4 bytes of input on each stream. */\n+\n+\t\tinput = transition2(input, flows.trans, &indices);\n+\t\tinput = transition2(input, flows.trans, &indices);\n+\t\tinput = transition2(input, flows.trans, &indices);\n+\t\tinput = transition2(input, flows.trans, &indices);\n+\n+\t\t/* Check for any matches. */\n+\t\tacl_match_check_x2(0, ctx, parms, &flows, &indices,\n+\t\t\txmm_match_mask64.x);\n+\t}\n+\n+\treturn 0;\n+}\ndiff --git a/lib/librte_acl/rte_acl.c b/lib/librte_acl/rte_acl.c\nindex a16c4a4..a9cd349 100644\n--- a/lib/librte_acl/rte_acl.c\n+++ b/lib/librte_acl/rte_acl.c\n@@ -38,10 +38,25 @@\n \n TAILQ_HEAD(rte_acl_list, rte_tailq_entry);\n \n+/*\n+ * If the compiler doesn't support AVX2 instructions,\n+ * then the dummy one would be used instead for AVX2 classify method.\n+ */\n+int __attribute__ ((weak))\n+rte_acl_classify_avx2(__rte_unused const struct rte_acl_ctx *ctx,\n+\t__rte_unused const uint8_t **data,\n+\t__rte_unused uint32_t *results,\n+\t__rte_unused uint32_t num,\n+\t__rte_unused uint32_t categories)\n+{\n+\treturn -ENOTSUP;\n+}\n+\n static const rte_acl_classify_t classify_fns[] = {\n \t[RTE_ACL_CLASSIFY_DEFAULT] = rte_acl_classify_scalar,\n \t[RTE_ACL_CLASSIFY_SCALAR] = rte_acl_classify_scalar,\n \t[RTE_ACL_CLASSIFY_SSE] = rte_acl_classify_sse,\n+\t[RTE_ACL_CLASSIFY_AVX2] = rte_acl_classify_avx2,\n };\n \n /* by default, use always available scalar code path. */\n@@ -64,12 +79,24 @@ rte_acl_set_ctx_classify(struct rte_acl_ctx *ctx, enum rte_acl_classify_alg alg)\n \treturn 0;\n }\n \n+/*\n+ * Select highest available classify method as default one.\n+ * Note that CLASSIFY_AVX2 should be set as a default only\n+ * if both conditions are met:\n+ * at build time compiler supports AVX2 and target cpu supports AVX2.\n+ */\n static void __attribute__((constructor))\n rte_acl_init(void)\n {\n \tenum rte_acl_classify_alg alg = RTE_ACL_CLASSIFY_DEFAULT;\n \n+#ifdef CC_AVX2_SUPPORT\n+\tif (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2))\n+\t\talg = RTE_ACL_CLASSIFY_AVX2;\n+\telse if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_1))\n+#else\n \tif (rte_cpu_get_flag_enabled(RTE_CPUFLAG_SSE4_1))\n+#endif\n \t\talg = RTE_ACL_CLASSIFY_SSE;\n \n \trte_acl_set_default_classify(alg);\ndiff --git a/lib/librte_acl/rte_acl.h b/lib/librte_acl/rte_acl.h\nindex 0d913ee..652a234 100644\n--- a/lib/librte_acl/rte_acl.h\n+++ b/lib/librte_acl/rte_acl.h\n@@ -265,6 +265,8 @@ enum rte_acl_classify_alg {\n \tRTE_ACL_CLASSIFY_DEFAULT = 0,\n \tRTE_ACL_CLASSIFY_SCALAR = 1,  /**< generic implementation. */\n \tRTE_ACL_CLASSIFY_SSE = 2,     /**< requires SSE4.1 support. */\n+\tRTE_ACL_CLASSIFY_AVX2 = 3,    /**< requires AVX2 support. */\n+\tRTE_ACL_CLASSIFY_NUM          /* should always be the last one. */\n };\n \n /**\n",
    "prefixes": [
        "dpdk-dev",
        "v3",
        "11/18"
    ]
}
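
The "mbox" field above points at a ready-to-apply mbox rendering of the patch. As a minimal sketch (again assuming the Python requests library), it can be fetched and saved for local use with git am:

    import requests

    API = "https://patches.dpdk.org/api/patches/2408/"

    # Look up the patch, then download its mbox rendering.
    patch = requests.get(API).json()
    mbox = requests.get(patch["mbox"])
    mbox.raise_for_status()

    with open("2408.mbox", "wb") as f:
        f.write(mbox.content)  # apply with: git am 2408.mbox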