get:
Show a patch.

patch:
Partially update a patch.

put:
Update a patch.

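For reference, a minimal Python sketch of the same request (assuming the third-party requests package is available; the URL and the field names accessed below are taken from the response shown underneath):

    import requests

    # Fetch a single patch from the Patchwork REST API.
    # Read-only access needs no authentication.
    url = "http://patches.dpdk.org/api/patches/79711/"
    resp = requests.get(url, headers={"Accept": "application/json"}, timeout=30)
    resp.raise_for_status()

    patch = resp.json()
    print(patch["name"])                # patch subject, "[v3,08/14] acl: ..." here
    print(patch["state"])               # lifecycle state, "superseded" here
    print(patch["submitter"]["email"])  # konstantin.ananyev@intel.com
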
GET /api/patches/79711/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 79711,
    "url": "http://patches.dpdk.org/api/patches/79711/?format=api",
    "web_url": "http://patches.dpdk.org/project/dpdk/patch/20201005184526.7465-9-konstantin.ananyev@intel.com/",
    "project": {
        "id": 1,
        "url": "http://patches.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<20201005184526.7465-9-konstantin.ananyev@intel.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/20201005184526.7465-9-konstantin.ananyev@intel.com",
    "date": "2020-10-05T18:45:20",
    "name": "[v3,08/14] acl: introduce 256-bit width AVX512 classify implementation",
    "commit_ref": null,
    "pull_url": null,
    "state": "superseded",
    "archived": true,
    "hash": "5b2dadf830b5a2e520fd53169dbdad196440c191",
    "submitter": {
        "id": 33,
        "url": "http://patches.dpdk.org/api/people/33/?format=api",
        "name": "Ananyev, Konstantin",
        "email": "konstantin.ananyev@intel.com"
    },
    "delegate": {
        "id": 24651,
        "url": "http://patches.dpdk.org/api/users/24651/?format=api",
        "username": "dmarchand",
        "first_name": "David",
        "last_name": "Marchand",
        "email": "david.marchand@redhat.com"
    },
    "mbox": "http://patches.dpdk.org/project/dpdk/patch/20201005184526.7465-9-konstantin.ananyev@intel.com/mbox/",
    "series": [
        {
            "id": 12702,
            "url": "http://patches.dpdk.org/api/series/12702/?format=api",
            "web_url": "http://patches.dpdk.org/project/dpdk/list/?series=12702",
            "date": "2020-10-05T18:45:21",
            "name": "acl: introduce AVX512 classify methods",
            "version": 3,
            "mbox": "http://patches.dpdk.org/series/12702/mbox/"
        }
    ],
    "comments": "http://patches.dpdk.org/api/patches/79711/comments/",
    "check": "success",
    "checks": "http://patches.dpdk.org/api/patches/79711/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@inbox.dpdk.org",
        "Delivered-To": "patchwork@inbox.dpdk.org",
        "Received": [
            "from dpdk.org (dpdk.org [92.243.14.124])\n\tby inbox.dpdk.org (Postfix) with ESMTP id 54899A04B1;\n\tMon,  5 Oct 2020 21:46:17 +0200 (CEST)",
            "from [92.243.14.124] (localhost [127.0.0.1])\n\tby dpdk.org (Postfix) with ESMTP id 807161BB6F;\n\tMon,  5 Oct 2020 21:43:27 +0200 (CEST)",
            "from mga07.intel.com (mga07.intel.com [134.134.136.100])\n by dpdk.org (Postfix) with ESMTP id 634E61BACD\n for <dev@dpdk.org>; Mon,  5 Oct 2020 21:43:23 +0200 (CEST)",
            "from orsmga005.jf.intel.com ([10.7.209.41])\n by orsmga105.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;\n 05 Oct 2020 12:27:59 -0700",
            "from sivswdev08.ir.intel.com ([10.237.217.47])\n by orsmga005.jf.intel.com with ESMTP; 05 Oct 2020 11:46:21 -0700"
        ],
        "IronPort-SDR": [
            "\n FL6h+hBnUmtT9YOqozGXW+J3S5il53AG4vYPIhrFN47QSBKTm0gQakPjGr5P7oIt6Zpr4oxc07\n Gp4sHvI/nu8g==",
            "\n 79bzgUVx8eQjifAoD5SDrhlQSFwxXWebzcQQpWHEV8UNqCOTstOE2OIUiVmqCmhw0ShhhShlvW\n ioQhs20tc2WQ=="
        ],
        "X-IronPort-AV": [
            "E=McAfee;i=\"6000,8403,9765\"; a=\"228046127\"",
            "E=Sophos;i=\"5.77,340,1596524400\"; d=\"scan'208\";a=\"228046127\"",
            "E=Sophos;i=\"5.77,340,1596524400\"; d=\"scan'208\";a=\"526625112\""
        ],
        "X-Amp-Result": "SKIPPED(no attachment in message)",
        "X-Amp-File-Uploaded": "False",
        "X-ExtLoop1": "1",
        "From": "Konstantin Ananyev <konstantin.ananyev@intel.com>",
        "To": "dev@dpdk.org",
        "Cc": "jerinj@marvell.com, ruifeng.wang@arm.com, vladimir.medvedkin@intel.com,\n Konstantin Ananyev <konstantin.ananyev@intel.com>",
        "Date": "Mon,  5 Oct 2020 19:45:20 +0100",
        "Message-Id": "<20201005184526.7465-9-konstantin.ananyev@intel.com>",
        "X-Mailer": "git-send-email 2.18.0",
        "In-Reply-To": "<20201005184526.7465-1-konstantin.ananyev@intel.com>",
        "References": "<20200915165025.543-1-konstantin.ananyev@intel.com>\n <20201005184526.7465-1-konstantin.ananyev@intel.com>",
        "Subject": "[dpdk-dev] [PATCH v3 08/14] acl: introduce 256-bit width AVX512\n\tclassify implementation",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.15",
        "Precedence": "list",
        "List-Id": "DPDK patches and discussions <dev.dpdk.org>",
        "List-Unsubscribe": "<https://mails.dpdk.org/options/dev>,\n <mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://mails.dpdk.org/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<https://mails.dpdk.org/listinfo/dev>,\n <mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org",
        "Sender": "\"dev\" <dev-bounces@dpdk.org>"
    },
    "content": "Introduce classify implementation that uses AVX512 specific ISA.\nrte_acl_classify_avx512x16() is able to process up to 16 flows in parallel.\nIt uses 256-bit width registers/instructions only\n(to avoid frequency level change).\nNote that for now only 64-bit version is supported.\n\nSigned-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>\n---\n .../prog_guide/packet_classif_access_ctrl.rst |   4 +\n doc/guides/rel_notes/release_20_11.rst        |   5 +\n lib/librte_acl/acl.h                          |   7 +\n lib/librte_acl/acl_gen.c                      |   2 +-\n lib/librte_acl/acl_run_avx512.c               | 129 ++++\n lib/librte_acl/acl_run_avx512x8.h             | 642 ++++++++++++++++++\n 6 files changed, 788 insertions(+), 1 deletion(-)\n create mode 100644 lib/librte_acl/acl_run_avx512x8.h",
    "diff": "diff --git a/doc/guides/prog_guide/packet_classif_access_ctrl.rst b/doc/guides/prog_guide/packet_classif_access_ctrl.rst\nindex daf03e6d7a..11f4bc841b 100644\n--- a/doc/guides/prog_guide/packet_classif_access_ctrl.rst\n+++ b/doc/guides/prog_guide/packet_classif_access_ctrl.rst\n@@ -379,6 +379,10 @@ There are several implementations of classify algorithm:\n *   **RTE_ACL_CLASSIFY_ALTIVEC**: vector implementation, can process up to 8\n     flows in parallel. Requires ALTIVEC support.\n \n+*   **RTE_ACL_CLASSIFY_AVX512X16**: vector implementation, can process up to 16\n+    flows in parallel. Uses 256-bit width SIMD registers.\n+    Requires AVX512 support.\n+\n It is purely a runtime decision which method to choose, there is no build-time difference.\n All implementations operates over the same internal RT structures and use similar principles. The main difference is that vector implementations can manually exploit IA SIMD instructions and process several input data flows in parallel.\n At startup ACL library determines the highest available classify method for the given platform and sets it as default one. Though the user has an ability to override the default classifier function for a given ACL context or perform particular search using non-default classify method. In that case it is user responsibility to make sure that given platform supports selected classify implementation.\ndiff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst\nindex e0de60c0c2..95d7bfd777 100644\n--- a/doc/guides/rel_notes/release_20_11.rst\n+++ b/doc/guides/rel_notes/release_20_11.rst\n@@ -107,6 +107,11 @@ New Features\n   * Extern objects and functions can be plugged into the pipeline.\n   * Transaction-oriented table updates.\n \n+* **Add new AVX512 specific classify algorithms for ACL library.**\n+\n+  * Added new ``RTE_ACL_CLASSIFY_AVX512X16`` vector implementation,\n+    which can process up to 16 flows in parallel. 
Requires AVX512 support.\n+\n \n Removed Items\n -------------\ndiff --git a/lib/librte_acl/acl.h b/lib/librte_acl/acl.h\nindex 543ce55659..7ac0d12f08 100644\n--- a/lib/librte_acl/acl.h\n+++ b/lib/librte_acl/acl.h\n@@ -76,6 +76,13 @@ struct rte_acl_bitset {\n  * input_byte - ((uint8_t *)&transition)[4 + input_byte / 64].\n  */\n \n+/*\n+ * Each ACL RT contains an idle nomatch node:\n+ * a SINGLE node at predefined position (RTE_ACL_DFA_SIZE)\n+ * that points to itself.\n+ */\n+#define RTE_ACL_IDLE_NODE\t(RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE)\n+\n /*\n  * Structure of a node is a set of ptrs and each ptr has a bit map\n  * of values associated with this transition.\ndiff --git a/lib/librte_acl/acl_gen.c b/lib/librte_acl/acl_gen.c\nindex f1b9d12f1e..e759a2ca15 100644\n--- a/lib/librte_acl/acl_gen.c\n+++ b/lib/librte_acl/acl_gen.c\n@@ -496,7 +496,7 @@ rte_acl_gen(struct rte_acl_ctx *ctx, struct rte_acl_trie *trie,\n \t * highest index, that points to itself)\n \t */\n \n-\tnode_array[RTE_ACL_DFA_SIZE] = RTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE;\n+\tnode_array[RTE_ACL_DFA_SIZE] = RTE_ACL_IDLE_NODE;\n \n \tfor (n = 0; n < RTE_ACL_DFA_SIZE; n++)\n \t\tnode_array[n] = no_match;\ndiff --git a/lib/librte_acl/acl_run_avx512.c b/lib/librte_acl/acl_run_avx512.c\nindex 1817f88b29..f5bc628b7c 100644\n--- a/lib/librte_acl/acl_run_avx512.c\n+++ b/lib/librte_acl/acl_run_avx512.c\n@@ -4,10 +4,126 @@\n \n #include \"acl_run_sse.h\"\n \n+/*sizeof(uint32_t) << match_log == sizeof(struct rte_acl_match_results)*/\n+static const uint32_t match_log = 5;\n+\n+struct acl_flow_avx512 {\n+\tuint32_t num_packets;       /* number of packets processed */\n+\tuint32_t total_packets;     /* max number of packets to process */\n+\tuint32_t root_index;        /* current root index */\n+\tconst uint64_t *trans;      /* transition table */\n+\tconst uint32_t *data_index; /* input data indexes */\n+\tconst uint8_t **idata;      /* input data */\n+\tuint32_t *matches;          /* match indexes */\n+};\n+\n+static inline void\n+acl_set_flow_avx512(struct acl_flow_avx512 *flow, const struct rte_acl_ctx *ctx,\n+\tuint32_t trie, const uint8_t *data[], uint32_t *matches,\n+\tuint32_t total_packets)\n+{\n+\tflow->num_packets = 0;\n+\tflow->total_packets = total_packets;\n+\tflow->root_index = ctx->trie[trie].root_index;\n+\tflow->trans = ctx->trans_table;\n+\tflow->data_index = ctx->trie[trie].data_index;\n+\tflow->idata = data;\n+\tflow->matches = matches;\n+}\n+\n+/*\n+ * Update flow and result masks based on the number of unprocessed flows.\n+ */\n+static inline uint32_t\n+update_flow_mask(const struct acl_flow_avx512 *flow, uint32_t *fmsk,\n+\tuint32_t *rmsk)\n+{\n+\tuint32_t i, j, k, m, n;\n+\n+\tfmsk[0] ^= rmsk[0];\n+\tm = rmsk[0];\n+\n+\tk = __builtin_popcount(m);\n+\tn = flow->total_packets - flow->num_packets;\n+\n+\tif (n < k) {\n+\t\t/* reduce mask */\n+\t\tfor (i = k - n; i != 0; i--) {\n+\t\t\tj = sizeof(m) * CHAR_BIT - 1 - __builtin_clz(m);\n+\t\t\tm ^= 1 << j;\n+\t\t}\n+\t} else\n+\t\tn = k;\n+\n+\trmsk[0] = m;\n+\tfmsk[0] |= rmsk[0];\n+\n+\treturn n;\n+}\n+\n+/*\n+ * Resolve matches for multiple categories (LE 8, use 128b instuctions/regs)\n+ */\n+static inline void\n+resolve_mcle8_avx512x1(uint32_t result[],\n+\tconst struct rte_acl_match_results pr[], const uint32_t match[],\n+\tuint32_t nb_pkt, uint32_t nb_cat, uint32_t nb_trie)\n+{\n+\tconst int32_t *pri;\n+\tconst uint32_t *pm, *res;\n+\tuint32_t i, j, k, mi, mn;\n+\t__mmask8 msk;\n+\txmm_t cp, cr, np, nr;\n+\n+\tres = pr->results;\n+\tpri = 
pr->priority;\n+\n+\tfor (k = 0; k != nb_pkt; k++, result += nb_cat) {\n+\n+\t\tmi = match[k] << match_log;\n+\n+\t\tfor (j = 0; j != nb_cat; j += RTE_ACL_RESULTS_MULTIPLIER) {\n+\n+\t\t\tcr = _mm_loadu_si128((const xmm_t *)(res + mi + j));\n+\t\t\tcp = _mm_loadu_si128((const xmm_t *)(pri + mi + j));\n+\n+\t\t\tfor (i = 1, pm = match + nb_pkt; i != nb_trie;\n+\t\t\t\ti++, pm += nb_pkt) {\n+\n+\t\t\t\tmn = j + (pm[k] << match_log);\n+\n+\t\t\t\tnr = _mm_loadu_si128((const xmm_t *)(res + mn));\n+\t\t\t\tnp = _mm_loadu_si128((const xmm_t *)(pri + mn));\n+\n+\t\t\t\tmsk = _mm_cmpgt_epi32_mask(cp, np);\n+\t\t\t\tcr = _mm_mask_mov_epi32(nr, msk, cr);\n+\t\t\t\tcp = _mm_mask_mov_epi32(np, msk, cp);\n+\t\t\t}\n+\n+\t\t\t_mm_storeu_si128((xmm_t *)(result + j), cr);\n+\t\t}\n+\t}\n+}\n+\n+#include \"acl_run_avx512x8.h\"\n+\n int\n rte_acl_classify_avx512x16(const struct rte_acl_ctx *ctx, const uint8_t **data,\n \tuint32_t *results, uint32_t num, uint32_t categories)\n {\n+\tconst uint32_t max_iter = MAX_SEARCHES_AVX16 * MAX_SEARCHES_AVX16;\n+\n+\t/* split huge lookup (gt 256) into series of fixed size ones */\n+\twhile (num > max_iter) {\n+\t\tsearch_avx512x8x2(ctx, data, results, max_iter, categories);\n+\t\tdata += max_iter;\n+\t\tresults += max_iter * categories;\n+\t\tnum -= max_iter;\n+\t}\n+\n+\t/* select classify method based on number of remaining requests */\n+\tif (num >= MAX_SEARCHES_AVX16)\n+\t\treturn search_avx512x8x2(ctx, data, results, num, categories);\n \tif (num >= MAX_SEARCHES_SSE8)\n \t\treturn search_sse_8(ctx, data, results, num, categories);\n \tif (num >= MAX_SEARCHES_SSE4)\n@@ -20,6 +136,19 @@ int\n rte_acl_classify_avx512x32(const struct rte_acl_ctx *ctx, const uint8_t **data,\n \tuint32_t *results, uint32_t num, uint32_t categories)\n {\n+\tconst uint32_t max_iter = MAX_SEARCHES_AVX16 * MAX_SEARCHES_AVX16;\n+\n+\t/* split huge lookup (gt 256) into series of fixed size ones */\n+\twhile (num > max_iter) {\n+\t\tsearch_avx512x8x2(ctx, data, results, max_iter, categories);\n+\t\tdata += max_iter;\n+\t\tresults += max_iter * categories;\n+\t\tnum -= max_iter;\n+\t}\n+\n+\t/* select classify method based on number of remaining requests */\n+\tif (num >= MAX_SEARCHES_AVX16)\n+\t\treturn search_avx512x8x2(ctx, data, results, num, categories);\n \tif (num >= MAX_SEARCHES_SSE8)\n \t\treturn search_sse_8(ctx, data, results, num, categories);\n \tif (num >= MAX_SEARCHES_SSE4)\ndiff --git a/lib/librte_acl/acl_run_avx512x8.h b/lib/librte_acl/acl_run_avx512x8.h\nnew file mode 100644\nindex 0000000000..cfba0299ed\n--- /dev/null\n+++ b/lib/librte_acl/acl_run_avx512x8.h\n@@ -0,0 +1,642 @@\n+/* SPDX-License-Identifier: BSD-3-Clause\n+ * Copyright(c) 2020 Intel Corporation\n+ */\n+\n+#define MASK8_BIT\t(sizeof(__mmask8) * CHAR_BIT)\n+\n+#define NUM_AVX512X8X2\t(2 * MASK8_BIT)\n+#define MSK_AVX512X8X2\t(NUM_AVX512X8X2 - 1)\n+\n+/* num/mask of pointers per SIMD regs */\n+#define YMM_PTR_NUM\t(sizeof(__m256i) / sizeof(uintptr_t))\n+#define YMM_PTR_MSK\tRTE_LEN2MASK(YMM_PTR_NUM, uint32_t)\n+\n+static const rte_ymm_t ymm_match_mask = {\n+\t.u32 = {\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t},\n+};\n+\n+static const rte_ymm_t ymm_index_mask = {\n+\t.u32 = 
{\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t},\n+};\n+\n+static const rte_ymm_t ymm_trlo_idle = {\n+\t.u32 = {\n+\t\tRTE_ACL_IDLE_NODE,\n+\t\tRTE_ACL_IDLE_NODE,\n+\t\tRTE_ACL_IDLE_NODE,\n+\t\tRTE_ACL_IDLE_NODE,\n+\t\tRTE_ACL_IDLE_NODE,\n+\t\tRTE_ACL_IDLE_NODE,\n+\t\tRTE_ACL_IDLE_NODE,\n+\t\tRTE_ACL_IDLE_NODE,\n+\t},\n+};\n+\n+static const rte_ymm_t ymm_trhi_idle = {\n+\t.u32 = {\n+\t\t0, 0, 0, 0,\n+\t\t0, 0, 0, 0,\n+\t},\n+};\n+\n+static const rte_ymm_t ymm_shuffle_input = {\n+\t.u32 = {\n+\t\t0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,\n+\t\t0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,\n+\t},\n+};\n+\n+static const rte_ymm_t ymm_four_32 = {\n+\t.u32 = {\n+\t\t4, 4, 4, 4,\n+\t\t4, 4, 4, 4,\n+\t},\n+};\n+\n+static const rte_ymm_t ymm_idx_add = {\n+\t.u32 = {\n+\t\t0, 1, 2, 3,\n+\t\t4, 5, 6, 7,\n+\t},\n+};\n+\n+static const rte_ymm_t ymm_range_base = {\n+\t.u32 = {\n+\t\t0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,\n+\t\t0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,\n+\t},\n+};\n+\n+static const rte_ymm_t ymm_pminp = {\n+\t.u32 = {\n+\t\t0x00, 0x01, 0x02, 0x03,\n+\t\t0x08, 0x09, 0x0a, 0x0b,\n+\t},\n+};\n+\n+static const __mmask16 ymm_pmidx_msk = 0x55;\n+\n+static const rte_ymm_t ymm_pmidx[2] = {\n+\t[0] = {\n+\t\t.u32 = {\n+\t\t\t0, 0, 1, 0, 2, 0, 3, 0,\n+\t\t},\n+\t},\n+\t[1] = {\n+\t\t.u32 = {\n+\t\t\t4, 0, 5, 0, 6, 0, 7, 0,\n+\t\t},\n+\t},\n+};\n+\n+/*\n+ * unfortunately current AVX512 ISA doesn't provide ability for\n+ * gather load on a byte quantity. So we have to mimic it in SW,\n+ * by doing 4x1B scalar loads.\n+ */\n+static inline __m128i\n+_m256_mask_gather_epi8x4(__m256i pdata, __mmask8 mask)\n+{\n+\trte_xmm_t v;\n+\trte_ymm_t p;\n+\n+\tstatic const uint32_t zero;\n+\n+\tp.y = _mm256_mask_set1_epi64(pdata, mask ^ YMM_PTR_MSK,\n+\t\t(uintptr_t)&zero);\n+\n+\tv.u32[0] = *(uint8_t *)p.u64[0];\n+\tv.u32[1] = *(uint8_t *)p.u64[1];\n+\tv.u32[2] = *(uint8_t *)p.u64[2];\n+\tv.u32[3] = *(uint8_t *)p.u64[3];\n+\n+\treturn v.x;\n+}\n+\n+/*\n+ * Calculate the address of the next transition for\n+ * all types of nodes. Note that only DFA nodes and range\n+ * nodes actually transition to another node. Match\n+ * nodes not supposed to be encountered here.\n+ * For quad range nodes:\n+ * Calculate number of range boundaries that are less than the\n+ * input value. Range boundaries for each node are in signed 8 bit,\n+ * ordered from -128 to 127.\n+ * This is effectively a popcnt of bytes that are greater than the\n+ * input byte.\n+ * Single nodes are processed in the same ways as quad range nodes.\n+ */\n+static __rte_always_inline __m256i\n+calc_addr8(__m256i index_mask, __m256i next_input, __m256i shuffle_input,\n+\t__m256i four_32, __m256i range_base, __m256i tr_lo, __m256i tr_hi)\n+{\n+\t__mmask32 qm;\n+\t__mmask8 dfa_msk;\n+\t__m256i addr, in, node_type, r, t;\n+\t__m256i dfa_ofs, quad_ofs;\n+\n+\tt = _mm256_xor_si256(index_mask, index_mask);\n+\tin = _mm256_shuffle_epi8(next_input, shuffle_input);\n+\n+\t/* Calc node type and node addr */\n+\tnode_type = _mm256_andnot_si256(index_mask, tr_lo);\n+\taddr = _mm256_and_si256(index_mask, tr_lo);\n+\n+\t/* mask for DFA type(0) nodes */\n+\tdfa_msk = _mm256_cmpeq_epi32_mask(node_type, t);\n+\n+\t/* DFA calculations. 
*/\n+\tr = _mm256_srli_epi32(in, 30);\n+\tr = _mm256_add_epi8(r, range_base);\n+\tt = _mm256_srli_epi32(in, 24);\n+\tr = _mm256_shuffle_epi8(tr_hi, r);\n+\n+\tdfa_ofs = _mm256_sub_epi32(t, r);\n+\n+\t/* QUAD/SINGLE calculations. */\n+\tqm = _mm256_cmpgt_epi8_mask(in, tr_hi);\n+\tt = _mm256_maskz_set1_epi8(qm, (uint8_t)UINT8_MAX);\n+\tt = _mm256_lzcnt_epi32(t);\n+\tt = _mm256_srli_epi32(t, 3);\n+\tquad_ofs = _mm256_sub_epi32(four_32, t);\n+\n+\t/* blend DFA and QUAD/SINGLE. */\n+\tt = _mm256_mask_mov_epi32(quad_ofs, dfa_msk, dfa_ofs);\n+\n+\t/* calculate address for next transitions. */\n+\taddr = _mm256_add_epi32(addr, t);\n+\treturn addr;\n+}\n+\n+/*\n+ * Process 16 transitions in parallel.\n+ * tr_lo contains low 32 bits for 16 transition.\n+ * tr_hi contains high 32 bits for 16 transition.\n+ * next_input contains up to 4 input bytes for 16 flows.\n+ */\n+static __rte_always_inline __m256i\n+transition8(__m256i next_input, const uint64_t *trans, __m256i *tr_lo,\n+\t__m256i *tr_hi)\n+{\n+\tconst int32_t *tr;\n+\t__m256i addr;\n+\n+\ttr = (const int32_t *)(uintptr_t)trans;\n+\n+\t/* Calculate the address (array index) for all 8 transitions. */\n+\taddr = calc_addr8(ymm_index_mask.y, next_input, ymm_shuffle_input.y,\n+\t\tymm_four_32.y, ymm_range_base.y, *tr_lo, *tr_hi);\n+\n+\t/* load lower 32 bits of 16 transactions at once. */\n+\t*tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0]));\n+\n+\tnext_input = _mm256_srli_epi32(next_input, CHAR_BIT);\n+\n+\t/* load high 32 bits of 16 transactions at once. */\n+\t*tr_hi = _mm256_i32gather_epi32(tr + 1, addr, sizeof(trans[0]));\n+\n+\treturn next_input;\n+}\n+\n+/*\n+ * Execute first transition for up to 16 flows in parallel.\n+ * next_input should contain one input byte for up to 16 flows.\n+ * msk - mask of active flows.\n+ * tr_lo contains low 32 bits for up to 16 transitions.\n+ * tr_hi contains high 32 bits for up to 16 transitions.\n+ */\n+static __rte_always_inline void\n+first_trans8(const struct acl_flow_avx512 *flow, __m256i next_input,\n+\t__mmask8 msk, __m256i *tr_lo, __m256i *tr_hi)\n+{\n+\tconst int32_t *tr;\n+\t__m256i addr, root;\n+\n+\ttr = (const int32_t *)(uintptr_t)flow->trans;\n+\n+\taddr = _mm256_set1_epi32(UINT8_MAX);\n+\troot = _mm256_set1_epi32(flow->root_index);\n+\n+\taddr = _mm256_and_si256(next_input, addr);\n+\taddr = _mm256_add_epi32(root, addr);\n+\n+\t/* load lower 32 bits of 16 transactions at once. */\n+\t*tr_lo = _mm256_mmask_i32gather_epi32(*tr_lo, msk, addr, tr,\n+\t\tsizeof(flow->trans[0]));\n+\n+\t/* load high 32 bits of 16 transactions at once. 
*/\n+\t*tr_hi = _mm256_mmask_i32gather_epi32(*tr_hi, msk, addr, (tr + 1),\n+\t\tsizeof(flow->trans[0]));\n+}\n+\n+/*\n+ * Load and return next 4 input bytes for up to 16 flows in parallel.\n+ * pdata - 8x2 pointers to flow input data\n+ * mask - mask of active flows.\n+ * di - data indexes for these 16 flows.\n+ */\n+static inline __m256i\n+get_next_bytes_avx512x8(const struct acl_flow_avx512 *flow, __m256i pdata[2],\n+\tuint32_t msk, __m256i *di, uint32_t bnum)\n+{\n+\tconst int32_t *div;\n+\tuint32_t m[2];\n+\t__m256i one, zero, t, p[2];\n+\t__m128i inp[2];\n+\n+\tdiv = (const int32_t *)flow->data_index;\n+\n+\tone = _mm256_set1_epi32(1);\n+\tzero = _mm256_xor_si256(one, one);\n+\n+\t/* load data offsets for given indexes */\n+\tt = _mm256_mmask_i32gather_epi32(zero, msk, *di, div, sizeof(div[0]));\n+\n+\t/* increment data indexes */\n+\t*di = _mm256_mask_add_epi32(*di, msk, *di, one);\n+\n+\t/*\n+\t * unsigned expand 32-bit indexes to 64-bit\n+\t * (for later pointer arithmetic), i.e:\n+\t * for (i = 0; i != 16; i++)\n+\t *   p[i/8].u64[i%8] = (uint64_t)t.u32[i];\n+\t */\n+\tp[0] = _mm256_maskz_permutexvar_epi32(ymm_pmidx_msk, ymm_pmidx[0].y, t);\n+\tp[1] = _mm256_maskz_permutexvar_epi32(ymm_pmidx_msk, ymm_pmidx[1].y, t);\n+\n+\tp[0] = _mm256_add_epi64(p[0], pdata[0]);\n+\tp[1] = _mm256_add_epi64(p[1], pdata[1]);\n+\n+\t/* load input byte(s), either one or four */\n+\n+\tm[0] = msk & YMM_PTR_MSK;\n+\tm[1] = msk >> YMM_PTR_NUM;\n+\n+\tif (bnum == sizeof(uint8_t)) {\n+\t\tinp[0] = _m256_mask_gather_epi8x4(p[0], m[0]);\n+\t\tinp[1] = _m256_mask_gather_epi8x4(p[1], m[1]);\n+\t} else {\n+\t\tinp[0] = _mm256_mmask_i64gather_epi32(\n+\t\t\t\t_mm256_castsi256_si128(zero), m[0], p[0],\n+\t\t\t\tNULL, sizeof(uint8_t));\n+\t\tinp[1] = _mm256_mmask_i64gather_epi32(\n+\t\t\t\t_mm256_castsi256_si128(zero), m[1], p[1],\n+\t\t\t\tNULL, sizeof(uint8_t));\n+\t}\n+\n+\t/* squeeze input into one 512-bit register */\n+\treturn _mm256_permutex2var_epi32(_mm256_castsi128_si256(inp[0]),\n+\t\t\tymm_pminp.y,  _mm256_castsi128_si256(inp[1]));\n+}\n+\n+/*\n+ * Start up to 16 new flows.\n+ * num - number of flows to start\n+ * msk - mask of new flows.\n+ * pdata - pointers to flow input data\n+ * idx - match indexed for given flows\n+ * di - data indexes for these flows.\n+ */\n+static inline void\n+start_flow8(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk,\n+\t__m256i pdata[2], __m256i *idx, __m256i *di)\n+{\n+\tuint32_t n, m[2], nm[2];\n+\t__m256i ni, nd[2];\n+\n+\tm[0] = msk & YMM_PTR_MSK;\n+\tm[1] = msk >> YMM_PTR_NUM;\n+\n+\tn = __builtin_popcount(m[0]);\n+\tnm[0] = (1 << n) - 1;\n+\tnm[1] = (1 << (num - n)) - 1;\n+\n+\t/* load input data pointers for new flows */\n+\tnd[0] = _mm256_maskz_loadu_epi64(nm[0],\n+\t\tflow->idata + flow->num_packets);\n+\tnd[1] = _mm256_maskz_loadu_epi64(nm[1],\n+\t\tflow->idata + flow->num_packets + n);\n+\n+\t/* calculate match indexes of new flows */\n+\tni = _mm256_set1_epi32(flow->num_packets);\n+\tni = _mm256_add_epi32(ni, ymm_idx_add.y);\n+\n+\t/* merge new and existing flows data */\n+\tpdata[0] = _mm256_mask_expand_epi64(pdata[0], m[0], nd[0]);\n+\tpdata[1] = _mm256_mask_expand_epi64(pdata[1], m[1], nd[1]);\n+\n+\t/* update match and data indexes */\n+\t*idx = _mm256_mask_expand_epi32(*idx, msk, ni);\n+\t*di = _mm256_maskz_mov_epi32(msk ^ UINT8_MAX, *di);\n+\n+\tflow->num_packets += num;\n+}\n+\n+/*\n+ * Process found matches for up to 16 flows.\n+ * fmsk - mask of active flows\n+ * rmsk - mask of found matches\n+ * pdata - pointers to flow input data\n+ * di - 
data indexes for these flows\n+ * idx - match indexed for given flows\n+ * tr_lo contains low 32 bits for up to 8 transitions.\n+ * tr_hi contains high 32 bits for up to 8 transitions.\n+ */\n+static inline uint32_t\n+match_process_avx512x8(struct acl_flow_avx512 *flow, uint32_t *fmsk,\n+\tuint32_t *rmsk, __m256i pdata[2], __m256i *di, __m256i *idx,\n+\t__m256i *tr_lo, __m256i *tr_hi)\n+{\n+\tuint32_t n;\n+\t__m256i res;\n+\n+\tif (rmsk[0] == 0)\n+\t\treturn 0;\n+\n+\t/* extract match indexes */\n+\tres = _mm256_and_si256(tr_lo[0], ymm_index_mask.y);\n+\n+\t/* mask  matched transitions to nop */\n+\ttr_lo[0] = _mm256_mask_mov_epi32(tr_lo[0], rmsk[0], ymm_trlo_idle.y);\n+\ttr_hi[0] = _mm256_mask_mov_epi32(tr_hi[0], rmsk[0], ymm_trhi_idle.y);\n+\n+\t/* save found match indexes */\n+\t_mm256_mask_i32scatter_epi32(flow->matches, rmsk[0],\n+\t\tidx[0], res, sizeof(flow->matches[0]));\n+\n+\t/* update masks and start new flows for matches */\n+\tn = update_flow_mask(flow, fmsk, rmsk);\n+\tstart_flow8(flow, n, rmsk[0], pdata, idx, di);\n+\n+\treturn n;\n+}\n+\n+/*\n+ * Test for matches ut to 32 (2x16) flows at once,\n+ * if matches exist - process them and start new flows.\n+ */\n+static inline void\n+match_check_process_avx512x8x2(struct acl_flow_avx512 *flow, uint32_t fm[2],\n+\t__m256i pdata[4], __m256i di[2], __m256i idx[2], __m256i inp[2],\n+\t__m256i tr_lo[2], __m256i tr_hi[2])\n+{\n+\tuint32_t n[2];\n+\tuint32_t rm[2];\n+\n+\t/* check for matches */\n+\trm[0] = _mm256_test_epi32_mask(tr_lo[0], ymm_match_mask.y);\n+\trm[1] = _mm256_test_epi32_mask(tr_lo[1], ymm_match_mask.y);\n+\n+\t/* till unprocessed matches exist */\n+\twhile ((rm[0] | rm[1]) != 0) {\n+\n+\t\t/* process matches and start new flows */\n+\t\tn[0] = match_process_avx512x8(flow, &fm[0], &rm[0], &pdata[0],\n+\t\t\t&di[0], &idx[0], &tr_lo[0], &tr_hi[0]);\n+\t\tn[1] = match_process_avx512x8(flow, &fm[1], &rm[1], &pdata[2],\n+\t\t\t&di[1], &idx[1], &tr_lo[1], &tr_hi[1]);\n+\n+\t\t/* execute first transition for new flows, if any */\n+\n+\t\tif (n[0] != 0) {\n+\t\t\tinp[0] = get_next_bytes_avx512x8(flow, &pdata[0],\n+\t\t\t\trm[0], &di[0], sizeof(uint8_t));\n+\t\t\tfirst_trans8(flow, inp[0], rm[0], &tr_lo[0],\n+\t\t\t\t&tr_hi[0]);\n+\t\t\trm[0] = _mm256_test_epi32_mask(tr_lo[0],\n+\t\t\t\tymm_match_mask.y);\n+\t\t}\n+\n+\t\tif (n[1] != 0) {\n+\t\t\tinp[1] = get_next_bytes_avx512x8(flow, &pdata[2],\n+\t\t\t\trm[1], &di[1], sizeof(uint8_t));\n+\t\t\tfirst_trans8(flow, inp[1], rm[1], &tr_lo[1],\n+\t\t\t\t&tr_hi[1]);\n+\t\t\trm[1] = _mm256_test_epi32_mask(tr_lo[1],\n+\t\t\t\tymm_match_mask.y);\n+\t\t}\n+\t}\n+}\n+\n+/*\n+ * Perform search for up to 32 flows in parallel.\n+ * Use two sets of metadata, each serves 16 flows max.\n+ * So in fact we perform search for 2x16 flows.\n+ */\n+static inline void\n+search_trie_avx512x8x2(struct acl_flow_avx512 *flow)\n+{\n+\tuint32_t fm[2];\n+\t__m256i di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2];\n+\n+\t/* first 1B load */\n+\tstart_flow8(flow, MASK8_BIT, UINT8_MAX, &pdata[0], &idx[0], &di[0]);\n+\tstart_flow8(flow, MASK8_BIT, UINT8_MAX, &pdata[2], &idx[1], &di[1]);\n+\n+\tin[0] = get_next_bytes_avx512x8(flow, &pdata[0], UINT8_MAX, &di[0],\n+\t\t\tsizeof(uint8_t));\n+\tin[1] = get_next_bytes_avx512x8(flow, &pdata[2], UINT8_MAX, &di[1],\n+\t\t\tsizeof(uint8_t));\n+\n+\tfirst_trans8(flow, in[0], UINT8_MAX, &tr_lo[0], &tr_hi[0]);\n+\tfirst_trans8(flow, in[1], UINT8_MAX, &tr_lo[1], &tr_hi[1]);\n+\n+\tfm[0] = UINT8_MAX;\n+\tfm[1] = UINT8_MAX;\n+\n+\t/* match check 
*/\n+\tmatch_check_process_avx512x8x2(flow, fm, pdata, di, idx, in,\n+\t\ttr_lo, tr_hi);\n+\n+\twhile ((fm[0] | fm[1]) != 0) {\n+\n+\t\t/* load next 4B */\n+\n+\t\tin[0] = get_next_bytes_avx512x8(flow, &pdata[0], fm[0],\n+\t\t\t&di[0], sizeof(uint32_t));\n+\t\tin[1] = get_next_bytes_avx512x8(flow, &pdata[2], fm[1],\n+\t\t\t&di[1], sizeof(uint32_t));\n+\n+\t\t/* main 4B loop */\n+\n+\t\tin[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);\n+\t\tin[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);\n+\n+\t\tin[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);\n+\t\tin[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);\n+\n+\t\tin[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);\n+\t\tin[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);\n+\n+\t\tin[0] = transition8(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);\n+\t\tin[1] = transition8(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);\n+\n+\t\t/* check for matches */\n+\t\tmatch_check_process_avx512x8x2(flow, fm, pdata, di, idx, in,\n+\t\t\ttr_lo, tr_hi);\n+\t}\n+}\n+\n+/*\n+ * resolve match index to actual result/priority offset.\n+ */\n+static inline __m256i\n+resolve_match_idx_avx512x8(__m256i mi)\n+{\n+\tRTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) !=\n+\t\t1 << (match_log + 2));\n+\treturn _mm256_slli_epi32(mi, match_log);\n+}\n+\n+/*\n+ * Resolve multiple matches for the same flow based on priority.\n+ */\n+static inline __m256i\n+resolve_pri_avx512x8(const int32_t res[], const int32_t pri[],\n+\tconst uint32_t match[], __mmask8 msk, uint32_t nb_trie,\n+\tuint32_t nb_skip)\n+{\n+\tuint32_t i;\n+\tconst uint32_t *pm;\n+\t__mmask16 m;\n+\t__m256i cp, cr, np, nr, mch;\n+\n+\tconst __m256i zero = _mm256_set1_epi32(0);\n+\n+\t/* get match indexes */\n+\tmch = _mm256_maskz_loadu_epi32(msk, match);\n+\tmch = resolve_match_idx_avx512x8(mch);\n+\n+\t/* read result and priority values for first trie */\n+\tcr = _mm256_mmask_i32gather_epi32(zero, msk, mch, res, sizeof(res[0]));\n+\tcp = _mm256_mmask_i32gather_epi32(zero, msk, mch, pri, sizeof(pri[0]));\n+\n+\t/*\n+\t * read result and priority values for next tries and select one\n+\t * with highest priority.\n+\t */\n+\tfor (i = 1, pm = match + nb_skip; i != nb_trie;\n+\t\t\ti++, pm += nb_skip) {\n+\n+\t\tmch = _mm256_maskz_loadu_epi32(msk, pm);\n+\t\tmch = resolve_match_idx_avx512x8(mch);\n+\n+\t\tnr = _mm256_mmask_i32gather_epi32(zero, msk, mch, res,\n+\t\t\tsizeof(res[0]));\n+\t\tnp = _mm256_mmask_i32gather_epi32(zero, msk, mch, pri,\n+\t\t\tsizeof(pri[0]));\n+\n+\t\tm = _mm256_cmpgt_epi32_mask(cp, np);\n+\t\tcr = _mm256_mask_mov_epi32(nr, m, cr);\n+\t\tcp = _mm256_mask_mov_epi32(np, m, cp);\n+\t}\n+\n+\treturn cr;\n+}\n+\n+/*\n+ * Resolve num (<= 8) matches for single category\n+ */\n+static inline void\n+resolve_sc_avx512x8(uint32_t result[], const int32_t res[],\n+\tconst int32_t pri[], const uint32_t match[], uint32_t nb_pkt,\n+\tuint32_t nb_trie, uint32_t nb_skip)\n+{\n+\t__mmask8 msk;\n+\t__m256i cr;\n+\n+\tmsk = (1 << nb_pkt) - 1;\n+\tcr = resolve_pri_avx512x8(res, pri, match, msk, nb_trie, nb_skip);\n+\t_mm256_mask_storeu_epi32(result, msk, cr);\n+}\n+\n+/*\n+ * Resolve matches for single category\n+ */\n+static inline void\n+resolve_sc_avx512x8x2(uint32_t result[],\n+\tconst struct rte_acl_match_results pr[], const uint32_t match[],\n+\tuint32_t nb_pkt, uint32_t nb_trie)\n+{\n+\tuint32_t j, k, n;\n+\tconst int32_t *res, *pri;\n+\t__m256i cr[2];\n+\n+\tres = (const int32_t *)pr->results;\n+\tpri = pr->priority;\n+\n+\tfor (k = 
0; k != (nb_pkt & ~MSK_AVX512X8X2); k += NUM_AVX512X8X2) {\n+\n+\t\tj = k + MASK8_BIT;\n+\n+\t\tcr[0] = resolve_pri_avx512x8(res, pri, match + k, UINT8_MAX,\n+\t\t\t\tnb_trie, nb_pkt);\n+\t\tcr[1] = resolve_pri_avx512x8(res, pri, match + j, UINT8_MAX,\n+\t\t\t\tnb_trie, nb_pkt);\n+\n+\t\t_mm256_storeu_si256((void *)(result + k), cr[0]);\n+\t\t_mm256_storeu_si256((void *)(result + j), cr[1]);\n+\t}\n+\n+\tn = nb_pkt - k;\n+\tif (n != 0) {\n+\t\tif (n > MASK8_BIT) {\n+\t\t\tresolve_sc_avx512x8(result + k, res, pri, match + k,\n+\t\t\t\tMASK8_BIT, nb_trie, nb_pkt);\n+\t\t\tk += MASK8_BIT;\n+\t\t\tn -= MASK8_BIT;\n+\t\t}\n+\t\tresolve_sc_avx512x8(result + k, res, pri, match + k, n,\n+\t\t\t\tnb_trie, nb_pkt);\n+\t}\n+}\n+\n+static inline int\n+search_avx512x8x2(const struct rte_acl_ctx *ctx, const uint8_t **data,\n+\tuint32_t *results, uint32_t total_packets, uint32_t categories)\n+{\n+\tuint32_t i, *pm;\n+\tconst struct rte_acl_match_results *pr;\n+\tstruct acl_flow_avx512 flow;\n+\tuint32_t match[ctx->num_tries * total_packets];\n+\n+\tfor (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) {\n+\n+\t\t/* setup for next trie */\n+\t\tacl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);\n+\n+\t\t/* process the trie */\n+\t\tsearch_trie_avx512x8x2(&flow);\n+\t}\n+\n+\t/* resolve matches */\n+\tpr = (const struct rte_acl_match_results *)\n+\t\t(ctx->trans_table + ctx->match_index);\n+\n+\tif (categories == 1)\n+\t\tresolve_sc_avx512x8x2(results, pr, match, total_packets,\n+\t\t\tctx->num_tries);\n+\telse\n+\t\tresolve_mcle8_avx512x1(results, pr, match, total_packets,\n+\t\t\tcategories, ctx->num_tries);\n+\n+\treturn 0;\n+}\n",
    "prefixes": [
        "v3",
        "08/14"
    ]
}
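
As a usage note, the "mbox" URL in the response above serves the raw patch in mbox form, which git am can apply directly. A minimal Python sketch, again assuming the third-party requests package; the output file name is an arbitrary choice for this example:

    import requests

    # "mbox" link taken verbatim from the JSON response above.
    mbox_url = ("http://patches.dpdk.org/project/dpdk/patch/"
                "20201005184526.7465-9-konstantin.ananyev@intel.com/mbox/")

    resp = requests.get(mbox_url, timeout=30)
    resp.raise_for_status()

    # Save the raw patch; apply it locally with: git am 79711.mbox
    with open("79711.mbox", "wb") as f:
        f.write(resp.content)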