get:
Show a patch.

patch:
Update a patch.

put:
Update a patch.

GET /api/patches/75316/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 75316,
    "url": "http://patches.dpdk.org/api/patches/75316/?format=api",
    "web_url": "http://patches.dpdk.org/project/dpdk/patch/20200807162829.11690-7-konstantin.ananyev@intel.com/",
    "project": {
        "id": 1,
        "url": "http://patches.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<20200807162829.11690-7-konstantin.ananyev@intel.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/20200807162829.11690-7-konstantin.ananyev@intel.com",
    "date": "2020-08-07T16:28:28",
    "name": "[20.11,6/7] acl: introduce AVX512 classify implementation",
    "commit_ref": null,
    "pull_url": null,
    "state": "superseded",
    "archived": true,
    "hash": "b6027b85fedf0309f0513c649ff0ad4eaa760de1",
    "submitter": {
        "id": 33,
        "url": "http://patches.dpdk.org/api/people/33/?format=api",
        "name": "Ananyev, Konstantin",
        "email": "konstantin.ananyev@intel.com"
    },
    "delegate": {
        "id": 1,
        "url": "http://patches.dpdk.org/api/users/1/?format=api",
        "username": "tmonjalo",
        "first_name": "Thomas",
        "last_name": "Monjalon",
        "email": "thomas@monjalon.net"
    },
    "mbox": "http://patches.dpdk.org/project/dpdk/patch/20200807162829.11690-7-konstantin.ananyev@intel.com/mbox/",
    "series": [
        {
            "id": 11551,
            "url": "http://patches.dpdk.org/api/series/11551/?format=api",
            "web_url": "http://patches.dpdk.org/project/dpdk/list/?series=11551",
            "date": "2020-08-07T16:28:22",
            "name": "acl: introduce AVX512 classify method",
            "version": 1,
            "mbox": "http://patches.dpdk.org/series/11551/mbox/"
        }
    ],
    "comments": "http://patches.dpdk.org/api/patches/75316/comments/",
    "check": "warning",
    "checks": "http://patches.dpdk.org/api/patches/75316/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@inbox.dpdk.org",
        "Delivered-To": "patchwork@inbox.dpdk.org",
        "Received": [
            "from dpdk.org (dpdk.org [92.243.14.124])\n\tby inbox.dpdk.org (Postfix) with ESMTP id 32B63A04B0;\n\tFri,  7 Aug 2020 18:29:51 +0200 (CEST)",
            "from [92.243.14.124] (localhost [127.0.0.1])\n\tby dpdk.org (Postfix) with ESMTP id 54F651C0D4;\n\tFri,  7 Aug 2020 18:29:05 +0200 (CEST)",
            "from mga02.intel.com (mga02.intel.com [134.134.136.20])\n by dpdk.org (Postfix) with ESMTP id 156EF1C043\n for <dev@dpdk.org>; Fri,  7 Aug 2020 18:29:02 +0200 (CEST)",
            "from orsmga008.jf.intel.com ([10.7.209.65])\n by orsmga101.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;\n 07 Aug 2020 09:29:02 -0700",
            "from sivswdev08.ir.intel.com ([10.237.217.47])\n by orsmga008.jf.intel.com with ESMTP; 07 Aug 2020 09:29:01 -0700"
        ],
        "IronPort-SDR": [
            "\n WoMNPNtBEokqTHpzhA0RcWAf3FHcIH3ChZaA97sDxQr71p/8bj2lEqpznfPhGARgaB4bFwv42C\n pxsDXvAYhpLw==",
            "\n b/OupmSPajmgn65T2v9+fhpEPRCfioBQov2ZOdHHZ44q+lvpwFWoShs3QeTbcgYUqu5G9ISnsk\n Vo4RXH7Cflhw=="
        ],
        "X-IronPort-AV": [
            "E=McAfee;i=\"6000,8403,9706\"; a=\"141003486\"",
            "E=Sophos;i=\"5.75,446,1589266800\"; d=\"scan'208\";a=\"141003486\"",
            "E=Sophos;i=\"5.75,446,1589266800\"; d=\"scan'208\";a=\"323799750\""
        ],
        "X-Amp-Result": "SKIPPED(no attachment in message)",
        "X-Amp-File-Uploaded": "False",
        "X-ExtLoop1": "1",
        "From": "Konstantin Ananyev <konstantin.ananyev@intel.com>",
        "To": "dev@dpdk.org",
        "Cc": "jerinj@marvell.com, ruifeng.wang@arm.com, vladimir.medvedkin@intel.com,\n Konstantin Ananyev <konstantin.ananyev@intel.com>",
        "Date": "Fri,  7 Aug 2020 17:28:28 +0100",
        "Message-Id": "<20200807162829.11690-7-konstantin.ananyev@intel.com>",
        "X-Mailer": "git-send-email 2.18.0",
        "In-Reply-To": "<20200807162829.11690-1-konstantin.ananyev@intel.com>",
        "References": "<20200807162829.11690-1-konstantin.ananyev@intel.com>",
        "Subject": "[dpdk-dev] [PATCH 20.11 6/7] acl: introduce AVX512 classify\n\timplementation",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.15",
        "Precedence": "list",
        "List-Id": "DPDK patches and discussions <dev.dpdk.org>",
        "List-Unsubscribe": "<https://mails.dpdk.org/options/dev>,\n <mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://mails.dpdk.org/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<https://mails.dpdk.org/listinfo/dev>,\n <mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org",
        "Sender": "\"dev\" <dev-bounces@dpdk.org>"
    },
    "content": "Add search_avx512x8x2() which uses mostly 256-bit width\nregisters/instructions and is able to process up to 16 flows in\nparallel.\n\nSigned-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>\n---\n lib/librte_acl/acl_run_avx512.c   | 120 ++++++\n lib/librte_acl/acl_run_avx512x8.h | 614 ++++++++++++++++++++++++++++++\n 2 files changed, 734 insertions(+)\n create mode 100644 lib/librte_acl/acl_run_avx512x8.h",
    "diff": "diff --git a/lib/librte_acl/acl_run_avx512.c b/lib/librte_acl/acl_run_avx512.c\nindex 67274989d..8ee996679 100644\n--- a/lib/librte_acl/acl_run_avx512.c\n+++ b/lib/librte_acl/acl_run_avx512.c\n@@ -4,10 +4,130 @@\n \n #include \"acl_run_sse.h\"\n \n+/*sizeof(uint32_t) << match_log == sizeof(struct rte_acl_match_results)*/\n+static const uint32_t match_log = 5;\n+\n+struct acl_flow_avx512 {\n+\tuint32_t num_packets;       /* number of packets processed */\n+\tuint32_t total_packets;     /* max number of packets to process */\n+\tuint32_t root_index;        /* current root index */\n+\tconst uint64_t *trans;      /* transition table */\n+\tconst uint32_t *data_index; /* input data indexes */\n+\tconst uint8_t **idata;      /* input data */\n+\tuint32_t *matches;          /* match indexes */\n+};\n+\n+static inline void\n+acl_set_flow_avx512(struct acl_flow_avx512 *flow, const struct rte_acl_ctx *ctx,\n+\tuint32_t trie, const uint8_t *data[], uint32_t *matches,\n+\tuint32_t total_packets)\n+{\n+\tflow->num_packets = 0;\n+\tflow->total_packets = total_packets;\n+\tflow->root_index = ctx->trie[trie].root_index;\n+\tflow->trans = ctx->trans_table;\n+\tflow->data_index = ctx->trie[trie].data_index;\n+\tflow->idata = data;\n+\tflow->matches = matches;\n+}\n+\n+/*\n+ * Resolve matches for multiple categories (LE 8, use 128b instuctions/regs)\n+ */\n+static inline void\n+resolve_mcle8_avx512x1(uint32_t result[],\n+\tconst struct rte_acl_match_results pr[], const uint32_t match[],\n+\tuint32_t nb_pkt, uint32_t nb_cat, uint32_t nb_trie)\n+{\n+\tconst int32_t *pri;\n+\tconst uint32_t *pm, *res;\n+\tuint32_t i, j, k, mi, mn;\n+\t__mmask8 msk;\n+\txmm_t cp, cr, np, nr;\n+\n+\tres = pr->results;\n+\tpri = pr->priority;\n+\n+\tfor (k = 0; k != nb_pkt; k++, result += nb_cat) {\n+\n+\t\tmi = match[k] << match_log;\n+\n+\t\tfor (j = 0; j != nb_cat; j += RTE_ACL_RESULTS_MULTIPLIER) {\n+\n+\t\t\tcr = _mm_loadu_si128((const xmm_t *)(res + mi + j));\n+\t\t\tcp = _mm_loadu_si128((const xmm_t *)(pri + mi + j));\n+\n+\t\t\tfor (i = 1, pm = match + nb_pkt; i != nb_trie;\n+\t\t\t\ti++, pm += nb_pkt) {\n+\n+\t\t\t\tmn = j + (pm[k] << match_log);\n+\n+\t\t\t\tnr = _mm_loadu_si128((const xmm_t *)(res + mn));\n+\t\t\t\tnp = _mm_loadu_si128((const xmm_t *)(pri + mn));\n+\n+\t\t\t\tmsk = _mm_cmpgt_epi32_mask(cp, np);\n+\t\t\t\tcr = _mm_mask_mov_epi32(nr, msk, cr);\n+\t\t\t\tcp = _mm_mask_mov_epi32(np, msk, cp);\n+\t\t\t}\n+\n+\t\t\t_mm_storeu_si128((xmm_t *)(result + j), cr);\n+\t\t}\n+\t}\n+}\n+\n+/*\n+ * Resolve matches for multiple categories (GT 8, use 512b instuctions/regs)\n+ */\n+static inline void\n+resolve_mcgt8_avx512x1(uint32_t result[],\n+\tconst struct rte_acl_match_results pr[], const uint32_t match[],\n+\tuint32_t nb_pkt, uint32_t nb_cat, uint32_t nb_trie)\n+{\n+\tconst int32_t *pri;\n+\tconst uint32_t *pm, *res;\n+\tuint32_t i, k, mi;\n+\t__mmask16 cm, sm;\n+\t__m512i cp, cr, np, nr;\n+\n+\tconst uint32_t match_log = 5;\n+\n+\tres = pr->results;\n+\tpri = pr->priority;\n+\n+\tcm = (1 << nb_cat) - 1;\n+\n+\tfor (k = 0; k != nb_pkt; k++, result += nb_cat) {\n+\n+\t\tmi = match[k] << match_log;\n+\n+\t\tcr = _mm512_maskz_loadu_epi32(cm, res + mi);\n+\t\tcp = _mm512_maskz_loadu_epi32(cm, pri + mi);\n+\n+\t\tfor (i = 1, pm = match + nb_pkt; i != nb_trie;\n+\t\t\t\ti++, pm += nb_pkt) {\n+\n+\t\t\tmi = pm[k] << match_log;\n+\n+\t\t\tnr = _mm512_maskz_loadu_epi32(cm, res + mi);\n+\t\t\tnp = _mm512_maskz_loadu_epi32(cm, pri + mi);\n+\n+\t\t\tsm = _mm512_cmpgt_epi32_mask(cp, np);\n+\t\t\tcr = _mm512_mask_mov_epi32(nr, sm, cr);\n+\t\t\tcp = _mm512_mask_mov_epi32(np, sm, cp);\n+\t\t}\n+\n+\t\t_mm512_mask_storeu_epi32(result, cm, cr);\n+\t}\n+}\n+\n+#include \"acl_run_avx512x8.h\"\n+\n int\n rte_acl_classify_avx512(const struct rte_acl_ctx *ctx, const uint8_t **data,\n \tuint32_t *results, uint32_t num, uint32_t categories)\n {\n+\tif (num >= MAX_SEARCHES_AVX16)\n+\t\treturn search_avx512x8x2(ctx, data, results, num, categories);\n \tif (num >= MAX_SEARCHES_SSE8)\n \t\treturn search_sse_8(ctx, data, results, num, categories);\n \tif (num >= MAX_SEARCHES_SSE4)\ndiff --git a/lib/librte_acl/acl_run_avx512x8.h b/lib/librte_acl/acl_run_avx512x8.h\nnew file mode 100644\nindex 000000000..63b1d872f\n--- /dev/null\n+++ b/lib/librte_acl/acl_run_avx512x8.h\n@@ -0,0 +1,614 @@\n+/* SPDX-License-Identifier: BSD-3-Clause\n+ * Copyright(c) 2020 Intel Corporation\n+ */\n+\n+#define NUM_AVX512X8X2\t(2 * CHAR_BIT)\n+#define MSK_AVX512X8X2\t(NUM_AVX512X8X2 - 1)\n+\n+static const rte_ymm_t ymm_match_mask = {\n+\t.u32 = {\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t},\n+};\n+\n+static const rte_ymm_t ymm_index_mask = {\n+\t.u32 = {\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t},\n+};\n+\n+static const rte_ymm_t ymm_trlo_idle = {\n+\t.u32 = {\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t},\n+};\n+\n+static const rte_ymm_t ymm_trhi_idle = {\n+\t.u32 = {\n+\t\t0, 0, 0, 0,\n+\t\t0, 0, 0, 0,\n+\t},\n+};\n+\n+static const rte_ymm_t ymm_shuffle_input = {\n+\t.u32 = {\n+\t\t0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,\n+\t\t0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,\n+\t},\n+};\n+\n+static const rte_ymm_t ymm_four_32 = {\n+\t.u32 = {\n+\t\t4, 4, 4, 4,\n+\t\t4, 4, 4, 4,\n+\t},\n+};\n+\n+static const rte_ymm_t ymm_idx_add = {\n+\t.u32 = {\n+\t\t0, 1, 2, 3,\n+\t\t4, 5, 6, 7,\n+\t},\n+};\n+\n+static const rte_ymm_t ymm_range_base = {\n+\t.u32 = {\n+\t\t0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,\n+\t\t0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,\n+\t},\n+};\n+\n+/*\n+ * Calculate the address of the next transition for\n+ * all types of nodes. Note that only DFA nodes and range\n+ * nodes actually transition to another node. Match\n+ * nodes not supposed to be encountered here.\n+ * For quad range nodes:\n+ * Calculate number of range boundaries that are less than the\n+ * input value. Range boundaries for each node are in signed 8 bit,\n+ * ordered from -128 to 127.\n+ * This is effectively a popcnt of bytes that are greater than the\n+ * input byte.\n+ * Single nodes are processed in the same ways as quad range nodes.\n+ */\n+static __rte_always_inline ymm_t\n+calc_addr8(ymm_t index_mask, ymm_t next_input, ymm_t shuffle_input,\n+\tymm_t four_32, ymm_t range_base, ymm_t tr_lo, ymm_t tr_hi)\n+{\n+\tymm_t addr, in, node_type, r, t;\n+\tymm_t dfa_msk, dfa_ofs, quad_ofs;\n+\n+\tt = _mm256_xor_si256(index_mask, index_mask);\n+\tin = _mm256_shuffle_epi8(next_input, shuffle_input);\n+\n+\t/* Calc node type and node addr */\n+\tnode_type = _mm256_andnot_si256(index_mask, tr_lo);\n+\taddr = _mm256_and_si256(index_mask, tr_lo);\n+\n+\t/* mask for DFA type(0) nodes */\n+\tdfa_msk = _mm256_cmpeq_epi32(node_type, t);\n+\n+\t/* DFA calculations. */\n+\tr = _mm256_srli_epi32(in, 30);\n+\tr = _mm256_add_epi8(r, range_base);\n+\tt = _mm256_srli_epi32(in, 24);\n+\tr = _mm256_shuffle_epi8(tr_hi, r);\n+\n+\tdfa_ofs = _mm256_sub_epi32(t, r);\n+\n+\t/* QUAD/SINGLE calculations. */\n+\tt = _mm256_cmpgt_epi8(in, tr_hi);\n+\tt = _mm256_lzcnt_epi32(t);\n+\tt = _mm256_srli_epi32(t, 3);\n+\tquad_ofs = _mm256_sub_epi32(four_32, t);\n+\n+\t/* blend DFA and QUAD/SINGLE. */\n+\tt = _mm256_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);\n+\n+\t/* calculate address for next transitions. */\n+\taddr = _mm256_add_epi32(addr, t);\n+\treturn addr;\n+}\n+\n+/*\n+ * Process 8 transitions in parallel.\n+ * tr_lo contains low 32 bits for 8 transitions.\n+ * tr_hi contains high 32 bits for 8 transitions.\n+ * next_input contains up to 4 input bytes for 8 flows.\n+ */\n+static __rte_always_inline ymm_t\n+transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)\n+{\n+\tconst int32_t *tr;\n+\tymm_t addr;\n+\n+\ttr = (const int32_t *)(uintptr_t)trans;\n+\n+\t/* Calculate the address (array index) for all 8 transitions. */\n+\taddr = calc_addr8(ymm_index_mask.y, next_input, ymm_shuffle_input.y,\n+\t\tymm_four_32.y, ymm_range_base.y, *tr_lo, *tr_hi);\n+\n+\t/* load lower 32 bits of 8 transactions at once. */\n+\t*tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0]));\n+\n+\tnext_input = _mm256_srli_epi32(next_input, CHAR_BIT);\n+\n+\t/* load high 32 bits of 8 transactions at once. */\n+\t*tr_hi = _mm256_i32gather_epi32(tr + 1, addr, sizeof(trans[0]));\n+\n+\treturn next_input;\n+}\n+\n+/*\n+ * Execute first transition for up to 8 flows in parallel.\n+ * next_input should contain one input byte for up to 8 flows.\n+ * msk - mask of active flows.\n+ * tr_lo contains low 32 bits for up to 8 transitions.\n+ * tr_hi contains high 32 bits for up to 8 transitions.\n+ */\n+static __rte_always_inline void\n+first_trans8(const struct acl_flow_avx512 *flow, ymm_t next_input,\n+\t__mmask8 msk, ymm_t *tr_lo, ymm_t *tr_hi)\n+{\n+\tconst int32_t *tr;\n+\tymm_t addr, root;\n+\n+\ttr = (const int32_t *)(uintptr_t)flow->trans;\n+\n+\taddr = _mm256_set1_epi32(UINT8_MAX);\n+\troot = _mm256_set1_epi32(flow->root_index);\n+\n+\taddr = _mm256_and_si256(next_input, addr);\n+\taddr = _mm256_add_epi32(root, addr);\n+\n+\t/* load lower 32 bits of 8 transactions at once. */\n+\t*tr_lo = _mm256_mmask_i32gather_epi32(*tr_lo, msk, addr, tr,\n+\t\tsizeof(flow->trans[0]));\n+\n+\t/* load high 32 bits of 8 transactions at once. */\n+\t*tr_hi = _mm256_mmask_i32gather_epi32(*tr_hi, msk, addr, (tr + 1),\n+\t\tsizeof(flow->trans[0]));\n+}\n+\n+/*\n+ * Load and return next 4 input bytes for up to 8 flows in parallel.\n+ * pdata - 8 pointers to flow input data\n+ * mask - mask of active flows.\n+ * di - data indexes for these 8 flows.\n+ */\n+static inline ymm_t\n+get_next_4bytes_avx512x8(const struct acl_flow_avx512 *flow, __m512i pdata,\n+\t__mmask8 mask, ymm_t *di)\n+{\n+\tconst int32_t *div;\n+\tymm_t one, zero;\n+\tymm_t inp, t;\n+\t__m512i p;\n+\n+\tdiv = (const int32_t *)flow->data_index;\n+\n+\tone = _mm256_set1_epi32(1);\n+\tzero = _mm256_xor_si256(one, one);\n+\n+\t/* load data offsets for given indexes */\n+\tt = _mm256_mmask_i32gather_epi32(zero, mask, *di, div, sizeof(div[0]));\n+\n+\t/* increment data indexes */\n+\t*di = _mm256_mask_add_epi32(*di, mask, *di, one);\n+\n+\tp = _mm512_cvtepu32_epi64(t);\n+\tp = _mm512_add_epi64(p, pdata);\n+\n+\t/* load input bytes */\n+\tinp = _mm512_mask_i64gather_epi32(zero, mask, p, NULL, sizeof(uint8_t));\n+\treturn inp;\n+}\n+\n+/*\n+ * Start up to 8 new flows.\n+ * num - number of flows to start\n+ * msk - mask of new flows.\n+ * pdata - pointers to flow input data\n+ * di - data indexes for these flows.\n+ */\n+static inline void\n+start_flow8(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk,\n+\t__m512i *pdata, ymm_t *idx, ymm_t *di)\n+{\n+\tuint32_t nm;\n+\tymm_t ni;\n+\t__m512i nd;\n+\n+\t/* load input data pointers for new flows */\n+\tnm = (1 << num) - 1;\n+\tnd = _mm512_maskz_loadu_epi64(nm, flow->idata + flow->num_packets);\n+\n+\t/* calculate match indexes of new flows */\n+\tni = _mm256_set1_epi32(flow->num_packets);\n+\tni = _mm256_add_epi32(ni, ymm_idx_add.y);\n+\n+\t/* merge new and existing flows data */\n+\t*pdata = _mm512_mask_expand_epi64(*pdata, msk, nd);\n+\t*idx = _mm256_mask_expand_epi32(*idx, msk, ni);\n+\t*di = _mm256_maskz_mov_epi32(msk ^ UINT8_MAX, *di);\n+\n+\tflow->num_packets += num;\n+}\n+\n+/*\n+ * Update flow and result masks based on the number of unprocessed flows.\n+ */\n+static inline uint32_t\n+update_flow_mask8(const struct acl_flow_avx512 *flow, __mmask8 *fmsk,\n+\t__mmask8 *rmsk)\n+{\n+\tuint32_t i, j, k, m, n;\n+\n+\tfmsk[0] ^= rmsk[0];\n+\tm = rmsk[0];\n+\n+\tk = __builtin_popcount(m);\n+\tn = flow->total_packets - flow->num_packets;\n+\n+\tif (n < k) {\n+\t\t/* reduce mask */\n+\t\tfor (i = k - n; i != 0; i--) {\n+\t\t\tj = sizeof(m) * CHAR_BIT - 1 - __builtin_clz(m);\n+\t\t\tm ^= 1 << j;\n+\t\t}\n+\t} else\n+\t\tn = k;\n+\n+\trmsk[0] = m;\n+\tfmsk[0] |= rmsk[0];\n+\n+\treturn n;\n+}\n+\n+/*\n+ * Process found matches for up to 8 flows.\n+ * fmsk - mask of active flows\n+ * rmsk - maks of found matches\n+ * pdata - pointers to flow input data\n+ * di - data indexes for these flows\n+ * idx - match indexed for given flows\n+ * tr_lo contains low 32 bits for up to 8 transitions.\n+ * tr_hi contains high 32 bits for up to 8 transitions.\n+ */\n+static inline uint32_t\n+match_process_avx512x8(struct acl_flow_avx512 *flow, __mmask8 *fmsk,\n+\t__mmask8 *rmsk,\t__m512i *pdata, ymm_t *di, ymm_t *idx,\n+\tymm_t *tr_lo, ymm_t *tr_hi)\n+{\n+\tuint32_t n;\n+\tymm_t res;\n+\n+\tif (rmsk[0] == 0)\n+\t\treturn 0;\n+\n+\t/* extract match indexes */\n+\tres = _mm256_and_si256(tr_lo[0], ymm_index_mask.y);\n+\n+\t/* mask  matched transitions to nop */\n+\ttr_lo[0] = _mm256_mask_mov_epi32(tr_lo[0], rmsk[0], ymm_trlo_idle.y);\n+\ttr_hi[0] = _mm256_mask_mov_epi32(tr_hi[0], rmsk[0], ymm_trhi_idle.y);\n+\n+\t/* save found match indexes */\n+\t_mm256_mask_i32scatter_epi32(flow->matches, rmsk[0],\n+\t\tidx[0], res, sizeof(flow->matches[0]));\n+\n+\t/* update masks and start new flows for matches */\n+\tn = update_flow_mask8(flow, fmsk, rmsk);\n+\tstart_flow8(flow, n, rmsk[0], pdata, idx, di);\n+\n+\treturn n;\n+}\n+\n+\n+static inline void\n+match_check_process_avx512x8x2(struct acl_flow_avx512 *flow, __mmask8 fm[2],\n+\t__m512i pdata[2], ymm_t di[2], ymm_t idx[2], ymm_t inp[2],\n+\tymm_t tr_lo[2], ymm_t tr_hi[2])\n+{\n+\tuint32_t n[2];\n+\t__mmask8 rm[2];\n+\n+\t/* check for matches */\n+\trm[0] = _mm256_test_epi32_mask(tr_lo[0], ymm_match_mask.y);\n+\trm[1] = _mm256_test_epi32_mask(tr_lo[1], ymm_match_mask.y);\n+\n+\t/* till unprocessed matches exist */\n+\twhile ((rm[0] | rm[1]) != 0) {\n+\n+\t\t/* process matches and start new flows */\n+\t\tn[0] = match_process_avx512x8(flow, &fm[0], &rm[0], &pdata[0],\n+\t\t\t&di[0], &idx[0], &tr_lo[0], &tr_hi[0]);\n+\t\tn[1] = match_process_avx512x8(flow, &fm[1], &rm[1], &pdata[1],\n+\t\t\t&di[1], &idx[1], &tr_lo[1], &tr_hi[1]);\n+\n+\t\t/* execute first transition for new flows, if any */\n+\n+\t\tif (n[0] != 0) {\n+\t\t\tinp[0] = get_next_4bytes_avx512x8(flow, pdata[0], rm[0],\n+\t\t\t\t&di[0]);\n+\t\t\tfirst_trans8(flow, inp[0], rm[0], &tr_lo[0], &tr_hi[0]);\n+\n+\t\t\trm[0] = _mm256_test_epi32_mask(tr_lo[0],\n+\t\t\t\tymm_match_mask.y);\n+\t\t}\n+\n+\t\tif (n[1] != 0) {\n+\t\t\tinp[1] = get_next_4bytes_avx512x8(flow, pdata[1], rm[1],\n+\t\t\t\t&di[1]);\n+\t\t\tfirst_trans8(flow, inp[1], rm[1], &tr_lo[1], &tr_hi[1]);\n+\n+\t\t\trm[1] = _mm256_test_epi32_mask(tr_lo[1],\n+\t\t\t\tymm_match_mask.y);\n+\t\t}\n+\t}\n+}\n+\n+/*\n+ * Perform search for up to 16 flows in parallel.\n+ * Use two sets of metadata, each serves 8 flows max.\n+ * So in fact we perform search for 2x8 flows.\n+ */\n+static inline void\n+search_trie_avx512x8x2(struct acl_flow_avx512 *flow)\n+{\n+\t__mmask8 fm[2];\n+\t__m512i pdata[2];\n+\tymm_t di[2], idx[2], inp[2], tr_lo[2], tr_hi[2];\n+\n+\t/* first 1B load */\n+\tstart_flow8(flow, CHAR_BIT, UINT8_MAX, &pdata[0], &idx[0], &di[0]);\n+\tstart_flow8(flow, CHAR_BIT, UINT8_MAX, &pdata[1], &idx[1], &di[1]);\n+\n+\tinp[0] = get_next_4bytes_avx512x8(flow, pdata[0], UINT8_MAX, &di[0]);\n+\tinp[1] = get_next_4bytes_avx512x8(flow, pdata[1], UINT8_MAX, &di[1]);\n+\n+\tfirst_trans8(flow, inp[0], UINT8_MAX, &tr_lo[0], &tr_hi[0]);\n+\tfirst_trans8(flow, inp[1], UINT8_MAX, &tr_lo[1], &tr_hi[1]);\n+\n+\tfm[0] = UINT8_MAX;\n+\tfm[1] = UINT8_MAX;\n+\n+\t/* match check */\n+\tmatch_check_process_avx512x8x2(flow, fm, pdata, di, idx, inp,\n+\t\ttr_lo, tr_hi);\n+\n+\twhile ((fm[0] | fm[1]) != 0) {\n+\n+\t\t/* load next 4B */\n+\n+\t\tinp[0] = get_next_4bytes_avx512x8(flow, pdata[0], fm[0],\n+\t\t\t&di[0]);\n+\t\tinp[1] = get_next_4bytes_avx512x8(flow, pdata[1], fm[1],\n+\t\t\t&di[1]);\n+\n+\t\t/* main 4B loop */\n+\n+\t\tinp[0] = transition8(inp[0], flow->trans, &tr_lo[0], &tr_hi[0]);\n+\t\tinp[1] = transition8(inp[1], flow->trans, &tr_lo[1], &tr_hi[1]);\n+\n+\t\tinp[0] = transition8(inp[0], flow->trans, &tr_lo[0], &tr_hi[0]);\n+\t\tinp[1] = transition8(inp[1], flow->trans, &tr_lo[1], &tr_hi[1]);\n+\n+\t\tinp[0] = transition8(inp[0], flow->trans, &tr_lo[0], &tr_hi[0]);\n+\t\tinp[1] = transition8(inp[1], flow->trans, &tr_lo[1], &tr_hi[1]);\n+\n+\t\tinp[0] = transition8(inp[0], flow->trans, &tr_lo[0], &tr_hi[0]);\n+\t\tinp[1] = transition8(inp[1], flow->trans, &tr_lo[1], &tr_hi[1]);\n+\n+\t\t/* check for matches */\n+\t\tmatch_check_process_avx512x8x2(flow, fm, pdata, di, idx, inp,\n+\t\t\ttr_lo, tr_hi);\n+\t}\n+}\n+\n+/*\n+ * resolve match index to actual result/priority offset.\n+ */\n+static inline ymm_t\n+resolve_match_idx_avx512x8(ymm_t mi)\n+{\n+\tRTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) !=\n+\t\t1 << (match_log + 2));\n+\treturn _mm256_slli_epi32(mi, match_log);\n+}\n+\n+\n+/*\n+ * Resolve multiple matches for the same flow based on priority.\n+ */\n+static inline ymm_t\n+resolve_pri_avx512x8(const int32_t res[], const int32_t pri[],\n+\tconst uint32_t match[], __mmask8 msk, uint32_t nb_trie,\n+\tuint32_t nb_skip)\n+{\n+\tuint32_t i;\n+\tconst uint32_t *pm;\n+\t__mmask8 m;\n+\tymm_t cp, cr, np, nr, mch;\n+\n+\tconst ymm_t zero = _mm256_set1_epi32(0);\n+\n+\tmch = _mm256_maskz_loadu_epi32(msk, match);\n+\tmch = resolve_match_idx_avx512x8(mch);\n+\n+\tcr = _mm256_mmask_i32gather_epi32(zero, msk, mch, res, sizeof(res[0]));\n+\tcp = _mm256_mmask_i32gather_epi32(zero, msk, mch, pri, sizeof(pri[0]));\n+\n+\tfor (i = 1, pm = match + nb_skip; i != nb_trie;\n+\t\t\ti++, pm += nb_skip) {\n+\n+\t\tmch = _mm256_maskz_loadu_epi32(msk, pm);\n+\t\tmch = resolve_match_idx_avx512x8(mch);\n+\n+\t\tnr = _mm256_mmask_i32gather_epi32(zero, msk, mch, res,\n+\t\t\tsizeof(res[0]));\n+\t\tnp = _mm256_mmask_i32gather_epi32(zero, msk, mch, pri,\n+\t\t\tsizeof(pri[0]));\n+\n+\t\tm = _mm256_cmpgt_epi32_mask(cp, np);\n+\t\tcr = _mm256_mask_mov_epi32(nr, m, cr);\n+\t\tcp = _mm256_mask_mov_epi32(np, m, cp);\n+\t}\n+\n+\treturn cr;\n+}\n+\n+/*\n+ * Resolve num (<= 8) matches for single category\n+ */\n+static inline void\n+resolve_sc_avx512x8(uint32_t result[], const int32_t res[], const int32_t pri[],\n+\tconst uint32_t match[], uint32_t nb_pkt, uint32_t nb_trie,\n+\tuint32_t nb_skip)\n+{\n+\t__mmask8 msk;\n+\tymm_t cr;\n+\n+\tmsk = (1 << nb_pkt) - 1;\n+\tcr = resolve_pri_avx512x8(res, pri, match, msk, nb_trie, nb_skip);\n+\t_mm256_mask_storeu_epi32(result, msk, cr);\n+}\n+\n+/*\n+ * Resolve matches for single category\n+ */\n+static inline void\n+resolve_sc_avx512x8x2(uint32_t result[],\n+\tconst struct rte_acl_match_results pr[], const uint32_t match[],\n+\tuint32_t nb_pkt, uint32_t nb_trie)\n+{\n+\tuint32_t i, j, k, n;\n+\tconst uint32_t *pm;\n+\tconst int32_t *res, *pri;\n+\t__mmask8 m[2];\n+\tymm_t cp[2], cr[2], np[2], nr[2], mch[2];\n+\n+\tres = (const int32_t *)pr->results;\n+\tpri = pr->priority;\n+\n+\tfor (k = 0; k != (nb_pkt & ~MSK_AVX512X8X2); k += NUM_AVX512X8X2) {\n+\n+\t\tj = k + CHAR_BIT;\n+\n+\t\t/* load match indexes for first trie */\n+\t\tmch[0] = _mm256_loadu_si256((const ymm_t *)(match + k));\n+\t\tmch[1] = _mm256_loadu_si256((const ymm_t *)(match + j));\n+\n+\t\tmch[0] = resolve_match_idx_avx512x8(mch[0]);\n+\t\tmch[1] = resolve_match_idx_avx512x8(mch[1]);\n+\n+\t\t/* load matches and their priorities for first trie */\n+\n+\t\tcr[0] = _mm256_i32gather_epi32(res, mch[0], sizeof(res[0]));\n+\t\tcr[1] = _mm256_i32gather_epi32(res, mch[1], sizeof(res[0]));\n+\n+\t\tcp[0] = _mm256_i32gather_epi32(pri, mch[0], sizeof(pri[0]));\n+\t\tcp[1] = _mm256_i32gather_epi32(pri, mch[1], sizeof(pri[0]));\n+\n+\t\t/* select match with highest priority */\n+\t\tfor (i = 1, pm = match + nb_pkt; i != nb_trie;\n+\t\t\t\ti++, pm += nb_pkt) {\n+\n+\t\t\tmch[0] = _mm256_loadu_si256((const ymm_t *)(pm + k));\n+\t\t\tmch[1] = _mm256_loadu_si256((const ymm_t *)(pm + j));\n+\n+\t\t\tmch[0] = resolve_match_idx_avx512x8(mch[0]);\n+\t\t\tmch[1] = resolve_match_idx_avx512x8(mch[1]);\n+\n+\t\t\tnr[0] = _mm256_i32gather_epi32(res, mch[0],\n+\t\t\t\tsizeof(res[0]));\n+\t\t\tnr[1] = _mm256_i32gather_epi32(res, mch[1],\n+\t\t\t\tsizeof(res[0]));\n+\n+\t\t\tnp[0] = _mm256_i32gather_epi32(pri, mch[0],\n+\t\t\t\tsizeof(pri[0]));\n+\t\t\tnp[1] = _mm256_i32gather_epi32(pri, mch[1],\n+\t\t\t\tsizeof(pri[0]));\n+\n+\t\t\tm[0] = _mm256_cmpgt_epi32_mask(cp[0], np[0]);\n+\t\t\tm[1] = _mm256_cmpgt_epi32_mask(cp[1], np[1]);\n+\n+\t\t\tcr[0] = _mm256_mask_mov_epi32(nr[0], m[0], cr[0]);\n+\t\t\tcr[1] = _mm256_mask_mov_epi32(nr[1], m[1], cr[1]);\n+\n+\t\t\tcp[0] = _mm256_mask_mov_epi32(np[0], m[0], cp[0]);\n+\t\t\tcp[1] = _mm256_mask_mov_epi32(np[1], m[1], cp[1]);\n+\t\t}\n+\n+\t\t_mm256_storeu_si256((ymm_t *)(result + k), cr[0]);\n+\t\t_mm256_storeu_si256((ymm_t *)(result + j), cr[1]);\n+\t}\n+\n+\tn = nb_pkt - k;\n+\tif (n != 0) {\n+\t\tif (n > CHAR_BIT) {\n+\t\t\tresolve_sc_avx512x8(result + k, res, pri, match + k,\n+\t\t\t\tCHAR_BIT, nb_trie, nb_pkt);\n+\t\t\tk += CHAR_BIT;\n+\t\t\tn -= CHAR_BIT;\n+\t\t}\n+\t\tresolve_sc_avx512x8(result + k, res, pri, match + k, n,\n+\t\t\t\tnb_trie, nb_pkt);\n+\t}\n+}\n+\n+static inline int\n+search_avx512x8x2(const struct rte_acl_ctx *ctx, const uint8_t **data,\n+\tuint32_t *results, uint32_t total_packets, uint32_t categories)\n+{\n+\tuint32_t i, *pm;\n+\tconst struct rte_acl_match_results *pr;\n+\tstruct acl_flow_avx512 flow;\n+\tuint32_t match[ctx->num_tries * total_packets];\n+\n+\tfor (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) {\n+\n+\t\t/* setup for next trie */\n+\t\tacl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);\n+\n+\t\t/* process the trie */\n+\t\tsearch_trie_avx512x8x2(&flow);\n+\t}\n+\n+\t/* resolve matches */\n+\tpr = (const struct rte_acl_match_results *)\n+\t\t(ctx->trans_table + ctx->match_index);\n+\n+\tif (categories == 1)\n+\t\tresolve_sc_avx512x8x2(results, pr, match, total_packets,\n+\t\t\tctx->num_tries);\n+\telse if (categories <= RTE_ACL_MAX_CATEGORIES / 2)\n+\t\tresolve_mcle8_avx512x1(results, pr, match, total_packets,\n+\t\t\tcategories, ctx->num_tries);\n+\telse\n+\t\tresolve_mcgt8_avx512x1(results, pr, match, total_packets,\n+\t\t\tcategories, ctx->num_tries);\n+\n+\treturn 0;\n+}\n",
    "prefixes": [
        "20.11",
        "6/7"
    ]
}