get:
Show a patch.

patch:
Update a patch.

put:
Update a patch.

GET /api/patches/75317/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 75317,
    "url": "http://patches.dpdk.org/api/patches/75317/?format=api",
    "web_url": "http://patches.dpdk.org/project/dpdk/patch/20200807162829.11690-8-konstantin.ananyev@intel.com/",
    "project": {
        "id": 1,
        "url": "http://patches.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<20200807162829.11690-8-konstantin.ananyev@intel.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/20200807162829.11690-8-konstantin.ananyev@intel.com",
    "date": "2020-08-07T16:28:29",
    "name": "[20.11,7/7] acl: enhance AVX512 classify implementation",
    "commit_ref": null,
    "pull_url": null,
    "state": "superseded",
    "archived": true,
    "hash": "bb8d336e212c8ae9826f6a03753764b46723a30c",
    "submitter": {
        "id": 33,
        "url": "http://patches.dpdk.org/api/people/33/?format=api",
        "name": "Ananyev, Konstantin",
        "email": "konstantin.ananyev@intel.com"
    },
    "delegate": {
        "id": 1,
        "url": "http://patches.dpdk.org/api/users/1/?format=api",
        "username": "tmonjalo",
        "first_name": "Thomas",
        "last_name": "Monjalon",
        "email": "thomas@monjalon.net"
    },
    "mbox": "http://patches.dpdk.org/project/dpdk/patch/20200807162829.11690-8-konstantin.ananyev@intel.com/mbox/",
    "series": [
        {
            "id": 11551,
            "url": "http://patches.dpdk.org/api/series/11551/?format=api",
            "web_url": "http://patches.dpdk.org/project/dpdk/list/?series=11551",
            "date": "2020-08-07T16:28:22",
            "name": "acl: introduce AVX512 classify method",
            "version": 1,
            "mbox": "http://patches.dpdk.org/series/11551/mbox/"
        }
    ],
    "comments": "http://patches.dpdk.org/api/patches/75317/comments/",
    "check": "fail",
    "checks": "http://patches.dpdk.org/api/patches/75317/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@inbox.dpdk.org",
        "Delivered-To": "patchwork@inbox.dpdk.org",
        "Received": [
            "from dpdk.org (dpdk.org [92.243.14.124])\n\tby inbox.dpdk.org (Postfix) with ESMTP id 6EF6DA04B0;\n\tFri,  7 Aug 2020 18:30:05 +0200 (CEST)",
            "from [92.243.14.124] (localhost [127.0.0.1])\n\tby dpdk.org (Postfix) with ESMTP id 8C2241C10A;\n\tFri,  7 Aug 2020 18:29:07 +0200 (CEST)",
            "from mga02.intel.com (mga02.intel.com [134.134.136.20])\n by dpdk.org (Postfix) with ESMTP id 658BD1C0D5\n for <dev@dpdk.org>; Fri,  7 Aug 2020 18:29:05 +0200 (CEST)",
            "from orsmga008.jf.intel.com ([10.7.209.65])\n by orsmga101.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;\n 07 Aug 2020 09:29:05 -0700",
            "from sivswdev08.ir.intel.com ([10.237.217.47])\n by orsmga008.jf.intel.com with ESMTP; 07 Aug 2020 09:29:03 -0700"
        ],
        "IronPort-SDR": [
            "\n FOE7kj67EQepSVXRTXas2AfFGOT8yyGIVxmkUvCRlUlhdbmLwVF4oh/XxLhHe3DzLUhIAqoHGD\n 5nWUTRxLVXNg==",
            "\n Bo20pzAUgiI5M56Eww2HPJF9mJfTOprVVkKZ8N352Sa1zU0jq8bwO9uvMfTI4n4otDMik0l7dG\n f6cMs8U1bxfw=="
        ],
        "X-IronPort-AV": [
            "E=McAfee;i=\"6000,8403,9706\"; a=\"141003530\"",
            "E=Sophos;i=\"5.75,446,1589266800\"; d=\"scan'208\";a=\"141003530\"",
            "E=Sophos;i=\"5.75,446,1589266800\"; d=\"scan'208\";a=\"323799756\""
        ],
        "X-Amp-Result": "SKIPPED(no attachment in message)",
        "X-Amp-File-Uploaded": "False",
        "X-ExtLoop1": "1",
        "From": "Konstantin Ananyev <konstantin.ananyev@intel.com>",
        "To": "dev@dpdk.org",
        "Cc": "jerinj@marvell.com, ruifeng.wang@arm.com, vladimir.medvedkin@intel.com,\n Konstantin Ananyev <konstantin.ananyev@intel.com>",
        "Date": "Fri,  7 Aug 2020 17:28:29 +0100",
        "Message-Id": "<20200807162829.11690-8-konstantin.ananyev@intel.com>",
        "X-Mailer": "git-send-email 2.18.0",
        "In-Reply-To": "<20200807162829.11690-1-konstantin.ananyev@intel.com>",
        "References": "<20200807162829.11690-1-konstantin.ananyev@intel.com>",
        "Subject": "[dpdk-dev] [PATCH 20.11 7/7] acl: enhance AVX512 classify\n\timplementation",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.15",
        "Precedence": "list",
        "List-Id": "DPDK patches and discussions <dev.dpdk.org>",
        "List-Unsubscribe": "<https://mails.dpdk.org/options/dev>,\n <mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://mails.dpdk.org/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<https://mails.dpdk.org/listinfo/dev>,\n <mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org",
        "Sender": "\"dev\" <dev-bounces@dpdk.org>"
    },
    "content": "Add search_avx512x16x2() which uses mostly 512-bit width\nregisters/instructions and is able to process up to 32 flows in\nparallel.\n\nSigned-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>\n---\n\nThese patch depends on:\nhttps://patches.dpdk.org/patch/70429/\nto be applied first.\n\n lib/librte_acl/acl_run_avx512.c    |   3 +\n lib/librte_acl/acl_run_avx512x16.h | 635 +++++++++++++++++++++++++++++\n 2 files changed, 638 insertions(+)\n create mode 100644 lib/librte_acl/acl_run_avx512x16.h",
    "diff": "diff --git a/lib/librte_acl/acl_run_avx512.c b/lib/librte_acl/acl_run_avx512.c\nindex 8ee996679..332e359fb 100644\n--- a/lib/librte_acl/acl_run_avx512.c\n+++ b/lib/librte_acl/acl_run_avx512.c\n@@ -121,11 +121,14 @@ resolve_mcgt8_avx512x1(uint32_t result[],\n }\n \n #include \"acl_run_avx512x8.h\"\n+#include \"acl_run_avx512x16.h\"\n \n int\n rte_acl_classify_avx512(const struct rte_acl_ctx *ctx, const uint8_t **data,\n \tuint32_t *results, uint32_t num, uint32_t categories)\n {\n+\tif (num >= 2 * MAX_SEARCHES_AVX16)\n+\t\treturn search_avx512x16x2(ctx, data, results, num, categories);\n \tif (num >= MAX_SEARCHES_AVX16)\n \t\treturn search_avx512x8x2(ctx, data, results, num, categories);\n \tif (num >= MAX_SEARCHES_SSE8)\ndiff --git a/lib/librte_acl/acl_run_avx512x16.h b/lib/librte_acl/acl_run_avx512x16.h\nnew file mode 100644\nindex 000000000..53216bda3\n--- /dev/null\n+++ b/lib/librte_acl/acl_run_avx512x16.h\n@@ -0,0 +1,635 @@\n+/* SPDX-License-Identifier: BSD-3-Clause\n+ * Copyright(c) 2020 Intel Corporation\n+ */\n+\n+#define\tMASK16_BIT\t(sizeof(__mmask16) * CHAR_BIT)\n+\n+#define NUM_AVX512X16X2\t(2 * MASK16_BIT)\n+#define MSK_AVX512X16X2\t(NUM_AVX512X16X2 - 1)\n+\n+static const __rte_x86_zmm_t zmm_match_mask = {\n+\t.u32 = {\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t\tRTE_ACL_NODE_MATCH,\n+\t},\n+};\n+\n+static const __rte_x86_zmm_t zmm_index_mask = {\n+\t.u32 = {\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t\tRTE_ACL_NODE_INDEX,\n+\t},\n+};\n+\n+static const __rte_x86_zmm_t zmm_trlo_idle = {\n+\t.u32 = {\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t\tRTE_ACL_DFA_SIZE | RTE_ACL_NODE_SINGLE,\n+\t},\n+};\n+\n+static const __rte_x86_zmm_t zmm_trhi_idle = {\n+\t.u32 = {\n+\t\t0, 0, 0, 0,\n+\t\t0, 0, 0, 0,\n+\t\t0, 0, 0, 0,\n+\t\t0, 0, 0, 0,\n+\t},\n+};\n+\n+static const __rte_x86_zmm_t zmm_shuffle_input = {\n+\t.u32 = {\n+\t\t0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,\n+\t\t0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,\n+\t\t0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,\n+\t\t0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,\n+\t},\n+};\n+\n+static const __rte_x86_zmm_t zmm_four_32 = {\n+\t.u32 = {\n+\t\t4, 4, 4, 4,\n+\t\t4, 4, 4, 4,\n+\t\t4, 4, 4, 4,\n+\t\t4, 4, 4, 4,\n+\t},\n+};\n+\n+static const __rte_x86_zmm_t zmm_idx_add = {\n+\t.u32 = {\n+\t\t0, 1, 2, 3,\n+\t\t4, 5, 6, 7,\n+\t\t8, 9, 10, 11,\n+\t\t12, 13, 14, 15,\n+\t},\n+};\n+\n+static const __rte_x86_zmm_t zmm_range_base = {\n+\t.u32 = {\n+\t\t0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,\n+\t\t0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,\n+\t\t0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,\n+\t\t0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,\n+\t},\n+};\n+\n+/*\n+ * Calculate the address of the next transition for\n+ * all types of nodes. Note that only DFA nodes and range\n+ * nodes actually transition to another node. Match\n+ * nodes not supposed to be encountered here.\n+ * For quad range nodes:\n+ * Calculate number of range boundaries that are less than the\n+ * input value. Range boundaries for each node are in signed 8 bit,\n+ * ordered from -128 to 127.\n+ * This is effectively a popcnt of bytes that are greater than the\n+ * input byte.\n+ * Single nodes are processed in the same ways as quad range nodes.\n+ */\n+static __rte_always_inline __m512i\n+calc_addr16(__m512i index_mask, __m512i next_input, __m512i shuffle_input,\n+\t__m512i four_32, __m512i range_base, __m512i tr_lo, __m512i tr_hi)\n+{\n+\t__mmask64 qm;\n+\t__mmask16 dfa_msk;\n+\t__m512i addr, in, node_type, r, t;\n+\t__m512i dfa_ofs, quad_ofs;\n+\n+\tt = _mm512_xor_si512(index_mask, index_mask);\n+\tin = _mm512_shuffle_epi8(next_input, shuffle_input);\n+\n+\t/* Calc node type and node addr */\n+\tnode_type = _mm512_andnot_si512(index_mask, tr_lo);\n+\taddr = _mm512_and_si512(index_mask, tr_lo);\n+\n+\t/* mask for DFA type(0) nodes */\n+\tdfa_msk = _mm512_cmpeq_epi32_mask(node_type, t);\n+\n+\t/* DFA calculations. */\n+\tr = _mm512_srli_epi32(in, 30);\n+\tr = _mm512_add_epi8(r, range_base);\n+\tt = _mm512_srli_epi32(in, 24);\n+\tr = _mm512_shuffle_epi8(tr_hi, r);\n+\n+\tdfa_ofs = _mm512_sub_epi32(t, r);\n+\n+\t/* QUAD/SINGLE calculations. */\n+\tqm = _mm512_cmpgt_epi8_mask(in, tr_hi);\n+\tt = _mm512_maskz_set1_epi8(qm, (uint8_t)UINT8_MAX);\n+\tt = _mm512_lzcnt_epi32(t);\n+\tt = _mm512_srli_epi32(t, 3);\n+\tquad_ofs = _mm512_sub_epi32(four_32, t);\n+\n+\t/* blend DFA and QUAD/SINGLE. */\n+\tt = _mm512_mask_mov_epi32(quad_ofs, dfa_msk, dfa_ofs);\n+\n+\t/* calculate address for next transitions. */\n+\taddr = _mm512_add_epi32(addr, t);\n+\treturn addr;\n+}\n+\n+/*\n+ * Process 8 transitions in parallel.\n+ * tr_lo contains low 32 bits for 8 transition.\n+ * tr_hi contains high 32 bits for 8 transition.\n+ * next_input contains up to 4 input bytes for 8 flows.\n+ */\n+static __rte_always_inline __m512i\n+transition16(__m512i next_input, const uint64_t *trans, __m512i *tr_lo,\n+\t__m512i *tr_hi)\n+{\n+\tconst int32_t *tr;\n+\t__m512i addr;\n+\n+\ttr = (const int32_t *)(uintptr_t)trans;\n+\n+\t/* Calculate the address (array index) for all 8 transitions. */\n+\taddr = calc_addr16(zmm_index_mask.z, next_input, zmm_shuffle_input.z,\n+\t\tzmm_four_32.z, zmm_range_base.z, *tr_lo, *tr_hi);\n+\n+\t/* load lower 32 bits of 8 transactions at once. */\n+\t*tr_lo = _mm512_i32gather_epi32(addr, tr, sizeof(trans[0]));\n+\n+\tnext_input = _mm512_srli_epi32(next_input, CHAR_BIT);\n+\n+\t/* load high 32 bits of 8 transactions at once. */\n+\t*tr_hi = _mm512_i32gather_epi32(addr, (tr + 1), sizeof(trans[0]));\n+\n+\treturn next_input;\n+}\n+\n+static __rte_always_inline void\n+first_trans16(const struct acl_flow_avx512 *flow, __m512i next_input,\n+\t__mmask16 msk, __m512i *tr_lo, __m512i *tr_hi)\n+{\n+\tconst int32_t *tr;\n+\t__m512i addr, root;\n+\n+\ttr = (const int32_t *)(uintptr_t)flow->trans;\n+\n+\taddr = _mm512_set1_epi32(UINT8_MAX);\n+\troot = _mm512_set1_epi32(flow->root_index);\n+\n+\taddr = _mm512_and_si512(next_input, addr);\n+\taddr = _mm512_add_epi32(root, addr);\n+\n+\t/* load lower 32 bits of 8 transactions at once. */\n+\t*tr_lo = _mm512_mask_i32gather_epi32(*tr_lo, msk, addr, tr,\n+\t\tsizeof(flow->trans[0]));\n+\n+\t/* load high 32 bits of 8 transactions at once. */\n+\t*tr_hi = _mm512_mask_i32gather_epi32(*tr_hi, msk, addr, (tr + 1),\n+\t\tsizeof(flow->trans[0]));\n+}\n+\n+static inline __m512i\n+get_next_4bytes_avx512x16(const struct acl_flow_avx512 *flow, __m512i pdata[2],\n+\tuint32_t msk, __m512i *di)\n+{\n+\tconst int32_t *div;\n+\t__m512i one, zero, t, p[2];\n+\tymm_t inp[2];\n+\n+\tstatic const __rte_x86_zmm_t zmm_pminp = {\n+\t\t.u32 = {\n+\t\t\t0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,\n+\t\t\t0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,\n+\t\t},\n+\t};\n+\n+\tconst __mmask16 pmidx_msk = 0x5555;\n+\n+\tstatic const __rte_x86_zmm_t zmm_pmidx[2] = {\n+\t\t[0] = {\n+\t\t\t.u32 = {\n+\t\t\t\t0, 0, 1, 0, 2, 0, 3, 0,\n+\t\t\t\t4, 0, 5, 0, 6, 0, 7, 0,\n+\t\t\t},\n+\t\t},\n+\t\t[1] = {\n+\t\t\t.u32 = {\n+\t\t\t\t8, 0, 9, 0, 10, 0, 11, 0,\n+\t\t\t\t12, 0, 13, 0, 14, 0, 15, 0,\n+\t\t\t},\n+\t\t},\n+\t};\n+\n+\tdiv = (const int32_t *)flow->data_index;\n+\n+\tone = _mm512_set1_epi32(1);\n+\tzero = _mm512_xor_si512(one, one);\n+\n+\tt = _mm512_mask_i32gather_epi32(zero, msk, *di, div, sizeof(div[0]));\n+\n+\t*di = _mm512_mask_add_epi32(*di, msk, *di, one);\n+\n+\tp[0] = _mm512_maskz_permutexvar_epi32(pmidx_msk, zmm_pmidx[0].z, t);\n+\tp[1] = _mm512_maskz_permutexvar_epi32(pmidx_msk, zmm_pmidx[1].z, t);\n+\n+\tp[0] = _mm512_add_epi64(p[0], pdata[0]);\n+\tp[1] = _mm512_add_epi64(p[1], pdata[1]);\n+\n+\tinp[0] = _mm512_mask_i64gather_epi32(_mm512_castsi512_si256(zero),\n+\t\t(msk & UINT8_MAX), p[0], NULL, sizeof(uint8_t));\n+\tinp[1] = _mm512_mask_i64gather_epi32(_mm512_castsi512_si256(zero),\n+\t\t(msk >> CHAR_BIT), p[1], NULL, sizeof(uint8_t));\n+\n+\treturn _mm512_permutex2var_epi32(_mm512_castsi256_si512(inp[0]),\n+\t\t\tzmm_pminp.z, _mm512_castsi256_si512(inp[1]));\n+}\n+\n+static inline void\n+start_flow16(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk,\n+\t__m512i pdata[2], __m512i *idx, __m512i *di)\n+{\n+\tuint32_t n, nm[2];\n+\t__m512i ni, nd[2];\n+\n+\tn = __builtin_popcount(msk & UINT8_MAX);\n+\tnm[0] = (1 << n) - 1;\n+\tnm[1] = (1 << (num - n)) - 1;\n+\n+\tnd[0] = _mm512_maskz_loadu_epi64(nm[0],\n+\t\tflow->idata + flow->num_packets);\n+\tnd[1] = _mm512_maskz_loadu_epi64(nm[1],\n+\t\tflow->idata + flow->num_packets + n);\n+\n+\tni = _mm512_set1_epi32(flow->num_packets);\n+\tni = _mm512_add_epi32(ni, zmm_idx_add.z);\n+\n+\tpdata[0] = _mm512_mask_expand_epi64(pdata[0], (msk & UINT8_MAX), nd[0]);\n+\tpdata[1] = _mm512_mask_expand_epi64(pdata[1], (msk >> CHAR_BIT), nd[1]);\n+\n+\t*idx = _mm512_mask_expand_epi32(*idx, msk, ni);\n+\t*di = _mm512_maskz_mov_epi32(msk ^ UINT16_MAX, *di);\n+\n+\tflow->num_packets += num;\n+}\n+\n+static inline uint32_t\n+update_flow_mask16(const struct acl_flow_avx512 *flow, __mmask16 *fmsk,\n+\t__mmask16 *rmsk)\n+{\n+\tuint32_t i, j, k, m, n;\n+\n+\tfmsk[0] ^= rmsk[0];\n+\tm = rmsk[0];\n+\n+\tk = __builtin_popcount(m);\n+\tn = flow->total_packets - flow->num_packets;\n+\n+\tif (n < k) {\n+\t\t/* reduce mask */\n+\t\tfor (i = k - n; i != 0; i--) {\n+\t\t\tj = sizeof(m) * CHAR_BIT - 1 - __builtin_clz(m);\n+\t\t\tm ^= 1 << j;\n+\t\t}\n+\t} else\n+\t\tn = k;\n+\n+\trmsk[0] = m;\n+\tfmsk[0] |= rmsk[0];\n+\n+\treturn n;\n+}\n+\n+static inline uint32_t\n+match_process_avx512x16(struct acl_flow_avx512 *flow, __mmask16 *fmsk,\n+\t__mmask16 *rmsk, __m512i pdata[2], __m512i *di, __m512i *idx,\n+\t__m512i *tr_lo, __m512i *tr_hi)\n+{\n+\tuint32_t n;\n+\t__m512i res;\n+\n+\tif (rmsk[0] == 0)\n+\t\treturn 0;\n+\n+\t/* extract match indexes */\n+\tres = _mm512_and_si512(tr_lo[0], zmm_index_mask.z);\n+\n+\t/* mask  matched transitions to nop */\n+\ttr_lo[0] = _mm512_mask_mov_epi32(tr_lo[0], rmsk[0], zmm_trlo_idle.z);\n+\ttr_hi[0] = _mm512_mask_mov_epi32(tr_hi[0], rmsk[0], zmm_trhi_idle.z);\n+\n+\t/* save found match indexes */\n+\t_mm512_mask_i32scatter_epi32(flow->matches, rmsk[0],\n+\t\tidx[0], res, sizeof(flow->matches[0]));\n+\n+\t/* update masks and start new flows for matches */\n+\tn = update_flow_mask16(flow, fmsk, rmsk);\n+\tstart_flow16(flow, n, rmsk[0], pdata, idx, di);\n+\n+\treturn n;\n+}\n+\n+static inline void\n+match_check_process_avx512x16x2(struct acl_flow_avx512 *flow, __mmask16 fm[2],\n+\t__m512i pdata[4], __m512i di[2], __m512i idx[2], __m512i inp[2],\n+\t__m512i tr_lo[2], __m512i tr_hi[2])\n+{\n+\tuint32_t n[2];\n+\t__mmask16 rm[2];\n+\n+\t/* check for matches */\n+\trm[0] = _mm512_test_epi32_mask(tr_lo[0], zmm_match_mask.z);\n+\trm[1] = _mm512_test_epi32_mask(tr_lo[1], zmm_match_mask.z);\n+\n+\twhile ((rm[0] | rm[1]) != 0) {\n+\n+\t\tn[0] = match_process_avx512x16(flow, &fm[0], &rm[0], &pdata[0],\n+\t\t\t&di[0], &idx[0], &tr_lo[0], &tr_hi[0]);\n+\t\tn[1] = match_process_avx512x16(flow, &fm[1], &rm[1], &pdata[2],\n+\t\t\t&di[1], &idx[1], &tr_lo[1], &tr_hi[1]);\n+\n+\t\tif (n[0] != 0) {\n+\t\t\tinp[0] = get_next_4bytes_avx512x16(flow, &pdata[0],\n+\t\t\t\trm[0], &di[0]);\n+\t\t\tfirst_trans16(flow, inp[0], rm[0], &tr_lo[0],\n+\t\t\t\t&tr_hi[0]);\n+\t\t\trm[0] = _mm512_test_epi32_mask(tr_lo[0],\n+\t\t\t\tzmm_match_mask.z);\n+\t\t}\n+\n+\t\tif (n[1] != 0) {\n+\t\t\tinp[1] = get_next_4bytes_avx512x16(flow, &pdata[2],\n+\t\t\t\trm[1], &di[1]);\n+\t\t\tfirst_trans16(flow, inp[1], rm[1], &tr_lo[1],\n+\t\t\t\t&tr_hi[1]);\n+\t\t\trm[1] = _mm512_test_epi32_mask(tr_lo[1],\n+\t\t\t\tzmm_match_mask.z);\n+\t\t}\n+\t}\n+}\n+\n+static inline void\n+search_trie_avx512x16x2(struct acl_flow_avx512 *flow)\n+{\n+\t__mmask16 fm[2];\n+\t__m512i di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2];\n+\n+\t/* first 1B load */\n+\tstart_flow16(flow, MASK16_BIT, UINT16_MAX, &pdata[0], &idx[0], &di[0]);\n+\tstart_flow16(flow, MASK16_BIT, UINT16_MAX, &pdata[2], &idx[1], &di[1]);\n+\n+\tin[0] = get_next_4bytes_avx512x16(flow, &pdata[0], UINT16_MAX, &di[0]);\n+\tin[1] = get_next_4bytes_avx512x16(flow, &pdata[2], UINT16_MAX, &di[1]);\n+\n+\tfirst_trans16(flow, in[0], UINT16_MAX, &tr_lo[0], &tr_hi[0]);\n+\tfirst_trans16(flow, in[1], UINT16_MAX, &tr_lo[1], &tr_hi[1]);\n+\n+\tfm[0] = UINT16_MAX;\n+\tfm[1] = UINT16_MAX;\n+\n+\t/* match check */\n+\tmatch_check_process_avx512x16x2(flow, fm, pdata, di, idx, in,\n+\t\ttr_lo, tr_hi);\n+\n+\twhile ((fm[0] | fm[1]) != 0) {\n+\n+\t\t/* load next 4B */\n+\n+\t\tin[0] = get_next_4bytes_avx512x16(flow, &pdata[0], fm[0],\n+\t\t\t&di[0]);\n+\t\tin[1] = get_next_4bytes_avx512x16(flow, &pdata[2], fm[1],\n+\t\t\t&di[1]);\n+\n+\t\t/* main 4B loop */\n+\n+\t\tin[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);\n+\t\tin[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);\n+\n+\t\tin[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);\n+\t\tin[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);\n+\n+\t\tin[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);\n+\t\tin[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);\n+\n+\t\tin[0] = transition16(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);\n+\t\tin[1] = transition16(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);\n+\n+\t\t/* check for matches */\n+\t\tmatch_check_process_avx512x16x2(flow, fm, pdata, di, idx, in,\n+\t\t\ttr_lo, tr_hi);\n+\t}\n+}\n+\n+static inline __m512i\n+resolve_match_idx_avx512x16(__m512i mi)\n+{\n+\tRTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) !=\n+\t\t1 << (match_log + 2));\n+\treturn _mm512_slli_epi32(mi, match_log);\n+}\n+\n+static inline __m512i\n+resolve_pri_avx512x16(const int32_t res[], const int32_t pri[],\n+\tconst uint32_t match[], __mmask16 msk, uint32_t nb_trie,\n+\tuint32_t nb_skip)\n+{\n+\tuint32_t i;\n+\tconst uint32_t *pm;\n+\t__mmask16 m;\n+\t__m512i cp, cr, np, nr, mch;\n+\n+\tconst __m512i zero = _mm512_set1_epi32(0);\n+\n+\tmch = _mm512_maskz_loadu_epi32(msk, match);\n+\tmch = resolve_match_idx_avx512x16(mch);\n+\n+\tcr = _mm512_mask_i32gather_epi32(zero, msk, mch, res, sizeof(res[0]));\n+\tcp = _mm512_mask_i32gather_epi32(zero, msk, mch, pri, sizeof(pri[0]));\n+\n+\tfor (i = 1, pm = match + nb_skip; i != nb_trie;\n+\t\t\ti++, pm += nb_skip) {\n+\n+\t\tmch = _mm512_maskz_loadu_epi32(msk, pm);\n+\t\tmch = resolve_match_idx_avx512x16(mch);\n+\n+\t\tnr = _mm512_mask_i32gather_epi32(zero, msk, mch, res,\n+\t\t\tsizeof(res[0]));\n+\t\tnp = _mm512_mask_i32gather_epi32(zero, msk, mch, pri,\n+\t\t\tsizeof(pri[0]));\n+\n+\t\tm = _mm512_cmpgt_epi32_mask(cp, np);\n+\t\tcr = _mm512_mask_mov_epi32(nr, m, cr);\n+\t\tcp = _mm512_mask_mov_epi32(np, m, cp);\n+\t}\n+\n+\treturn cr;\n+}\n+\n+/*\n+ * Resolve num (<= 16) matches for single category\n+ */\n+static inline void\n+resolve_sc_avx512x16(uint32_t result[], const int32_t res[],\n+\tconst int32_t pri[], const uint32_t match[], uint32_t nb_pkt,\n+\tuint32_t nb_trie, uint32_t nb_skip)\n+{\n+\t__mmask16 msk;\n+\t__m512i cr;\n+\n+\tmsk = (1 << nb_pkt) - 1;\n+\tcr = resolve_pri_avx512x16(res, pri, match, msk, nb_trie, nb_skip);\n+\t_mm512_mask_storeu_epi32(result, msk, cr);\n+}\n+\n+/*\n+ * Resolve matches for single category\n+ */\n+static inline void\n+resolve_sc_avx512x16x2(uint32_t result[],\n+\tconst struct rte_acl_match_results pr[], const uint32_t match[],\n+\tuint32_t nb_pkt, uint32_t nb_trie)\n+{\n+\tuint32_t i, j, k, n;\n+\tconst uint32_t *pm;\n+\tconst int32_t *res, *pri;\n+\t__mmask16 m[2];\n+\t__m512i cp[2], cr[2], np[2], nr[2], mch[2];\n+\n+\tres = (const int32_t *)pr->results;\n+\tpri = pr->priority;\n+\n+\tfor (k = 0; k != (nb_pkt & ~MSK_AVX512X16X2); k += NUM_AVX512X16X2) {\n+\n+\t\tj = k + MASK16_BIT;\n+\n+\t\t/* load match indexes for first trie */\n+\t\tmch[0] = _mm512_loadu_si512(match + k);\n+\t\tmch[1] = _mm512_loadu_si512(match + j);\n+\n+\t\tmch[0] = resolve_match_idx_avx512x16(mch[0]);\n+\t\tmch[1] = resolve_match_idx_avx512x16(mch[1]);\n+\n+\t\t/* load matches and their priorities for first trie */\n+\n+\t\tcr[0] = _mm512_i32gather_epi32(mch[0], res, sizeof(res[0]));\n+\t\tcr[1] = _mm512_i32gather_epi32(mch[1], res, sizeof(res[0]));\n+\n+\t\tcp[0] = _mm512_i32gather_epi32(mch[0], pri, sizeof(pri[0]));\n+\t\tcp[1] = _mm512_i32gather_epi32(mch[1], pri, sizeof(pri[0]));\n+\n+\t\t/* select match with highest priority */\n+\t\tfor (i = 1, pm = match + nb_pkt; i != nb_trie;\n+\t\t\t\ti++, pm += nb_pkt) {\n+\n+\t\t\tmch[0] = _mm512_loadu_si512(pm + k);\n+\t\t\tmch[1] = _mm512_loadu_si512(pm + j);\n+\n+\t\t\tmch[0] = resolve_match_idx_avx512x16(mch[0]);\n+\t\t\tmch[1] = resolve_match_idx_avx512x16(mch[1]);\n+\n+\t\t\tnr[0] = _mm512_i32gather_epi32(mch[0], res,\n+\t\t\t\tsizeof(res[0]));\n+\t\t\tnr[1] = _mm512_i32gather_epi32(mch[1], res,\n+\t\t\t\tsizeof(res[0]));\n+\n+\t\t\tnp[0] = _mm512_i32gather_epi32(mch[0], pri,\n+\t\t\t\tsizeof(pri[0]));\n+\t\t\tnp[1] = _mm512_i32gather_epi32(mch[1], pri,\n+\t\t\t\tsizeof(pri[0]));\n+\n+\t\t\tm[0] = _mm512_cmpgt_epi32_mask(cp[0], np[0]);\n+\t\t\tm[1] = _mm512_cmpgt_epi32_mask(cp[1], np[1]);\n+\n+\t\t\tcr[0] = _mm512_mask_mov_epi32(nr[0], m[0], cr[0]);\n+\t\t\tcr[1] = _mm512_mask_mov_epi32(nr[1], m[1], cr[1]);\n+\n+\t\t\tcp[0] = _mm512_mask_mov_epi32(np[0], m[0], cp[0]);\n+\t\t\tcp[1] = _mm512_mask_mov_epi32(np[1], m[1], cp[1]);\n+\t\t}\n+\n+\t\t_mm512_storeu_si512(result + k, cr[0]);\n+\t\t_mm512_storeu_si512(result + j, cr[1]);\n+\t}\n+\n+\tn = nb_pkt - k;\n+\tif (n != 0) {\n+\t\tif (n > MASK16_BIT) {\n+\t\t\tresolve_sc_avx512x16(result + k, res, pri, match + k,\n+\t\t\t\tMASK16_BIT, nb_trie, nb_pkt);\n+\t\t\tk += MASK16_BIT;\n+\t\t\tn -= MASK16_BIT;\n+\t\t}\n+\t\tresolve_sc_avx512x16(result + k, res, pri, match + k, n,\n+\t\t\t\tnb_trie, nb_pkt);\n+\t}\n+}\n+\n+static inline int\n+search_avx512x16x2(const struct rte_acl_ctx *ctx, const uint8_t **data,\n+\tuint32_t *results, uint32_t total_packets, uint32_t categories)\n+{\n+\tuint32_t i, *pm;\n+\tconst struct rte_acl_match_results *pr;\n+\tstruct acl_flow_avx512 flow;\n+\tuint32_t match[ctx->num_tries * total_packets];\n+\n+\tfor (i = 0, pm = match; i != ctx->num_tries; i++, pm += total_packets) {\n+\n+\t\t/* setup for next trie */\n+\t\tacl_set_flow_avx512(&flow, ctx, i, data, pm, total_packets);\n+\n+\t\t/* process the trie */\n+\t\tsearch_trie_avx512x16x2(&flow);\n+\t}\n+\n+\t/* resolve matches */\n+\tpr = (const struct rte_acl_match_results *)\n+\t\t(ctx->trans_table + ctx->match_index);\n+\n+\tif (categories == 1)\n+\t\tresolve_sc_avx512x16x2(results, pr, match, total_packets,\n+\t\t\tctx->num_tries);\n+\telse if (categories <= RTE_ACL_MAX_CATEGORIES / 2)\n+\t\tresolve_mcle8_avx512x1(results, pr, match, total_packets,\n+\t\t\tcategories, ctx->num_tries);\n+\telse\n+\t\tresolve_mcgt8_avx512x1(results, pr, match, total_packets,\n+\t\t\tcategories, ctx->num_tries);\n+\n+\treturn 0;\n+}\n",
    "prefixes": [
        "20.11",
        "7/7"
    ]
}