get:
Show a patch.

patch:
Update a patch.

put:
Update a patch.

GET /api/patches/2265/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 2265,
    "url": "https://patches.dpdk.org/api/patches/2265/?format=api",
    "web_url": "https://patches.dpdk.org/project/dpdk/patch/1421090181-17150-16-git-send-email-konstantin.ananyev@intel.com/",
    "project": {
        "id": 1,
        "url": "https://patches.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<1421090181-17150-16-git-send-email-konstantin.ananyev@intel.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/1421090181-17150-16-git-send-email-konstantin.ananyev@intel.com",
    "date": "2015-01-12T19:16:19",
    "name": "[dpdk-dev,v2,15/17] libte_acl: make calc_addr a define to deduplicate the code.",
    "commit_ref": null,
    "pull_url": null,
    "state": "superseded",
    "archived": true,
    "hash": "777ab3b0b8602677b05b5af132b84d8699812cf4",
    "submitter": {
        "id": 33,
        "url": "https://patches.dpdk.org/api/people/33/?format=api",
        "name": "Ananyev, Konstantin",
        "email": "konstantin.ananyev@intel.com"
    },
    "delegate": null,
    "mbox": "https://patches.dpdk.org/project/dpdk/patch/1421090181-17150-16-git-send-email-konstantin.ananyev@intel.com/mbox/",
    "series": [],
    "comments": "https://patches.dpdk.org/api/patches/2265/comments/",
    "check": "pending",
    "checks": "https://patches.dpdk.org/api/patches/2265/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@dpdk.org",
        "Delivered-To": "patchwork@dpdk.org",
        "Received": [
            "from [92.243.14.124] (localhost [IPv6:::1])\n\tby dpdk.org (Postfix) with ESMTP id 2A8AE5B0F;\n\tMon, 12 Jan 2015 20:17:12 +0100 (CET)",
            "from mga11.intel.com (mga11.intel.com [192.55.52.93])\n\tby dpdk.org (Postfix) with ESMTP id D003D5A7C\n\tfor <dev@dpdk.org>; Mon, 12 Jan 2015 20:16:40 +0100 (CET)",
            "from fmsmga001.fm.intel.com ([10.253.24.23])\n\tby fmsmga102.fm.intel.com with ESMTP; 12 Jan 2015 11:16:38 -0800",
            "from irvmail001.ir.intel.com ([163.33.26.43])\n\tby fmsmga001.fm.intel.com with ESMTP; 12 Jan 2015 11:16:37 -0800",
            "from sivswdev02.ir.intel.com (sivswdev02.ir.intel.com\n\t[10.237.217.46])\n\tby irvmail001.ir.intel.com (8.14.3/8.13.6/MailSET/Hub) with ESMTP id\n\tt0CJGarb008653; Mon, 12 Jan 2015 19:16:36 GMT",
            "from sivswdev02.ir.intel.com (localhost [127.0.0.1])\n\tby sivswdev02.ir.intel.com with ESMTP id t0CJGaLF017307;\n\tMon, 12 Jan 2015 19:16:36 GMT",
            "(from kananye1@localhost)\n\tby sivswdev02.ir.intel.com with  id t0CJGa2S017303;\n\tMon, 12 Jan 2015 19:16:36 GMT"
        ],
        "X-ExtLoop1": "1",
        "X-IronPort-AV": "E=Sophos;i=\"5.07,745,1413270000\"; d=\"scan'208\";a=\"649962547\"",
        "From": "Konstantin Ananyev <konstantin.ananyev@intel.com>",
        "To": "dev@dpdk.org",
        "Date": "Mon, 12 Jan 2015 19:16:19 +0000",
        "Message-Id": "<1421090181-17150-16-git-send-email-konstantin.ananyev@intel.com>",
        "X-Mailer": "git-send-email 1.7.4.1",
        "In-Reply-To": "<1421090181-17150-1-git-send-email-konstantin.ananyev@intel.com>",
        "References": "<1421090181-17150-1-git-send-email-konstantin.ananyev@intel.com>",
        "Subject": "[dpdk-dev] [PATCH v2 15/17] libte_acl: make calc_addr a define to\n\tdeduplicate the code.",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.15",
        "Precedence": "list",
        "List-Id": "patches and discussions about DPDK <dev.dpdk.org>",
        "List-Unsubscribe": "<http://dpdk.org/ml/options/dev>,\n\t<mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://dpdk.org/ml/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<http://dpdk.org/ml/listinfo/dev>,\n\t<mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org",
        "Sender": "\"dev\" <dev-bounces@dpdk.org>"
    },
    "content": "Vector code reorganisation/deduplication:\nTo avoid maintaining two nearly identical implementations of calc_addr()\n(one for SSE, another for AVX2), replace it with  a new macro that suits\nboth SSE and AVX2 code-paths.\nAlso remove no needed any more MM_* macros.\n\nSigned-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>\n---\n lib/librte_acl/acl_run_avx2.h                   |  87 +++++-------\n lib/librte_acl/acl_run_sse.h                    | 178 ++++++++----------------\n lib/librte_acl/acl_vect.h                       | 132 ++++++++----------\n lib/librte_eal/common/include/rte_common_vect.h |  12 ++\n 4 files changed, 160 insertions(+), 249 deletions(-)",
    "diff": "diff --git a/lib/librte_acl/acl_run_avx2.h b/lib/librte_acl/acl_run_avx2.h\nindex 1688c50..b01a46a 100644\n--- a/lib/librte_acl/acl_run_avx2.h\n+++ b/lib/librte_acl/acl_run_avx2.h\n@@ -73,51 +73,19 @@ static const rte_ymm_t ymm_ones_16 = {\n \t},\n };\n \n-static inline __attribute__((always_inline)) ymm_t\n-calc_addr_avx2(ymm_t index_mask, ymm_t next_input, ymm_t shuffle_input,\n-\tymm_t ones_16, ymm_t tr_lo, ymm_t tr_hi)\n-{\n-\tymm_t in, node_type, r, t;\n-\tymm_t dfa_msk, dfa_ofs, quad_ofs;\n-\tymm_t addr;\n-\n-\tconst ymm_t range_base = _mm256_set_epi32(\n-\t\t0xffffff0c, 0xffffff08, 0xffffff04, 0xffffff00,\n-\t\t0xffffff0c, 0xffffff08, 0xffffff04, 0xffffff00);\n-\n-\tt = _mm256_xor_si256(index_mask, index_mask);\n-\tin = _mm256_shuffle_epi8(next_input, shuffle_input);\n-\n-\t/* Calc node type and node addr */\n-\tnode_type = _mm256_andnot_si256(index_mask, tr_lo);\n-\taddr = _mm256_and_si256(index_mask, tr_lo);\n-\n-\t/* DFA calculations. */\n-\n-\tdfa_msk = _mm256_cmpeq_epi32(node_type, t);\n-\n-\tr = _mm256_srli_epi32(in, 30);\n-\tr = _mm256_add_epi8(r, range_base);\n-\n-\tt = _mm256_srli_epi32(in, 24);\n-\tr = _mm256_shuffle_epi8(tr_hi, r);\n-\n-\tdfa_ofs = _mm256_sub_epi32(t, r);\n-\n-\t/* QUAD/SINGLE caluclations. */\n-\n-\tt = _mm256_cmpgt_epi8(in, tr_hi);\n-\tt = _mm256_sign_epi8(t, t);\n-\tt = _mm256_maddubs_epi16(t, t);\n-\tquad_ofs = _mm256_madd_epi16(t, ones_16);\n-\n-\t/* blend DFA and QUAD/SINGLE. */\n-\tt = _mm256_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);\n-\n-\taddr = _mm256_add_epi32(addr, t);\n-\treturn addr;\n-}\n+static const rte_ymm_t ymm_range_base = {\n+\t.u32 = {\n+\t\t0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,\n+\t\t0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,\n+\t},\n+};\n \n+/*\n+ * Process 8 transitions in parallel.\n+ * tr_lo contains low 32 bits for 8 transition.\n+ * tr_hi contains high 32 bits for 8 transition.\n+ * next_input contains up to 4 input bytes for 8 flows.\n+ */\n static inline __attribute__((always_inline)) ymm_t\n transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)\n {\n@@ -126,8 +94,10 @@ transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)\n \n \ttr = (const int32_t *)(uintptr_t)trans;\n \n-\taddr = calc_addr_avx2(ymm_index_mask.y, next_input, ymm_shuffle_input.y,\n-\t\tymm_ones_16.y, *tr_lo, *tr_hi);\n+\t/* Calculate the address (array index) for all 8 transitions. */\n+\tACL_TR_CALC_ADDR(mm256, 256, addr, ymm_index_mask.y, next_input,\n+\t\tymm_shuffle_input.y, ymm_ones_16.y, ymm_range_base.y,\n+\t\t*tr_lo, *tr_hi);\n \n \t/* load lower 32 bits of 8 transactions at once. */\n \t*tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0]));\n@@ -140,6 +110,11 @@ transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)\n \treturn next_input;\n }\n \n+/*\n+ * Process matches for  8 flows.\n+ * tr_lo contains low 32 bits for 8 transition.\n+ * tr_hi contains high 32 bits for 8 transition.\n+ */\n static inline void\n acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,\n \tstruct parms *parms, struct acl_flow_data *flows, uint32_t slot,\n@@ -155,6 +130,11 @@ acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,\n \tl0 = _mm256_castsi256_si128(*tr_lo);\n \n \tfor (i = 0; i != RTE_DIM(tr) / 2; i++) {\n+\n+\t\t/*\n+\t\t * Extract low 32bits of each transition.\n+\t\t * That's enough to process the match.\n+\t\t */\n \t\ttr[i] = (uint32_t)_mm_cvtsi128_si32(l0);\n \t\ttr[i + 4] = (uint32_t)_mm_cvtsi128_si32(l1);\n \n@@ -167,12 +147,14 @@ acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,\n \t\t\tctx, parms, flows, resolve_priority_sse);\n \t}\n \n+\t/* Collect new transitions into 2 YMM registers. */\n \tt0 = _mm256_set_epi64x(tr[5], tr[4], tr[1], tr[0]);\n \tt1 = _mm256_set_epi64x(tr[7], tr[6], tr[3], tr[2]);\n \n-\tlo = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);\n-\thi = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);\n+\t/* For each transition: put low 32 into tr_lo and high 32 into tr_hi */\n+\tACL_TR_HILO(mm256, __m256, t0, t1, lo, hi);\n \n+\t/* Keep transitions wth NOMATCH intact. */\n \t*tr_lo = _mm256_blendv_epi8(*tr_lo, lo, matches);\n \t*tr_hi = _mm256_blendv_epi8(*tr_hi, hi, matches);\n }\n@@ -200,6 +182,9 @@ acl_match_check_avx2x8(const struct rte_acl_ctx *ctx, struct parms *parms,\n \t}\n }\n \n+/*\n+ * Execute trie traversal for up to 16 flows in parallel.\n+ */\n static inline int\n search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data,\n \tuint32_t *results, uint32_t total_packets, uint32_t categories)\n@@ -225,16 +210,14 @@ search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data,\n \tt1 = _mm256_set_epi64x(index_array[7], index_array[6],\n \t\tindex_array[3], index_array[2]);\n \n-\ttr_lo[0] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);\n-\ttr_hi[0] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);\n+\tACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[0], tr_hi[0]);\n \n \tt0 = _mm256_set_epi64x(index_array[13], index_array[12],\n \t\tindex_array[9], index_array[8]);\n \tt1 = _mm256_set_epi64x(index_array[15], index_array[14],\n \t\tindex_array[11], index_array[10]);\n \n-\ttr_lo[1] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0x88);\n-\ttr_hi[1] = (ymm_t)_mm256_shuffle_ps((__m256)t0, (__m256)t1, 0xdd);\n+\tACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[1], tr_hi[1]);\n \n \t /* Check for any matches. */\n \tacl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0],\ndiff --git a/lib/librte_acl/acl_run_sse.h b/lib/librte_acl/acl_run_sse.h\nindex 4a174e9..ad40a67 100644\n--- a/lib/librte_acl/acl_run_sse.h\n+++ b/lib/librte_acl/acl_run_sse.h\n@@ -67,6 +67,12 @@ static const rte_xmm_t xmm_index_mask = {\n \t},\n };\n \n+static const rte_xmm_t xmm_range_base = {\n+\t.u32 = {\n+\t\t0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,\n+\t},\n+};\n+\n /*\n  * Resolve priority for multiple results (sse version).\n  * This consists comparing the priority of the current traversal with the\n@@ -90,25 +96,28 @@ resolve_priority_sse(uint64_t transition, int n, const struct rte_acl_ctx *ctx,\n \t\t\t(xmm_t *)(&parms[n].cmplt->priority[x]);\n \n \t\t/* get results and priorities for completed trie */\n-\t\tresults = MM_LOADU((const xmm_t *)&p[transition].results[x]);\n-\t\tpriority = MM_LOADU((const xmm_t *)&p[transition].priority[x]);\n+\t\tresults = _mm_loadu_si128(\n+\t\t\t(const xmm_t *)&p[transition].results[x]);\n+\t\tpriority = _mm_loadu_si128(\n+\t\t\t(const xmm_t *)&p[transition].priority[x]);\n \n \t\t/* if this is not the first completed trie */\n \t\tif (parms[n].cmplt->count != ctx->num_tries) {\n \n \t\t\t/* get running best results and their priorities */\n-\t\t\tresults1 = MM_LOADU(saved_results);\n-\t\t\tpriority1 = MM_LOADU(saved_priority);\n+\t\t\tresults1 = _mm_loadu_si128(saved_results);\n+\t\t\tpriority1 = _mm_loadu_si128(saved_priority);\n \n \t\t\t/* select results that are highest priority */\n-\t\t\tselector = MM_CMPGT32(priority1, priority);\n-\t\t\tresults = MM_BLENDV8(results, results1, selector);\n-\t\t\tpriority = MM_BLENDV8(priority, priority1, selector);\n+\t\t\tselector = _mm_cmpgt_epi32(priority1, priority);\n+\t\t\tresults = _mm_blendv_epi8(results, results1, selector);\n+\t\t\tpriority = _mm_blendv_epi8(priority, priority1,\n+\t\t\t\tselector);\n \t\t}\n \n \t\t/* save running best results and their priorities */\n-\t\tMM_STOREU(saved_results, results);\n-\t\tMM_STOREU(saved_priority, priority);\n+\t\t_mm_storeu_si128(saved_results, results);\n+\t\t_mm_storeu_si128(saved_priority, priority);\n \t}\n }\n \n@@ -122,11 +131,11 @@ acl_process_matches(xmm_t *indices, int slot, const struct rte_acl_ctx *ctx,\n \tuint64_t transition1, transition2;\n \n \t/* extract transition from low 64 bits. */\n-\ttransition1 = MM_CVT64(*indices);\n+\ttransition1 = _mm_cvtsi128_si64(*indices);\n \n \t/* extract transition from high 64 bits. */\n-\t*indices = MM_SHUFFLE32(*indices, SHUFFLE32_SWAP64);\n-\ttransition2 = MM_CVT64(*indices);\n+\t*indices = _mm_shuffle_epi32(*indices, SHUFFLE32_SWAP64);\n+\ttransition2 = _mm_cvtsi128_si64(*indices);\n \n \ttransition1 = acl_match_check(transition1, slot, ctx,\n \t\tparms, flows, resolve_priority_sse);\n@@ -134,7 +143,7 @@ acl_process_matches(xmm_t *indices, int slot, const struct rte_acl_ctx *ctx,\n \t\tparms, flows, resolve_priority_sse);\n \n \t/* update indices with new transitions. */\n-\t*indices = MM_SET64(transition2, transition1);\n+\t*indices = _mm_set_epi64x(transition2, transition1);\n }\n \n /*\n@@ -148,98 +157,24 @@ acl_match_check_x4(int slot, const struct rte_acl_ctx *ctx, struct parms *parms,\n \txmm_t temp;\n \n \t/* put low 32 bits of each transition into one register */\n-\ttemp = (xmm_t)MM_SHUFFLEPS((__m128)*indices1, (__m128)*indices2,\n+\ttemp = (xmm_t)_mm_shuffle_ps((__m128)*indices1, (__m128)*indices2,\n \t\t0x88);\n \t/* test for match node */\n-\ttemp = MM_AND(match_mask, temp);\n+\ttemp = _mm_and_si128(match_mask, temp);\n \n-\twhile (!MM_TESTZ(temp, temp)) {\n+\twhile (!_mm_testz_si128(temp, temp)) {\n \t\tacl_process_matches(indices1, slot, ctx, parms, flows);\n \t\tacl_process_matches(indices2, slot + 2, ctx, parms, flows);\n \n-\t\ttemp = (xmm_t)MM_SHUFFLEPS((__m128)*indices1,\n+\t\ttemp = (xmm_t)_mm_shuffle_ps((__m128)*indices1,\n \t\t\t\t\t(__m128)*indices2,\n \t\t\t\t\t0x88);\n-\t\ttemp = MM_AND(match_mask, temp);\n+\t\ttemp = _mm_and_si128(match_mask, temp);\n \t}\n }\n \n /*\n- * Calculate the address of the next transition for\n- * all types of nodes. Note that only DFA nodes and range\n- * nodes actually transition to another node. Match\n- * nodes don't move.\n- */\n-static inline __attribute__((always_inline)) xmm_t\n-calc_addr_sse(xmm_t index_mask, xmm_t next_input, xmm_t shuffle_input,\n-\txmm_t ones_16, xmm_t tr_lo, xmm_t tr_hi)\n-{\n-\txmm_t addr, node_types;\n-\txmm_t dfa_msk, dfa_ofs, quad_ofs;\n-\txmm_t in, r, t;\n-\n-\tconst xmm_t range_base = _mm_set_epi32(0xffffff0c, 0xffffff08,\n-\t\t0xffffff04, 0xffffff00);\n-\n-\t/*\n-\t * Note that no transition is done for a match\n-\t * node and therefore a stream freezes when\n-\t * it reaches a match.\n-\t */\n-\n-\tt = MM_XOR(index_mask, index_mask);\n-\n-\t/* shuffle input byte to all 4 positions of 32 bit value */\n-\tin = MM_SHUFFLE8(next_input, shuffle_input);\n-\n-\t/* Calc node type and node addr */\n-\tnode_types = MM_ANDNOT(index_mask, tr_lo);\n-\taddr = MM_AND(index_mask, tr_lo);\n-\n-\t/*\n-\t * Calc addr for DFAs - addr = dfa_index + input_byte\n-\t */\n-\n-\t/* mask for DFA type (0) nodes */\n-\tdfa_msk = MM_CMPEQ32(node_types, t);\n-\n-\tr = _mm_srli_epi32(in, 30);\n-\tr = _mm_add_epi8(r, range_base);\n-\n-\tt = _mm_srli_epi32(in, 24);\n-\tr = _mm_shuffle_epi8(tr_hi, r);\n-\n-\tdfa_ofs = _mm_sub_epi32(t, r);\n-\n-\t/*\n-\t * Calculate number of range boundaries that are less than the\n-\t * input value. Range boundaries for each node are in signed 8 bit,\n-\t * ordered from -128 to 127 in the indices2 register.\n-\t * This is effectively a popcnt of bytes that are greater than the\n-\t * input byte.\n-\t */\n-\n-\t/* check ranges */\n-\tt = MM_CMPGT8(in, tr_hi);\n-\n-\t/* convert -1 to 1 (bytes greater than input byte */\n-\tt = MM_SIGN8(t, t);\n-\n-\t/* horizontal add pairs of bytes into words */\n-\tt = MM_MADD8(t, t);\n-\n-\t/* horizontal add pairs of words into dwords */\n-\tquad_ofs = MM_MADD16(t, ones_16);\n-\n-\t/* blend DFA and QUAD/SINGLE. */\n-\tt = _mm_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);\n-\n-\t/* add index into node position */\n-\treturn MM_ADD32(addr, t);\n-}\n-\n-/*\n- * Process 4 transitions (in 2 SIMD registers) in parallel\n+ * Process 4 transitions (in 2 XMM registers) in parallel\n  */\n static inline __attribute__((always_inline)) xmm_t\n transition4(xmm_t next_input, const uint64_t *trans,\n@@ -249,39 +184,36 @@ transition4(xmm_t next_input, const uint64_t *trans,\n \tuint64_t trans0, trans2;\n \n \t/* Shuffle low 32 into tr_lo and high 32 into tr_hi */\n-\ttr_lo = (xmm_t)_mm_shuffle_ps((__m128)*indices1, (__m128)*indices2,\n-\t\t0x88);\n-\ttr_hi = (xmm_t)_mm_shuffle_ps((__m128)*indices1, (__m128)*indices2,\n-\t\t0xdd);\n+\tACL_TR_HILO(mm, __m128, *indices1, *indices2, tr_lo, tr_hi);\n \n \t /* Calculate the address (array index) for all 4 transitions. */\n-\n-\taddr = calc_addr_sse(xmm_index_mask.x, next_input, xmm_shuffle_input.x,\n-\t\txmm_ones_16.x, tr_lo, tr_hi);\n+\tACL_TR_CALC_ADDR(mm, 128, addr, xmm_index_mask.x, next_input,\n+\t\txmm_shuffle_input.x, xmm_ones_16.x, xmm_range_base.x,\n+\t\ttr_lo, tr_hi);\n \n \t /* Gather 64 bit transitions and pack back into 2 registers. */\n \n-\ttrans0 = trans[MM_CVT32(addr)];\n+\ttrans0 = trans[_mm_cvtsi128_si32(addr)];\n \n \t/* get slot 2 */\n \n \t/* {x0, x1, x2, x3} -> {x2, x1, x2, x3} */\n-\taddr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT2);\n-\ttrans2 = trans[MM_CVT32(addr)];\n+\taddr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT2);\n+\ttrans2 = trans[_mm_cvtsi128_si32(addr)];\n \n \t/* get slot 1 */\n \n \t/* {x2, x1, x2, x3} -> {x1, x1, x2, x3} */\n-\taddr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT1);\n-\t*indices1 = MM_SET64(trans[MM_CVT32(addr)], trans0);\n+\taddr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT1);\n+\t*indices1 = _mm_set_epi64x(trans[_mm_cvtsi128_si32(addr)], trans0);\n \n \t/* get slot 3 */\n \n \t/* {x1, x1, x2, x3} -> {x3, x1, x2, x3} */\n-\taddr = MM_SHUFFLE32(addr, SHUFFLE32_SLOT3);\n-\t*indices2 = MM_SET64(trans[MM_CVT32(addr)], trans2);\n+\taddr = _mm_shuffle_epi32(addr, SHUFFLE32_SLOT3);\n+\t*indices2 = _mm_set_epi64x(trans[_mm_cvtsi128_si32(addr)], trans2);\n \n-\treturn MM_SRL32(next_input, CHAR_BIT);\n+\treturn _mm_srli_epi32(next_input, CHAR_BIT);\n }\n \n /*\n@@ -314,11 +246,11 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,\n \t * indices4 contains index_array[6,7]\n \t */\n \n-\tindices1 = MM_LOADU((xmm_t *) &index_array[0]);\n-\tindices2 = MM_LOADU((xmm_t *) &index_array[2]);\n+\tindices1 = _mm_loadu_si128((xmm_t *) &index_array[0]);\n+\tindices2 = _mm_loadu_si128((xmm_t *) &index_array[2]);\n \n-\tindices3 = MM_LOADU((xmm_t *) &index_array[4]);\n-\tindices4 = MM_LOADU((xmm_t *) &index_array[6]);\n+\tindices3 = _mm_loadu_si128((xmm_t *) &index_array[4]);\n+\tindices4 = _mm_loadu_si128((xmm_t *) &index_array[6]);\n \n \t /* Check for any matches. */\n \tacl_match_check_x4(0, ctx, parms, &flows,\n@@ -332,14 +264,14 @@ search_sse_8(const struct rte_acl_ctx *ctx, const uint8_t **data,\n \t\tinput0 = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0));\n \t\tinput1 = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 4));\n \n-\t\tinput0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 1), 1);\n-\t\tinput1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 5), 1);\n+\t\tinput0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 1), 1);\n+\t\tinput1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 5), 1);\n \n-\t\tinput0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 2), 2);\n-\t\tinput1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 6), 2);\n+\t\tinput0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 2), 2);\n+\t\tinput1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 6), 2);\n \n-\t\tinput0 = MM_INSERT32(input0, GET_NEXT_4BYTES(parms, 3), 3);\n-\t\tinput1 = MM_INSERT32(input1, GET_NEXT_4BYTES(parms, 7), 3);\n+\t\tinput0 = _mm_insert_epi32(input0, GET_NEXT_4BYTES(parms, 3), 3);\n+\t\tinput1 = _mm_insert_epi32(input1, GET_NEXT_4BYTES(parms, 7), 3);\n \n \t\t /* Process the 4 bytes of input on each stream. */\n \n@@ -395,8 +327,8 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,\n \t\tindex_array[n] = acl_start_next_trie(&flows, parms, n, ctx);\n \t}\n \n-\tindices1 = MM_LOADU((xmm_t *) &index_array[0]);\n-\tindices2 = MM_LOADU((xmm_t *) &index_array[2]);\n+\tindices1 = _mm_loadu_si128((xmm_t *) &index_array[0]);\n+\tindices2 = _mm_loadu_si128((xmm_t *) &index_array[2]);\n \n \t/* Check for any matches. */\n \tacl_match_check_x4(0, ctx, parms, &flows,\n@@ -406,9 +338,9 @@ search_sse_4(const struct rte_acl_ctx *ctx, const uint8_t **data,\n \n \t\t/* Gather 4 bytes of input data for each stream. */\n \t\tinput = _mm_cvtsi32_si128(GET_NEXT_4BYTES(parms, 0));\n-\t\tinput = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 1), 1);\n-\t\tinput = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 2), 2);\n-\t\tinput = MM_INSERT32(input, GET_NEXT_4BYTES(parms, 3), 3);\n+\t\tinput = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 1), 1);\n+\t\tinput = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 2), 2);\n+\t\tinput = _mm_insert_epi32(input, GET_NEXT_4BYTES(parms, 3), 3);\n \n \t\t/* Process the 4 bytes of input on each stream. */\n \t\tinput = transition4(input, flows.trans, &indices1, &indices2);\ndiff --git a/lib/librte_acl/acl_vect.h b/lib/librte_acl/acl_vect.h\nindex d813600..6cc1999 100644\n--- a/lib/librte_acl/acl_vect.h\n+++ b/lib/librte_acl/acl_vect.h\n@@ -44,86 +44,70 @@\n extern \"C\" {\n #endif\n \n-#define\tMM_ADD16(a, b)\t\t_mm_add_epi16(a, b)\n-#define\tMM_ADD32(a, b)\t\t_mm_add_epi32(a, b)\n-#define\tMM_ALIGNR8(a, b, c)\t_mm_alignr_epi8(a, b, c)\n-#define\tMM_AND(a, b)\t\t_mm_and_si128(a, b)\n-#define MM_ANDNOT(a, b)\t\t_mm_andnot_si128(a, b)\n-#define MM_BLENDV8(a, b, c)\t_mm_blendv_epi8(a, b, c)\n-#define MM_CMPEQ16(a, b)\t_mm_cmpeq_epi16(a, b)\n-#define MM_CMPEQ32(a, b)\t_mm_cmpeq_epi32(a, b)\n-#define\tMM_CMPEQ8(a, b)\t\t_mm_cmpeq_epi8(a, b)\n-#define MM_CMPGT32(a, b)\t_mm_cmpgt_epi32(a, b)\n-#define MM_CMPGT8(a, b)\t\t_mm_cmpgt_epi8(a, b)\n-#define MM_CVT(a)\t\t_mm_cvtsi32_si128(a)\n-#define\tMM_CVT32(a)\t\t_mm_cvtsi128_si32(a)\n-#define MM_CVTU32(a)\t\t_mm_cvtsi32_si128(a)\n-#define\tMM_INSERT16(a, c, b)\t_mm_insert_epi16(a, c, b)\n-#define\tMM_INSERT32(a, c, b)\t_mm_insert_epi32(a, c, b)\n-#define\tMM_LOAD(a)\t\t_mm_load_si128(a)\n-#define\tMM_LOADH_PI(a, b)\t_mm_loadh_pi(a, b)\n-#define\tMM_LOADU(a)\t\t_mm_loadu_si128(a)\n-#define\tMM_MADD16(a, b)\t\t_mm_madd_epi16(a, b)\n-#define\tMM_MADD8(a, b)\t\t_mm_maddubs_epi16(a, b)\n-#define\tMM_MOVEMASK8(a)\t\t_mm_movemask_epi8(a)\n-#define MM_OR(a, b)\t\t_mm_or_si128(a, b)\n-#define\tMM_SET1_16(a)\t\t_mm_set1_epi16(a)\n-#define\tMM_SET1_32(a)\t\t_mm_set1_epi32(a)\n-#define\tMM_SET1_64(a)\t\t_mm_set1_epi64(a)\n-#define\tMM_SET1_8(a)\t\t_mm_set1_epi8(a)\n-#define\tMM_SET32(a, b, c, d)\t_mm_set_epi32(a, b, c, d)\n-#define\tMM_SHUFFLE32(a, b)\t_mm_shuffle_epi32(a, b)\n-#define\tMM_SHUFFLE8(a, b)\t_mm_shuffle_epi8(a, b)\n-#define\tMM_SHUFFLEPS(a, b, c)\t_mm_shuffle_ps(a, b, c)\n-#define\tMM_SIGN8(a, b)\t\t_mm_sign_epi8(a, b)\n-#define\tMM_SLL64(a, b)\t\t_mm_sll_epi64(a, b)\n-#define\tMM_SRL128(a, b)\t\t_mm_srli_si128(a, b)\n-#define MM_SRL16(a, b)\t\t_mm_srli_epi16(a, b)\n-#define\tMM_SRL32(a, b)\t\t_mm_srli_epi32(a, b)\n-#define\tMM_STORE(a, b)\t\t_mm_store_si128(a, b)\n-#define\tMM_STOREU(a, b)\t\t_mm_storeu_si128(a, b)\n-#define\tMM_TESTZ(a, b)\t\t_mm_testz_si128(a, b)\n-#define\tMM_XOR(a, b)\t\t_mm_xor_si128(a, b)\n-\n-#define\tMM_SET16(a, b, c, d, e, f, g, h)\t\\\n-\t_mm_set_epi16(a, b, c, d, e, f, g, h)\n-\n-#define\tMM_SET8(c0, c1, c2, c3, c4, c5, c6, c7,\t\\\n-\t\tc8, c9, cA, cB, cC, cD, cE, cF)\t\\\n-\t_mm_set_epi8(c0, c1, c2, c3, c4, c5, c6, c7,\t\\\n-\t\tc8, c9, cA, cB, cC, cD, cE, cF)\n-\n-#ifdef RTE_ARCH_X86_64\n-\n-#define\tMM_CVT64(a)\t\t_mm_cvtsi128_si64(a)\n-\n-#else\n-\n-#define\tMM_CVT64(a)\t({ \\\n-\trte_xmm_t m;       \\\n-\tm.m = (a);         \\\n-\t(m.u64[0]);        \\\n-})\n-\n-#endif /*RTE_ARCH_X86_64 */\n \n /*\n- * Prior to version 12.1 icc doesn't support _mm_set_epi64x.\n+ * Takes 2 SIMD registers containing N transitions eachi (tr0, tr1).\n+ * Shuffles it into different representation:\n+ * lo - contains low 32 bits of given N transitions.\n+ * hi - contains high 32 bits of given N transitions.\n  */\n-#if (defined(__ICC) && __ICC < 1210)\n+#define\tACL_TR_HILO(P, TC, tr0, tr1, lo, hi)                        do { \\\n+\tlo = (typeof(lo))_##P##_shuffle_ps((TC)(tr0), (TC)(tr1), 0x88);  \\\n+\thi = (typeof(hi))_##P##_shuffle_ps((TC)(tr0), (TC)(tr1), 0xdd);  \\\n+} while (0)\n \n-#define\tMM_SET64(a, b)\t({ \\\n-\trte_xmm_t m;       \\\n-\tm.u64[0] = b;      \\\n-\tm.u64[1] = a;      \\\n-\t(m.m);             \\\n-})\n \n-#else\n-\n-#define\tMM_SET64(a, b)\t\t_mm_set_epi64x(a, b)\n+/*\n+ * Calculate the address of the next transition for\n+ * all types of nodes. Note that only DFA nodes and range\n+ * nodes actually transition to another node. Match\n+ * nodes not supposed to be encountered here.\n+ * For quad range nodes:\n+ * Calculate number of range boundaries that are less than the\n+ * input value. Range boundaries for each node are in signed 8 bit,\n+ * ordered from -128 to 127.\n+ * This is effectively a popcnt of bytes that are greater than the\n+ * input byte.\n+ * Single nodes are processed in the same ways as quad range nodes.\n+*/\n+#define ACL_TR_CALC_ADDR(P, S,\t\t\t\t\t\\\n+\taddr, index_mask, next_input, shuffle_input,\t\t\\\n+\tones_16, range_base, tr_lo, tr_hi)               do {\t\\\n+\t\t\t\t\t\t\t\t\\\n+\ttypeof(addr) in, node_type, r, t;\t\t\t\\\n+\ttypeof(addr) dfa_msk, dfa_ofs, quad_ofs;\t\t\\\n+\t\t\t\t\t\t\t\t\\\n+\tt = _##P##_xor_si##S(index_mask, index_mask);\t\t\\\n+\tin = _##P##_shuffle_epi8(next_input, shuffle_input);\t\\\n+\t\t\t\t\t\t\t\t\\\n+\t/* Calc node type and node addr */\t\t\t\\\n+\tnode_type = _##P##_andnot_si##S(index_mask, tr_lo);\t\\\n+\taddr = _##P##_and_si##S(index_mask, tr_lo);\t\t\\\n+\t\t\t\t\t\t\t\t\\\n+\t/* mask for DFA type(0) nodes */\t\t\t\\\n+\tdfa_msk = _##P##_cmpeq_epi32(node_type, t);\t\t\\\n+\t\t\t\t\t\t\t\t\\\n+\t/* DFA calculations. */\t\t\t\t\t\\\n+\tr = _##P##_srli_epi32(in, 30);\t\t\t\t\\\n+\tr = _##P##_add_epi8(r, range_base);\t\t\t\\\n+\tt = _##P##_srli_epi32(in, 24);\t\t\t\t\\\n+\tr = _##P##_shuffle_epi8(tr_hi, r);\t\t\t\\\n+\t\t\t\t\t\t\t\t\\\n+\tdfa_ofs = _##P##_sub_epi32(t, r);\t\t\t\\\n+\t\t\t\t\t\t\t\t\\\n+\t/* QUAD/SINGLE caluclations. */\t\t\t\t\\\n+\tt = _##P##_cmpgt_epi8(in, tr_hi);\t\t\t\\\n+\tt = _##P##_sign_epi8(t, t);\t\t\t\t\\\n+\tt = _##P##_maddubs_epi16(t, t);\t\t\t\t\\\n+\tquad_ofs = _##P##_madd_epi16(t, ones_16);\t\t\\\n+\t\t\t\t\t\t\t\t\\\n+\t/* blend DFA and QUAD/SINGLE. */\t\t\t\\\n+\tt = _##P##_blendv_epi8(quad_ofs, dfa_ofs, dfa_msk);\t\\\n+\t\t\t\t\t\t\t\t\\\n+\t/* calculate address for next transitions. */\t\t\\\n+\taddr = _##P##_add_epi32(addr, t);\t\t\t\\\n+} while (0)\n \n-#endif /* (defined(__ICC) && __ICC < 1210) */\n \n #ifdef __cplusplus\n }\ndiff --git a/lib/librte_eal/common/include/rte_common_vect.h b/lib/librte_eal/common/include/rte_common_vect.h\nindex 617470b..54ec70f 100644\n--- a/lib/librte_eal/common/include/rte_common_vect.h\n+++ b/lib/librte_eal/common/include/rte_common_vect.h\n@@ -109,6 +109,18 @@ typedef union rte_ymm {\n })\n #endif\n \n+/*\n+ * Prior to version 12.1 icc doesn't support _mm_set_epi64x.\n+ */\n+#if (defined(__ICC) && __ICC < 1210)\n+#define _mm_set_epi64x(a, b)  ({ \\\n+\trte_xmm_t m;             \\\n+\tm.u64[0] = b;            \\\n+\tm.u64[1] = a;            \\\n+\t(m.x);                   \\\n+})\n+#endif /* (defined(__ICC) && __ICC < 1210) */\n+\n #ifdef __cplusplus\n }\n #endif\n",
    "prefixes": [
        "dpdk-dev",
        "v2",
        "15/17"
    ]
}