get:
Show a patch.

patch:
Update a patch.

put:
Update a patch.

GET /api/patches/94885/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 94885,
    "url": "https://patches.dpdk.org/api/patches/94885/?format=api",
    "web_url": "https://patches.dpdk.org/project/dpdk/patch/1624866784-2458-3-git-send-email-wenzhuo.lu@intel.com/",
    "project": {
        "id": 1,
        "url": "https://patches.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<1624866784-2458-3-git-send-email-wenzhuo.lu@intel.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/1624866784-2458-3-git-send-email-wenzhuo.lu@intel.com",
    "date": "2021-06-28T07:53:04",
    "name": "[v2,2/2] net/ice: add Rx AVX2 offload path",
    "commit_ref": null,
    "pull_url": null,
    "state": "superseded",
    "archived": true,
    "hash": "cc7a2135afb0ad101a41280d7d2acc4e13ef156b",
    "submitter": {
        "id": 258,
        "url": "https://patches.dpdk.org/api/people/258/?format=api",
        "name": "Wenzhuo Lu",
        "email": "wenzhuo.lu@intel.com"
    },
    "delegate": {
        "id": 1540,
        "url": "https://patches.dpdk.org/api/users/1540/?format=api",
        "username": "qzhan15",
        "first_name": "Qi",
        "last_name": "Zhang",
        "email": "qi.z.zhang@intel.com"
    },
    "mbox": "https://patches.dpdk.org/project/dpdk/patch/1624866784-2458-3-git-send-email-wenzhuo.lu@intel.com/mbox/",
    "series": [
        {
            "id": 17497,
            "url": "https://patches.dpdk.org/api/series/17497/?format=api",
            "web_url": "https://patches.dpdk.org/project/dpdk/list/?series=17497",
            "date": "2021-06-28T07:53:02",
            "name": "add Rx/Tx offload paths for ICE AVX2",
            "version": 2,
            "mbox": "https://patches.dpdk.org/series/17497/mbox/"
        }
    ],
    "comments": "https://patches.dpdk.org/api/patches/94885/comments/",
    "check": "warning",
    "checks": "https://patches.dpdk.org/api/patches/94885/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@inbox.dpdk.org",
        "Delivered-To": "patchwork@inbox.dpdk.org",
        "Received": [
            "from mails.dpdk.org (mails.dpdk.org [217.70.189.124])\n\tby inbox.dpdk.org (Postfix) with ESMTP id C30B2A0C3F;\n\tMon, 28 Jun 2021 09:53:27 +0200 (CEST)",
            "from [217.70.189.124] (localhost [127.0.0.1])\n\tby mails.dpdk.org (Postfix) with ESMTP id 2F478410FB;\n\tMon, 28 Jun 2021 09:53:21 +0200 (CEST)",
            "from mga14.intel.com (mga14.intel.com [192.55.52.115])\n by mails.dpdk.org (Postfix) with ESMTP id 1E9E3410F3\n for <dev@dpdk.org>; Mon, 28 Jun 2021 09:53:16 +0200 (CEST)",
            "from fmsmga008.fm.intel.com ([10.253.24.58])\n by fmsmga103.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;\n 28 Jun 2021 00:53:16 -0700",
            "from dpdk-wenzhuo-haswell.sh.intel.com ([10.67.110.186])\n by fmsmga008.fm.intel.com with ESMTP; 28 Jun 2021 00:53:15 -0700"
        ],
        "X-IronPort-AV": [
            "E=McAfee;i=\"6200,9189,10028\"; a=\"207723877\"",
            "E=Sophos;i=\"5.83,305,1616482800\"; d=\"scan'208\";a=\"207723877\"",
            "E=Sophos;i=\"5.83,305,1616482800\"; d=\"scan'208\";a=\"456223308\""
        ],
        "X-ExtLoop1": "1",
        "From": "Wenzhuo Lu <wenzhuo.lu@intel.com>",
        "To": "dev@dpdk.org",
        "Cc": "Wenzhuo Lu <wenzhuo.lu@intel.com>",
        "Date": "Mon, 28 Jun 2021 15:53:04 +0800",
        "Message-Id": "<1624866784-2458-3-git-send-email-wenzhuo.lu@intel.com>",
        "X-Mailer": "git-send-email 1.9.3",
        "In-Reply-To": "<1624866784-2458-1-git-send-email-wenzhuo.lu@intel.com>",
        "References": "<1622600462-39088-1-git-send-email-wenzhuo.lu@intel.com>\n <1624866784-2458-1-git-send-email-wenzhuo.lu@intel.com>",
        "Subject": "[dpdk-dev] [PATCH v2 2/2] net/ice: add Rx AVX2 offload path",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.29",
        "Precedence": "list",
        "List-Id": "DPDK patches and discussions <dev.dpdk.org>",
        "List-Unsubscribe": "<https://mails.dpdk.org/options/dev>,\n <mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://mails.dpdk.org/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<https://mails.dpdk.org/listinfo/dev>,\n <mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org",
        "Sender": "\"dev\" <dev-bounces@dpdk.org>"
    },
    "content": "Add a specific path for RX AVX2.\nIn this path, support the HW offload features, like,\nchecksum, VLAN stripping, RSS hash.\nThis path is chosen automatically according to the\nconfiguration.\n\n'inline' is used, then the duplicate code is generated\nby the compiler.\n\nSigned-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>\n---\n doc/guides/rel_notes/release_21_08.rst |   6 +\n drivers/net/ice/ice_rxtx.c             |  50 ++++--\n drivers/net/ice/ice_rxtx.h             |   5 +\n drivers/net/ice/ice_rxtx_vec_avx2.c    | 296 +++++++++++++++++++--------------\n 4 files changed, 217 insertions(+), 140 deletions(-)",
    "diff": "diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst\nindex a6ecfdf..203b772 100644\n--- a/doc/guides/rel_notes/release_21_08.rst\n+++ b/doc/guides/rel_notes/release_21_08.rst\n@@ -55,6 +55,12 @@ New Features\n      Also, make sure to start the actual text at the margin.\n      =======================================================\n \n+* **Updated Intel ice driver.**\n+\n+  * In AVX2 code, added the new RX and TX paths to use the HW offload\n+    features. When the HW offload features are configured to be used, the\n+    offload paths are chosen automatically. In parallel the support for HW\n+    offload features was removed from the legacy AVX2 paths.\n \n Removed Items\n -------------\ndiff --git a/drivers/net/ice/ice_rxtx.c b/drivers/net/ice/ice_rxtx.c\nindex 5d7ca60..27fd248 100644\n--- a/drivers/net/ice/ice_rxtx.c\n+++ b/drivers/net/ice/ice_rxtx.c\n@@ -1999,7 +1999,9 @@\n \t    dev->rx_pkt_burst == ice_recv_scattered_pkts_vec_avx512_offload ||\n #endif\n \t    dev->rx_pkt_burst == ice_recv_pkts_vec_avx2 ||\n-\t    dev->rx_pkt_burst == ice_recv_scattered_pkts_vec_avx2)\n+\t    dev->rx_pkt_burst == ice_recv_pkts_vec_avx2_offload ||\n+\t    dev->rx_pkt_burst == ice_recv_scattered_pkts_vec_avx2 ||\n+\t    dev->rx_pkt_burst == ice_recv_scattered_pkts_vec_avx2_offload)\n \t\treturn ptypes;\n #endif\n \n@@ -3058,7 +3060,7 @@\n #ifdef RTE_ARCH_X86\n \tstruct ice_rx_queue *rxq;\n \tint i;\n-\tint rx_check_ret;\n+\tint rx_check_ret = -1;\n \tbool use_avx512 = false;\n \tbool use_avx2 = false;\n \n@@ -3113,14 +3115,25 @@\n \t\t\t\t\t\tice_recv_scattered_pkts_vec_avx512;\n \t\t\t\t}\n #endif\n+\t\t\t} else if (use_avx2) {\n+\t\t\t\tif (rx_check_ret == ICE_VECTOR_OFFLOAD_PATH) {\n+\t\t\t\t\tPMD_DRV_LOG(NOTICE,\n+\t\t\t\t\t\t    \"Using AVX2 OFFLOAD Vector Scattered Rx (port %d).\",\n+\t\t\t\t\t\t    dev->data->port_id);\n+\t\t\t\t\tdev->rx_pkt_burst =\n+\t\t\t\t\t\tice_recv_scattered_pkts_vec_avx2_offload;\n+\t\t\t\t} else {\n+\t\t\t\t\tPMD_DRV_LOG(NOTICE,\n+\t\t\t\t\t\t    \"Using AVX2 Vector Scattered Rx (port %d).\",\n+\t\t\t\t\t\t    dev->data->port_id);\n+\t\t\t\t\tdev->rx_pkt_burst =\n+\t\t\t\t\t\tice_recv_scattered_pkts_vec_avx2;\n+\t\t\t\t}\n \t\t\t} else {\n \t\t\t\tPMD_DRV_LOG(DEBUG,\n-\t\t\t\t\t\"Using %sVector Scattered Rx (port %d).\",\n-\t\t\t\t\tuse_avx2 ? \"avx2 \" : \"\",\n+\t\t\t\t\t\"Using Vector Scattered Rx (port %d).\",\n \t\t\t\t\tdev->data->port_id);\n-\t\t\t\tdev->rx_pkt_burst = use_avx2 ?\n-\t\t\t\t\tice_recv_scattered_pkts_vec_avx2 :\n-\t\t\t\t\tice_recv_scattered_pkts_vec;\n+\t\t\t\tdev->rx_pkt_burst = ice_recv_scattered_pkts_vec;\n \t\t\t}\n \t\t} else {\n \t\t\tif (use_avx512) {\n@@ -3139,14 +3152,25 @@\n \t\t\t\t\t\tice_recv_pkts_vec_avx512;\n \t\t\t\t}\n #endif\n+\t\t\t} else if (use_avx2) {\n+\t\t\t\tif (rx_check_ret == ICE_VECTOR_OFFLOAD_PATH) {\n+\t\t\t\t\tPMD_DRV_LOG(NOTICE,\n+\t\t\t\t\t\t    \"Using AVX2 OFFLOAD Vector Rx (port %d).\",\n+\t\t\t\t\t\t    dev->data->port_id);\n+\t\t\t\t\tdev->rx_pkt_burst =\n+\t\t\t\t\t\tice_recv_pkts_vec_avx2_offload;\n+\t\t\t\t} else {\n+\t\t\t\t\tPMD_DRV_LOG(NOTICE,\n+\t\t\t\t\t\t    \"Using AVX2 Vector Rx (port %d).\",\n+\t\t\t\t\t\t    dev->data->port_id);\n+\t\t\t\t\tdev->rx_pkt_burst =\n+\t\t\t\t\t\tice_recv_pkts_vec_avx2;\n+\t\t\t\t}\n \t\t\t} else {\n \t\t\t\tPMD_DRV_LOG(DEBUG,\n-\t\t\t\t\t\"Using %sVector Rx (port %d).\",\n-\t\t\t\t\tuse_avx2 ? \"avx2 \" : \"\",\n+\t\t\t\t\t\"Using Vector Rx (port %d).\",\n \t\t\t\t\tdev->data->port_id);\n-\t\t\t\tdev->rx_pkt_burst = use_avx2 ?\n-\t\t\t\t\tice_recv_pkts_vec_avx2 :\n-\t\t\t\t\tice_recv_pkts_vec;\n+\t\t\t\tdev->rx_pkt_burst = ice_recv_pkts_vec;\n \t\t\t}\n \t\t}\n \t\treturn;\n@@ -3191,7 +3215,9 @@\n \t{ ice_recv_pkts_vec_avx512_offload,   \"Offload Vector AVX512\" },\n #endif\n \t{ ice_recv_scattered_pkts_vec_avx2, \"Vector AVX2 Scattered\" },\n+\t{ ice_recv_scattered_pkts_vec_avx2_offload, \"Offload Vector AVX2 Scattered\" },\n \t{ ice_recv_pkts_vec_avx2,           \"Vector AVX2\" },\n+\t{ ice_recv_pkts_vec_avx2_offload,   \"Offload Vector AVX2\" },\n \t{ ice_recv_scattered_pkts_vec,      \"Vector SSE Scattered\" },\n \t{ ice_recv_pkts_vec,                \"Vector SSE\" },\n #endif\ndiff --git a/drivers/net/ice/ice_rxtx.h b/drivers/net/ice/ice_rxtx.h\nindex 595dc66..bd28a68 100644\n--- a/drivers/net/ice/ice_rxtx.h\n+++ b/drivers/net/ice/ice_rxtx.h\n@@ -250,9 +250,14 @@ uint16_t ice_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,\n \t\t\t   uint16_t nb_pkts);\n uint16_t ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,\n \t\t\t\tuint16_t nb_pkts);\n+uint16_t ice_recv_pkts_vec_avx2_offload(void *rx_queue, struct rte_mbuf **rx_pkts,\n+\t\t\t\t\tuint16_t nb_pkts);\n uint16_t ice_recv_scattered_pkts_vec_avx2(void *rx_queue,\n \t\t\t\t\t  struct rte_mbuf **rx_pkts,\n \t\t\t\t\t  uint16_t nb_pkts);\n+uint16_t ice_recv_scattered_pkts_vec_avx2_offload(void *rx_queue,\n+\t\t\t\t\t\t  struct rte_mbuf **rx_pkts,\n+\t\t\t\t\t\t  uint16_t nb_pkts);\n uint16_t ice_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,\n \t\t\t\tuint16_t nb_pkts);\n uint16_t ice_xmit_pkts_vec_avx2_offload(void *tx_queue, struct rte_mbuf **tx_pkts,\ndiff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c b/drivers/net/ice/ice_rxtx_vec_avx2.c\nindex b83c1ac..2841597 100644\n--- a/drivers/net/ice/ice_rxtx_vec_avx2.c\n+++ b/drivers/net/ice/ice_rxtx_vec_avx2.c\n@@ -16,7 +16,7 @@\n \treturn ice_rxq_rearm_common(rxq, false);\n }\n \n-static inline __m256i\n+static __rte_always_inline __m256i\n ice_flex_rxd_to_fdir_flags_vec_avx2(const __m256i fdir_id0_7)\n {\n #define FDID_MIS_MAGIC 0xFFFFFFFF\n@@ -35,9 +35,10 @@\n \treturn fdir_flags;\n }\n \n-static inline uint16_t\n+static __rte_always_inline uint16_t\n _ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,\n-\t\t\t    uint16_t nb_pkts, uint8_t *split_packet)\n+\t\t\t    uint16_t nb_pkts, uint8_t *split_packet,\n+\t\t\t    bool offload)\n {\n #define ICE_DESCS_PER_LOOP_AVX 8\n \n@@ -385,39 +386,43 @@\n \t\t */\n \t\t__m256i status0_7 = _mm256_unpacklo_epi64(status4_7,\n \t\t\t\t\t\t\t  status0_3);\n+\t\t__m256i mbuf_flags = _mm256_set1_epi32(0);\n \n-\t\t/* now do flag manipulation */\n+\t\tif (offload) {\n+\t\t\t/* now do flag manipulation */\n \n-\t\t/* get only flag/error bits we want */\n-\t\tconst __m256i flag_bits =\n-\t\t\t_mm256_and_si256(status0_7, flags_mask);\n-\t\t/**\n-\t\t * l3_l4_error flags, shuffle, then shift to correct adjustment\n-\t\t * of flags in flags_shuf, and finally mask out extra bits\n-\t\t */\n-\t\t__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,\n-\t\t\t\t_mm256_srli_epi32(flag_bits, 4));\n-\t\tl3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);\n-\n-\t\t__m256i l4_outer_mask = _mm256_set1_epi32(0x6);\n-\t\t__m256i l4_outer_flags =\n-\t\t\t\t_mm256_and_si256(l3_l4_flags, l4_outer_mask);\n-\t\tl4_outer_flags = _mm256_slli_epi32(l4_outer_flags, 20);\n-\n-\t\t__m256i l3_l4_mask = _mm256_set1_epi32(~0x6);\n-\t\tl3_l4_flags = _mm256_and_si256(l3_l4_flags, l3_l4_mask);\n-\t\tl3_l4_flags = _mm256_or_si256(l3_l4_flags, l4_outer_flags);\n-\t\tl3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);\n-\t\t/* set rss and vlan flags */\n-\t\tconst __m256i rss_vlan_flag_bits =\n-\t\t\t_mm256_srli_epi32(flag_bits, 12);\n-\t\tconst __m256i rss_vlan_flags =\n-\t\t\t_mm256_shuffle_epi8(rss_vlan_flags_shuf,\n-\t\t\t\t\t    rss_vlan_flag_bits);\n-\n-\t\t/* merge flags */\n-\t\t__m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,\n-\t\t\t\trss_vlan_flags);\n+\t\t\t/* get only flag/error bits we want */\n+\t\t\tconst __m256i flag_bits =\n+\t\t\t\t_mm256_and_si256(status0_7, flags_mask);\n+\t\t\t/**\n+\t\t\t * l3_l4_error flags, shuffle, then shift to correct adjustment\n+\t\t\t * of flags in flags_shuf, and finally mask out extra bits\n+\t\t\t */\n+\t\t\t__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,\n+\t\t\t\t\t_mm256_srli_epi32(flag_bits, 4));\n+\t\t\tl3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);\n+\n+\t\t\t__m256i l4_outer_mask = _mm256_set1_epi32(0x6);\n+\t\t\t__m256i l4_outer_flags =\n+\t\t\t\t\t_mm256_and_si256(l3_l4_flags, l4_outer_mask);\n+\t\t\tl4_outer_flags = _mm256_slli_epi32(l4_outer_flags, 20);\n+\n+\t\t\t__m256i l3_l4_mask = _mm256_set1_epi32(~0x6);\n+\n+\t\t\tl3_l4_flags = _mm256_and_si256(l3_l4_flags, l3_l4_mask);\n+\t\t\tl3_l4_flags = _mm256_or_si256(l3_l4_flags, l4_outer_flags);\n+\t\t\tl3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);\n+\t\t\t/* set rss and vlan flags */\n+\t\t\tconst __m256i rss_vlan_flag_bits =\n+\t\t\t\t_mm256_srli_epi32(flag_bits, 12);\n+\t\t\tconst __m256i rss_vlan_flags =\n+\t\t\t\t_mm256_shuffle_epi8(rss_vlan_flags_shuf,\n+\t\t\t\t\t\t    rss_vlan_flag_bits);\n+\n+\t\t\t/* merge flags */\n+\t\t\tmbuf_flags = _mm256_or_si256(l3_l4_flags,\n+\t\t\t\t\t\t     rss_vlan_flags);\n+\t\t}\n \n \t\tif (rxq->fdir_enabled) {\n \t\t\tconst __m256i fdir_id4_7 =\n@@ -461,95 +466,97 @@\n \t\t\t\t_mm256_extract_epi32(fdir_id0_7, 4);\n \t\t} /* if() on fdir_enabled */\n \n+\t\tif (offload) {\n #ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC\n-\t\t/**\n-\t\t * needs to load 2nd 16B of each desc for RSS hash parsing,\n-\t\t * will cause performance drop to get into this context.\n-\t\t */\n-\t\tif (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &\n-\t\t\t\tDEV_RX_OFFLOAD_RSS_HASH) {\n-\t\t\t/* load bottom half of every 32B desc */\n-\t\t\tconst __m128i raw_desc_bh7 =\n-\t\t\t\t_mm_load_si128\n-\t\t\t\t\t((void *)(&rxdp[7].wb.status_error1));\n-\t\t\trte_compiler_barrier();\n-\t\t\tconst __m128i raw_desc_bh6 =\n-\t\t\t\t_mm_load_si128\n-\t\t\t\t\t((void *)(&rxdp[6].wb.status_error1));\n-\t\t\trte_compiler_barrier();\n-\t\t\tconst __m128i raw_desc_bh5 =\n-\t\t\t\t_mm_load_si128\n-\t\t\t\t\t((void *)(&rxdp[5].wb.status_error1));\n-\t\t\trte_compiler_barrier();\n-\t\t\tconst __m128i raw_desc_bh4 =\n-\t\t\t\t_mm_load_si128\n-\t\t\t\t\t((void *)(&rxdp[4].wb.status_error1));\n-\t\t\trte_compiler_barrier();\n-\t\t\tconst __m128i raw_desc_bh3 =\n-\t\t\t\t_mm_load_si128\n-\t\t\t\t\t((void *)(&rxdp[3].wb.status_error1));\n-\t\t\trte_compiler_barrier();\n-\t\t\tconst __m128i raw_desc_bh2 =\n-\t\t\t\t_mm_load_si128\n-\t\t\t\t\t((void *)(&rxdp[2].wb.status_error1));\n-\t\t\trte_compiler_barrier();\n-\t\t\tconst __m128i raw_desc_bh1 =\n-\t\t\t\t_mm_load_si128\n-\t\t\t\t\t((void *)(&rxdp[1].wb.status_error1));\n-\t\t\trte_compiler_barrier();\n-\t\t\tconst __m128i raw_desc_bh0 =\n-\t\t\t\t_mm_load_si128\n-\t\t\t\t\t((void *)(&rxdp[0].wb.status_error1));\n-\n-\t\t\t__m256i raw_desc_bh6_7 =\n-\t\t\t\t_mm256_inserti128_si256\n-\t\t\t\t\t(_mm256_castsi128_si256(raw_desc_bh6),\n-\t\t\t\t\traw_desc_bh7, 1);\n-\t\t\t__m256i raw_desc_bh4_5 =\n-\t\t\t\t_mm256_inserti128_si256\n-\t\t\t\t\t(_mm256_castsi128_si256(raw_desc_bh4),\n-\t\t\t\t\traw_desc_bh5, 1);\n-\t\t\t__m256i raw_desc_bh2_3 =\n-\t\t\t\t_mm256_inserti128_si256\n-\t\t\t\t\t(_mm256_castsi128_si256(raw_desc_bh2),\n-\t\t\t\t\traw_desc_bh3, 1);\n-\t\t\t__m256i raw_desc_bh0_1 =\n-\t\t\t\t_mm256_inserti128_si256\n-\t\t\t\t\t(_mm256_castsi128_si256(raw_desc_bh0),\n-\t\t\t\t\traw_desc_bh1, 1);\n-\n \t\t\t/**\n-\t\t\t * to shift the 32b RSS hash value to the\n-\t\t\t * highest 32b of each 128b before mask\n+\t\t\t * needs to load 2nd 16B of each desc for RSS hash parsing,\n+\t\t\t * will cause performance drop to get into this context.\n \t\t\t */\n-\t\t\t__m256i rss_hash6_7 =\n-\t\t\t\t_mm256_slli_epi64(raw_desc_bh6_7, 32);\n-\t\t\t__m256i rss_hash4_5 =\n-\t\t\t\t_mm256_slli_epi64(raw_desc_bh4_5, 32);\n-\t\t\t__m256i rss_hash2_3 =\n-\t\t\t\t_mm256_slli_epi64(raw_desc_bh2_3, 32);\n-\t\t\t__m256i rss_hash0_1 =\n-\t\t\t\t_mm256_slli_epi64(raw_desc_bh0_1, 32);\n-\n-\t\t\t__m256i rss_hash_msk =\n-\t\t\t\t_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,\n-\t\t\t\t\t\t 0xFFFFFFFF, 0, 0, 0);\n-\n-\t\t\trss_hash6_7 = _mm256_and_si256\n-\t\t\t\t\t(rss_hash6_7, rss_hash_msk);\n-\t\t\trss_hash4_5 = _mm256_and_si256\n-\t\t\t\t\t(rss_hash4_5, rss_hash_msk);\n-\t\t\trss_hash2_3 = _mm256_and_si256\n-\t\t\t\t\t(rss_hash2_3, rss_hash_msk);\n-\t\t\trss_hash0_1 = _mm256_and_si256\n-\t\t\t\t\t(rss_hash0_1, rss_hash_msk);\n-\n-\t\t\tmb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);\n-\t\t\tmb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);\n-\t\t\tmb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);\n-\t\t\tmb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);\n-\t\t} /* if() on RSS hash parsing */\n+\t\t\tif (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &\n+\t\t\t\t\tDEV_RX_OFFLOAD_RSS_HASH) {\n+\t\t\t\t/* load bottom half of every 32B desc */\n+\t\t\t\tconst __m128i raw_desc_bh7 =\n+\t\t\t\t\t_mm_load_si128\n+\t\t\t\t\t\t((void *)(&rxdp[7].wb.status_error1));\n+\t\t\t\trte_compiler_barrier();\n+\t\t\t\tconst __m128i raw_desc_bh6 =\n+\t\t\t\t\t_mm_load_si128\n+\t\t\t\t\t\t((void *)(&rxdp[6].wb.status_error1));\n+\t\t\t\trte_compiler_barrier();\n+\t\t\t\tconst __m128i raw_desc_bh5 =\n+\t\t\t\t\t_mm_load_si128\n+\t\t\t\t\t\t((void *)(&rxdp[5].wb.status_error1));\n+\t\t\t\trte_compiler_barrier();\n+\t\t\t\tconst __m128i raw_desc_bh4 =\n+\t\t\t\t\t_mm_load_si128\n+\t\t\t\t\t\t((void *)(&rxdp[4].wb.status_error1));\n+\t\t\t\trte_compiler_barrier();\n+\t\t\t\tconst __m128i raw_desc_bh3 =\n+\t\t\t\t\t_mm_load_si128\n+\t\t\t\t\t\t((void *)(&rxdp[3].wb.status_error1));\n+\t\t\t\trte_compiler_barrier();\n+\t\t\t\tconst __m128i raw_desc_bh2 =\n+\t\t\t\t\t_mm_load_si128\n+\t\t\t\t\t\t((void *)(&rxdp[2].wb.status_error1));\n+\t\t\t\trte_compiler_barrier();\n+\t\t\t\tconst __m128i raw_desc_bh1 =\n+\t\t\t\t\t_mm_load_si128\n+\t\t\t\t\t\t((void *)(&rxdp[1].wb.status_error1));\n+\t\t\t\trte_compiler_barrier();\n+\t\t\t\tconst __m128i raw_desc_bh0 =\n+\t\t\t\t\t_mm_load_si128\n+\t\t\t\t\t\t((void *)(&rxdp[0].wb.status_error1));\n+\n+\t\t\t\t__m256i raw_desc_bh6_7 =\n+\t\t\t\t\t_mm256_inserti128_si256\n+\t\t\t\t\t\t(_mm256_castsi128_si256(raw_desc_bh6),\n+\t\t\t\t\t\traw_desc_bh7, 1);\n+\t\t\t\t__m256i raw_desc_bh4_5 =\n+\t\t\t\t\t_mm256_inserti128_si256\n+\t\t\t\t\t\t(_mm256_castsi128_si256(raw_desc_bh4),\n+\t\t\t\t\t\traw_desc_bh5, 1);\n+\t\t\t\t__m256i raw_desc_bh2_3 =\n+\t\t\t\t\t_mm256_inserti128_si256\n+\t\t\t\t\t\t(_mm256_castsi128_si256(raw_desc_bh2),\n+\t\t\t\t\t\traw_desc_bh3, 1);\n+\t\t\t\t__m256i raw_desc_bh0_1 =\n+\t\t\t\t\t_mm256_inserti128_si256\n+\t\t\t\t\t\t(_mm256_castsi128_si256(raw_desc_bh0),\n+\t\t\t\t\t\traw_desc_bh1, 1);\n+\n+\t\t\t\t/**\n+\t\t\t\t * to shift the 32b RSS hash value to the\n+\t\t\t\t * highest 32b of each 128b before mask\n+\t\t\t\t */\n+\t\t\t\t__m256i rss_hash6_7 =\n+\t\t\t\t\t_mm256_slli_epi64(raw_desc_bh6_7, 32);\n+\t\t\t\t__m256i rss_hash4_5 =\n+\t\t\t\t\t_mm256_slli_epi64(raw_desc_bh4_5, 32);\n+\t\t\t\t__m256i rss_hash2_3 =\n+\t\t\t\t\t_mm256_slli_epi64(raw_desc_bh2_3, 32);\n+\t\t\t\t__m256i rss_hash0_1 =\n+\t\t\t\t\t_mm256_slli_epi64(raw_desc_bh0_1, 32);\n+\n+\t\t\t\t__m256i rss_hash_msk =\n+\t\t\t\t\t_mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,\n+\t\t\t\t\t\t\t 0xFFFFFFFF, 0, 0, 0);\n+\n+\t\t\t\trss_hash6_7 = _mm256_and_si256\n+\t\t\t\t\t\t(rss_hash6_7, rss_hash_msk);\n+\t\t\t\trss_hash4_5 = _mm256_and_si256\n+\t\t\t\t\t\t(rss_hash4_5, rss_hash_msk);\n+\t\t\t\trss_hash2_3 = _mm256_and_si256\n+\t\t\t\t\t\t(rss_hash2_3, rss_hash_msk);\n+\t\t\t\trss_hash0_1 = _mm256_and_si256\n+\t\t\t\t\t\t(rss_hash0_1, rss_hash_msk);\n+\n+\t\t\t\tmb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);\n+\t\t\t\tmb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);\n+\t\t\t\tmb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);\n+\t\t\t\tmb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);\n+\t\t\t} /* if() on RSS hash parsing */\n #endif\n+\t\t}\n \n \t\t/**\n \t\t * At this point, we have the 8 sets of flags in the low 16-bits\n@@ -701,7 +708,16 @@\n ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,\n \t\t       uint16_t nb_pkts)\n {\n-\treturn _ice_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);\n+\treturn _ice_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts,\n+\t\t\t\t\t   nb_pkts, NULL, false);\n+}\n+\n+uint16_t\n+ice_recv_pkts_vec_avx2_offload(void *rx_queue, struct rte_mbuf **rx_pkts,\n+\t\t\t       uint16_t nb_pkts)\n+{\n+\treturn _ice_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts,\n+\t\t\t\t\t   nb_pkts, NULL, true);\n }\n \n /**\n@@ -709,16 +725,16 @@\n  * Notice:\n  * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet\n  */\n-static uint16_t\n+static __rte_always_inline uint16_t\n ice_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,\n-\t\t\t\t  uint16_t nb_pkts)\n+\t\t\t\t  uint16_t nb_pkts, bool offload)\n {\n \tstruct ice_rx_queue *rxq = rx_queue;\n \tuint8_t split_flags[ICE_VPMD_RX_BURST] = {0};\n \n \t/* get some new buffers */\n \tuint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx2(rxq, rx_pkts, nb_pkts,\n-\t\t\t\t\t\t       split_flags);\n+\t\t\t\t\t\t       split_flags, offload);\n \tif (nb_bufs == 0)\n \t\treturn 0;\n \n@@ -751,22 +767,46 @@\n  * Notice:\n  * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet\n  */\n-uint16_t\n-ice_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,\n-\t\t\t\t uint16_t nb_pkts)\n+static __rte_always_inline uint16_t\n+ice_recv_scattered_pkts_vec_avx2_common(void *rx_queue,\n+\t\t\t\t\tstruct rte_mbuf **rx_pkts,\n+\t\t\t\t\tuint16_t nb_pkts,\n+\t\t\t\t\tbool offload)\n {\n \tuint16_t retval = 0;\n \n \twhile (nb_pkts > ICE_VPMD_RX_BURST) {\n \t\tuint16_t burst = ice_recv_scattered_burst_vec_avx2(rx_queue,\n-\t\t\t\trx_pkts + retval, ICE_VPMD_RX_BURST);\n+\t\t\t\trx_pkts + retval, ICE_VPMD_RX_BURST, offload);\n \t\tretval += burst;\n \t\tnb_pkts -= burst;\n \t\tif (burst < ICE_VPMD_RX_BURST)\n \t\t\treturn retval;\n \t}\n \treturn retval + ice_recv_scattered_burst_vec_avx2(rx_queue,\n-\t\t\t\trx_pkts + retval, nb_pkts);\n+\t\t\t\trx_pkts + retval, nb_pkts, offload);\n+}\n+\n+uint16_t\n+ice_recv_scattered_pkts_vec_avx2(void *rx_queue,\n+\t\t\t\t struct rte_mbuf **rx_pkts,\n+\t\t\t\t uint16_t nb_pkts)\n+{\n+\treturn ice_recv_scattered_pkts_vec_avx2_common(rx_queue,\n+\t\t\t\t\t\t       rx_pkts,\n+\t\t\t\t\t\t       nb_pkts,\n+\t\t\t\t\t\t       false);\n+}\n+\n+uint16_t\n+ice_recv_scattered_pkts_vec_avx2_offload(void *rx_queue,\n+\t\t\t\t\t struct rte_mbuf **rx_pkts,\n+\t\t\t\t\t uint16_t nb_pkts)\n+{\n+\treturn ice_recv_scattered_pkts_vec_avx2_common(rx_queue,\n+\t\t\t\t\t\t       rx_pkts,\n+\t\t\t\t\t\t       nb_pkts,\n+\t\t\t\t\t\t       true);\n }\n \n static __rte_always_inline void\n",
    "prefixes": [
        "v2",
        "2/2"
    ]
}