get:
Show a patch.

patch:
Partially update a patch.

put:
Update a patch.
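
For orientation, a minimal Python sketch of driving this endpoint with the requests library; the read is anonymous, while updates need a Patchwork API token and maintainer rights on the project (the token value and the state change below are hypothetical):

import requests

BASE = "https://patches.dpdk.org/api"
TOKEN = "0123456789abcdef"  # hypothetical; tokens come from a Patchwork user profile

# get: show a patch (no authentication required)
patch = requests.get(f"{BASE}/patches/77118/").json()
print(patch["name"], patch["state"])

# patch: partially update a patch, e.g. its state (requires a token)
resp = requests.patch(
    f"{BASE}/patches/77118/",
    headers={"Authorization": f"Token {TOKEN}"},
    json={"state": "accepted"},  # hypothetical new state
)
resp.raise_for_status()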

GET /api/patches/77118/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 77118,
    "url": "http://patches.dpdk.org/api/patches/77118/?format=api",
    "web_url": "http://patches.dpdk.org/project/dpdk/patch/20200910065504.104217-3-leyi.rong@intel.com/",
    "project": {
        "id": 1,
        "url": "http://patches.dpdk.org/api/projects/1/?format=api",
        "name": "DPDK",
        "link_name": "dpdk",
        "list_id": "dev.dpdk.org",
        "list_email": "dev@dpdk.org",
        "web_url": "http://core.dpdk.org",
        "scm_url": "git://dpdk.org/dpdk",
        "webscm_url": "http://git.dpdk.org/dpdk",
        "list_archive_url": "https://inbox.dpdk.org/dev",
        "list_archive_url_format": "https://inbox.dpdk.org/dev/{}",
        "commit_url_format": ""
    },
    "msgid": "<20200910065504.104217-3-leyi.rong@intel.com>",
    "list_archive_url": "https://inbox.dpdk.org/dev/20200910065504.104217-3-leyi.rong@intel.com",
    "date": "2020-09-10T06:55:04",
    "name": "[v1,2/2] net/ice: optimize Tx path on AVX512 vPMD",
    "commit_ref": null,
    "pull_url": null,
    "state": "superseded",
    "archived": true,
    "hash": "7d79967c2228b866fbfb45acc3e5b101a00ab23d",
    "submitter": {
        "id": 1204,
        "url": "http://patches.dpdk.org/api/people/1204/?format=api",
        "name": "Leyi Rong",
        "email": "leyi.rong@intel.com"
    },
    "delegate": {
        "id": 1540,
        "url": "http://patches.dpdk.org/api/users/1540/?format=api",
        "username": "qzhan15",
        "first_name": "Qi",
        "last_name": "Zhang",
        "email": "qi.z.zhang@intel.com"
    },
    "mbox": "http://patches.dpdk.org/project/dpdk/patch/20200910065504.104217-3-leyi.rong@intel.com/mbox/",
    "series": [
        {
            "id": 12088,
            "url": "http://patches.dpdk.org/api/series/12088/?format=api",
            "web_url": "http://patches.dpdk.org/project/dpdk/list/?series=12088",
            "date": "2020-09-10T06:55:02",
            "name": "AVX512 vPMD on ice",
            "version": 1,
            "mbox": "http://patches.dpdk.org/series/12088/mbox/"
        }
    ],
    "comments": "http://patches.dpdk.org/api/patches/77118/comments/",
    "check": "fail",
    "checks": "http://patches.dpdk.org/api/patches/77118/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<dev-bounces@dpdk.org>",
        "X-Original-To": "patchwork@inbox.dpdk.org",
        "Delivered-To": "patchwork@inbox.dpdk.org",
        "Received": [
            "from dpdk.org (dpdk.org [92.243.14.124])\n\tby inbox.dpdk.org (Postfix) with ESMTP id 4E282A04B5;\n\tThu, 10 Sep 2020 09:12:35 +0200 (CEST)",
            "from [92.243.14.124] (localhost [127.0.0.1])\n\tby dpdk.org (Postfix) with ESMTP id 4A0581C0D4;\n\tThu, 10 Sep 2020 09:12:23 +0200 (CEST)",
            "from mga11.intel.com (mga11.intel.com [192.55.52.93])\n by dpdk.org (Postfix) with ESMTP id E2D311C0CC\n for <dev@dpdk.org>; Thu, 10 Sep 2020 09:12:19 +0200 (CEST)",
            "from fmsmga006.fm.intel.com ([10.253.24.20])\n by fmsmga102.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384;\n 10 Sep 2020 00:12:19 -0700",
            "from dpdk-lrong-srv-04.sh.intel.com ([10.67.119.221])\n by fmsmga006.fm.intel.com with ESMTP; 10 Sep 2020 00:12:17 -0700"
        ],
        "IronPort-SDR": [
            "\n DTnBzjngQrDIKCGVch2tRZvMPcJEbc8ZGYZszmYbzIegAx/KccKvlWW1qUuBcpRIVLgj5mQwK/\n Ey+qp2ayUDAQ==",
            "\n 9AD+/MGbDk5jPSDbi6dU/odLjU/aXWRpcFcdN8MSK+drUN4eGfL3E4h6MT980i0FtvssqObH3Q\n y6t9gP+x0GlA=="
        ],
        "X-IronPort-AV": [
            "E=McAfee;i=\"6000,8403,9739\"; a=\"155942598\"",
            "E=Sophos;i=\"5.76,412,1592895600\"; d=\"scan'208\";a=\"155942598\"",
            "E=Sophos;i=\"5.76,412,1592895600\"; d=\"scan'208\";a=\"505023568\""
        ],
        "X-Amp-Result": "SKIPPED(no attachment in message)",
        "X-Amp-File-Uploaded": "False",
        "X-ExtLoop1": "1",
        "From": "Leyi Rong <leyi.rong@intel.com>",
        "To": "bruce.richardson@intel.com,\n\twenzhuo.lu@intel.com,\n\tqi.z.zhang@intel.com",
        "Cc": "dev@dpdk.org,\n\tLeyi Rong <leyi.rong@intel.com>",
        "Date": "Thu, 10 Sep 2020 14:55:04 +0800",
        "Message-Id": "<20200910065504.104217-3-leyi.rong@intel.com>",
        "X-Mailer": "git-send-email 2.17.1",
        "In-Reply-To": "<20200910065504.104217-1-leyi.rong@intel.com>",
        "References": "<20200910065504.104217-1-leyi.rong@intel.com>",
        "Subject": "[dpdk-dev] [PATCH v1 2/2] net/ice: optimize Tx path on AVX512 vPMD",
        "X-BeenThere": "dev@dpdk.org",
        "X-Mailman-Version": "2.1.15",
        "Precedence": "list",
        "List-Id": "DPDK patches and discussions <dev.dpdk.org>",
        "List-Unsubscribe": "<https://mails.dpdk.org/options/dev>,\n <mailto:dev-request@dpdk.org?subject=unsubscribe>",
        "List-Archive": "<http://mails.dpdk.org/archives/dev/>",
        "List-Post": "<mailto:dev@dpdk.org>",
        "List-Help": "<mailto:dev-request@dpdk.org?subject=help>",
        "List-Subscribe": "<https://mails.dpdk.org/listinfo/dev>,\n <mailto:dev-request@dpdk.org?subject=subscribe>",
        "Errors-To": "dev-bounces@dpdk.org",
        "Sender": "\"dev\" <dev-bounces@dpdk.org>"
    },
    "content": "Optimize Tx path by using AVX512 instructions and vectorize the\ntx free bufs process.\n\nSigned-off-by: Leyi Rong <leyi.rong@intel.com>\nSigned-off-by: Bruce Richardson <bruce.richardson@intel.com>\n---\n drivers/net/ice/ice_rxtx.h            |   4 +\n drivers/net/ice/ice_rxtx_vec_avx512.c | 147 ++++++++++++++++++++++----\n drivers/net/ice/ice_rxtx_vec_common.h |  36 +++++--\n 3 files changed, 158 insertions(+), 29 deletions(-)",
    "diff": "diff --git a/drivers/net/ice/ice_rxtx.h b/drivers/net/ice/ice_rxtx.h\nindex a39b41c05..08084f5c5 100644\n--- a/drivers/net/ice/ice_rxtx.h\n+++ b/drivers/net/ice/ice_rxtx.h\n@@ -88,6 +88,10 @@ struct ice_tx_entry {\n \tuint16_t last_id;\n };\n \n+struct ice_vec_tx_entry {\n+\tstruct rte_mbuf *mbuf;\n+};\n+\n struct ice_tx_queue {\n \tuint16_t nb_tx_desc; /* number of TX descriptors */\n \trte_iova_t tx_ring_dma; /* TX ring DMA address */\ndiff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c b/drivers/net/ice/ice_rxtx_vec_avx512.c\nindex 6a9d0a8ea..1bc1191d0 100644\n--- a/drivers/net/ice/ice_rxtx_vec_avx512.c\n+++ b/drivers/net/ice/ice_rxtx_vec_avx512.c\n@@ -665,6 +665,108 @@ ice_recv_scattered_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,\n \t\t\t\trx_pkts + retval, nb_pkts);\n }\n \n+static __rte_always_inline int\n+ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)\n+{\n+\tstruct ice_vec_tx_entry *txep;\n+\tuint32_t n;\n+\tuint32_t i;\n+\tint nb_free = 0;\n+\tstruct rte_mbuf *m, *free[ICE_TX_MAX_FREE_BUF_SZ];\n+\n+\t/* check DD bits on threshold descriptor */\n+\tif ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &\n+\t\t\trte_cpu_to_le_64(ICE_TXD_QW1_DTYPE_M)) !=\n+\t\t\trte_cpu_to_le_64(ICE_TX_DESC_DTYPE_DESC_DONE))\n+\t\treturn 0;\n+\n+\tn = txq->tx_rs_thresh;\n+\n+\t/* first buffer to free from S/W ring is at index\n+\t * tx_next_dd - (tx_rs_thresh - 1)\n+\t */\n+\ttxep = (void *)txq->sw_ring;\n+\ttxep += txq->tx_next_dd - (n - 1);\n+\n+\tif (txq->offloads & DEV_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {\n+\t\tstruct rte_mempool *mp = txep[0].mbuf->pool;\n+\t\tstruct rte_mempool_cache *cache = rte_mempool_default_cache(mp,\n+\t\t\t\trte_lcore_id());\n+\t\tvoid **cache_objs = &cache->objs[cache->len];\n+\n+\t\tif (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {\n+\t\t\trte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);\n+\t\t\tgoto done;\n+\t\t}\n+\n+\t\t/* The cache follows the following algorithm\n+\t\t *   1. Add the objects to the cache\n+\t\t *   2. 
Anything greater than the cache min value (if it\n+\t\t *   crosses the cache flush threshold) is flushed to the ring.\n+\t\t */\n+\t\t/* Add elements back into the cache */\n+\t\tuint32_t copied = 0;\n+\t\t/* n is multiple of 32 */\n+\t\twhile (copied < n) {\n+\t\t\tconst __m512i a = _mm512_loadu_si512(&txep[copied]);\n+\t\t\tconst __m512i b = _mm512_loadu_si512(&txep[copied + 8]);\n+\t\t\tconst __m512i c = _mm512_loadu_si512(&txep[copied + 16]);\n+\t\t\tconst __m512i d = _mm512_loadu_si512(&txep[copied + 24]);\n+\n+\t\t\t_mm512_storeu_si512(&cache_objs[copied], a);\n+\t\t\t_mm512_storeu_si512(&cache_objs[copied + 8], b);\n+\t\t\t_mm512_storeu_si512(&cache_objs[copied + 16], c);\n+\t\t\t_mm512_storeu_si512(&cache_objs[copied + 24], d);\n+\t\t\tcopied += 32;\n+\t\t}\n+\t\tcache->len += n;\n+\n+\t\tif (cache->len >= cache->flushthresh) {\n+\t\t\trte_mempool_ops_enqueue_bulk\n+\t\t\t\t(mp, &cache->objs[cache->size],\n+\t\t\t\t cache->len - cache->size);\n+\t\t\tcache->len = cache->size;\n+\t\t}\n+\t\tgoto done;\n+\t}\n+\n+\tm = rte_pktmbuf_prefree_seg(txep[0].mbuf);\n+\tif (likely(m)) {\n+\t\tfree[0] = m;\n+\t\tnb_free = 1;\n+\t\tfor (i = 1; i < n; i++) {\n+\t\t\tm = rte_pktmbuf_prefree_seg(txep[i].mbuf);\n+\t\t\tif (likely(m)) {\n+\t\t\t\tif (likely(m->pool == free[0]->pool)) {\n+\t\t\t\t\tfree[nb_free++] = m;\n+\t\t\t\t} else {\n+\t\t\t\t\trte_mempool_put_bulk(free[0]->pool,\n+\t\t\t\t\t\t\t     (void *)free,\n+\t\t\t\t\t\t\t     nb_free);\n+\t\t\t\t\tfree[0] = m;\n+\t\t\t\t\tnb_free = 1;\n+\t\t\t\t}\n+\t\t\t}\n+\t\t}\n+\t\trte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);\n+\t} else {\n+\t\tfor (i = 1; i < n; i++) {\n+\t\t\tm = rte_pktmbuf_prefree_seg(txep[i].mbuf);\n+\t\t\tif (m)\n+\t\t\t\trte_mempool_put(m->pool, m);\n+\t\t}\n+\t}\n+\n+done:\n+\t/* buffers were freed, update counters */\n+\ttxq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);\n+\ttxq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);\n+\tif (txq->tx_next_dd >= txq->nb_tx_desc)\n+\t\ttxq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);\n+\n+\treturn txq->tx_rs_thresh;\n+}\n+\n static inline void\n ice_vtx1(volatile struct ice_tx_desc *txdp,\n \t struct rte_mbuf *pkt, uint64_t flags)\n@@ -686,13 +788,6 @@ ice_vtx(volatile struct ice_tx_desc *txdp,\n \tconst uint64_t hi_qw_tmpl = (ICE_TX_DESC_DTYPE_DATA |\n \t\t\t((uint64_t)flags  << ICE_TXD_QW1_CMD_S));\n \n-\t/* if unaligned on 32-bit boundary, do one to align */\n-\tif (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {\n-\t\tice_vtx1(txdp, *pkt, flags);\n-\t\tnb_pkts--, txdp++, pkt++;\n-\t}\n-\n-\t/* do two at a time while possible, in bursts */\n \tfor (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {\n \t\tuint64_t hi_qw3 =\n \t\t\thi_qw_tmpl |\n@@ -711,20 +806,17 @@ ice_vtx(volatile struct ice_tx_desc *txdp,\n \t\t\t((uint64_t)pkt[0]->data_len <<\n \t\t\t ICE_TXD_QW1_TX_BUF_SZ_S);\n \n-\t\t__m256i desc2_3 =\n-\t\t\t_mm256_set_epi64x\n+\t\t__m512i desc0_3 =\n+\t\t\t_mm512_set_epi64\n \t\t\t\t(hi_qw3,\n \t\t\t\t pkt[3]->buf_physaddr + pkt[3]->data_off,\n \t\t\t\t hi_qw2,\n-\t\t\t\t pkt[2]->buf_physaddr + pkt[2]->data_off);\n-\t\t__m256i desc0_1 =\n-\t\t\t_mm256_set_epi64x\n-\t\t\t\t(hi_qw1,\n+\t\t\t\t pkt[2]->buf_physaddr + pkt[2]->data_off,\n+\t\t\t\t hi_qw1,\n \t\t\t\t pkt[1]->buf_physaddr + pkt[1]->data_off,\n \t\t\t\t hi_qw0,\n \t\t\t\t pkt[0]->buf_physaddr + pkt[0]->data_off);\n-\t\t_mm256_store_si256((void *)(txdp + 2), desc2_3);\n-\t\t_mm256_store_si256((void *)txdp, desc0_1);\n+\t\t_mm512_storeu_si512((void *)txdp, 
desc0_3);\n \t}\n \n \t/* do any last ones */\n@@ -734,13 +826,23 @@ ice_vtx(volatile struct ice_tx_desc *txdp,\n \t}\n }\n \n+static __rte_always_inline void\n+ice_tx_backlog_entry_avx512(struct ice_vec_tx_entry *txep,\n+\t\t\t    struct rte_mbuf **tx_pkts, uint16_t nb_pkts)\n+{\n+\tint i;\n+\n+\tfor (i = 0; i < (int)nb_pkts; ++i)\n+\t\ttxep[i].mbuf = tx_pkts[i];\n+}\n+\n static inline uint16_t\n ice_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,\n \t\t\t\tuint16_t nb_pkts)\n {\n \tstruct ice_tx_queue *txq = (struct ice_tx_queue *)tx_queue;\n \tvolatile struct ice_tx_desc *txdp;\n-\tstruct ice_tx_entry *txep;\n+\tstruct ice_vec_tx_entry *txep;\n \tuint16_t n, nb_commit, tx_id;\n \tuint64_t flags = ICE_TD_CMD;\n \tuint64_t rs = ICE_TX_DESC_CMD_RS | ICE_TD_CMD;\n@@ -749,7 +851,7 @@ ice_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,\n \tnb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);\n \n \tif (txq->nb_tx_free < txq->tx_free_thresh)\n-\t\tice_tx_free_bufs(txq);\n+\t\tice_tx_free_bufs_avx512(txq);\n \n \tnb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);\n \tif (unlikely(nb_pkts == 0))\n@@ -757,13 +859,14 @@ ice_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,\n \n \ttx_id = txq->tx_tail;\n \ttxdp = &txq->tx_ring[tx_id];\n-\ttxep = &txq->sw_ring[tx_id];\n+\ttxep = (void *)txq->sw_ring;\n+\ttxep += tx_id;\n \n \ttxq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);\n \n \tn = (uint16_t)(txq->nb_tx_desc - tx_id);\n \tif (nb_commit >= n) {\n-\t\tice_tx_backlog_entry(txep, tx_pkts, n);\n+\t\tice_tx_backlog_entry_avx512(txep, tx_pkts, n);\n \n \t\tice_vtx(txdp, tx_pkts, n - 1, flags);\n \t\ttx_pkts += (n - 1);\n@@ -777,11 +880,11 @@ ice_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,\n \t\ttxq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);\n \n \t\t/* avoid reach the end of ring */\n-\t\ttxdp = &txq->tx_ring[tx_id];\n-\t\ttxep = &txq->sw_ring[tx_id];\n+\t\ttxdp = txq->tx_ring;\n+\t\ttxep = (void *)txq->sw_ring;\n \t}\n \n-\tice_tx_backlog_entry(txep, tx_pkts, nb_commit);\n+\tice_tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);\n \n \tice_vtx(txdp, tx_pkts, nb_commit, flags);\n \ndiff --git a/drivers/net/ice/ice_rxtx_vec_common.h b/drivers/net/ice/ice_rxtx_vec_common.h\nindex 46e3be98a..ee0bb1798 100644\n--- a/drivers/net/ice/ice_rxtx_vec_common.h\n+++ b/drivers/net/ice/ice_rxtx_vec_common.h\n@@ -189,16 +189,38 @@ _ice_tx_queue_release_mbufs_vec(struct ice_tx_queue *txq)\n \t *  so need to free remains more carefully.\n \t */\n \ti = txq->tx_next_dd - txq->tx_rs_thresh + 1;\n-\tif (txq->tx_tail < i) {\n-\t\tfor (; i < txq->nb_tx_desc; i++) {\n+\n+#ifdef CC_AVX512_SUPPORT\n+\tstruct rte_eth_dev *dev = txq->vsi->adapter->eth_dev;\n+\n+\tif (dev->tx_pkt_burst == ice_xmit_pkts_vec_avx512) {\n+\t\tstruct ice_vec_tx_entry *swr = (void *)txq->sw_ring;\n+\n+\t\tif (txq->tx_tail < i) {\n+\t\t\tfor (; i < txq->nb_tx_desc; i++) {\n+\t\t\t\trte_pktmbuf_free_seg(swr[i].mbuf);\n+\t\t\t\tswr[i].mbuf = NULL;\n+\t\t\t}\n+\t\t\ti = 0;\n+\t\t}\n+\t\tfor (; i < txq->tx_tail; i++) {\n+\t\t\trte_pktmbuf_free_seg(swr[i].mbuf);\n+\t\t\tswr[i].mbuf = NULL;\n+\t\t}\n+\t} else\n+#endif\n+\t{\n+\t\tif (txq->tx_tail < i) {\n+\t\t\tfor (; i < txq->nb_tx_desc; i++) {\n+\t\t\t\trte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);\n+\t\t\t\ttxq->sw_ring[i].mbuf = NULL;\n+\t\t\t}\n+\t\t\ti = 0;\n+\t\t}\n+\t\tfor (; i < txq->tx_tail; i++) {\n \t\t\trte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);\n \t\t\ttxq->sw_ring[i].mbuf = 
NULL;\n \t\t}\n-\t\ti = 0;\n-\t}\n-\tfor (; i < txq->tx_tail; i++) {\n-\t\trte_pktmbuf_free_seg(txq->sw_ring[i].mbuf);\n-\t\ttxq->sw_ring[i].mbuf = NULL;\n \t}\n }\n \n",
    "prefixes": [
        "v1",
        "2/2"
    ]
}
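
The response is largely self-describing: "mbox" is a raw mbox download of this single patch, "series"[0]["mbox"] covers the whole series, and "checks" lists the per-test results behind the aggregate "check": "fail". As a sketch of consuming these fields, the snippet below fetches the series mbox and applies it to a local tree; the output file name, checkout path, and git invocation are assumptions:

import subprocess
import requests

patch = requests.get("https://patches.dpdk.org/api/patches/77118/").json()

# Aggregate CI state for this patch is "fail"; per-check detail sits
# behind the "checks" URL.
if patch["check"] != "success":
    print(f"warning: check state is {patch['check']!r}, see {patch['checks']}")

# Download the whole series (this patch is 2/2 of series 12088) as one mbox.
mbox = requests.get(patch["series"][0]["mbox"])
mbox.raise_for_status()
with open("series-12088.mbox", "wb") as f:
    f.write(mbox.content)

# Apply to an existing DPDK checkout (path is hypothetical).
subprocess.run(["git", "am", "../series-12088.mbox"], cwd="dpdk", check=True)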