From patchwork Wed Apr 14 07:34:07 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Wenzhuo Lu X-Patchwork-Id: 91423 X-Patchwork-Delegate: qi.z.zhang@intel.com Return-Path: X-Original-To: patchwork@inbox.dpdk.org Delivered-To: patchwork@inbox.dpdk.org Received: from mails.dpdk.org (mails.dpdk.org [217.70.189.124]) by inbox.dpdk.org (Postfix) with ESMTP id E9EBFA0524; Wed, 14 Apr 2021 09:34:34 +0200 (CEST) Received: from [217.70.189.124] (localhost [127.0.0.1]) by mails.dpdk.org (Postfix) with ESMTP id AB91E16179B; Wed, 14 Apr 2021 09:34:28 +0200 (CEST) Received: from mga12.intel.com (mga12.intel.com [192.55.52.136]) by mails.dpdk.org (Postfix) with ESMTP id 04B2A160FE5 for ; Wed, 14 Apr 2021 09:34:24 +0200 (CEST) IronPort-SDR: /ShMj4PR1jf5bt4FAt54HMG1HHBkMghVnGl+8m8EHA3Qt5Pl2JjXokHytohnMF32jSMUkYCxMu F3ebMS0qz6ig== X-IronPort-AV: E=McAfee;i="6200,9189,9953"; a="174084748" X-IronPort-AV: E=Sophos;i="5.82,221,1613462400"; d="scan'208";a="174084748" Received: from fmsmga003.fm.intel.com ([10.253.24.29]) by fmsmga106.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 14 Apr 2021 00:34:23 -0700 IronPort-SDR: HiQA33wgUNIKDsry3WhUjql+lBg0dQLHkRvPzazTahF37JKwvcqU8MeC+QaLMJhoY/QsidD0xn QDAQ7VFqSRoQ== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.82,221,1613462400"; d="scan'208";a="450705154" Received: from dpdk-wenzhuo-haswell.sh.intel.com ([10.67.111.137]) by FMSMGA003.fm.intel.com with ESMTP; 14 Apr 2021 00:34:21 -0700 From: Wenzhuo Lu To: dev@dpdk.org Cc: Wenzhuo Lu Date: Wed, 14 Apr 2021 15:34:07 +0800 Message-Id: <1618385649-44717-3-git-send-email-wenzhuo.lu@intel.com> X-Mailer: git-send-email 1.9.3 In-Reply-To: <1618385649-44717-1-git-send-email-wenzhuo.lu@intel.com> References: <1617947944-130983-1-git-send-email-wenzhuo.lu@intel.com> <1618385649-44717-1-git-send-email-wenzhuo.lu@intel.com> Subject: [dpdk-dev] [PATCH v5 2/4] net/iavf: add offload path for Tx AVX512 X-BeenThere: dev@dpdk.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: DPDK patches and discussions List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: dev-bounces@dpdk.org Sender: "dev" Add a specific path for TX AVX512. In this path, support the HW offload features, like, checksum insertion, VLAN insertion. This path is chosen automatically according to the configuration. 'inline' is used, then the duplicate code is generated by the compiler. Signed-off-by: Wenzhuo Lu --- drivers/net/iavf/iavf_rxtx.c | 57 +++++++++++------ drivers/net/iavf/iavf_rxtx.h | 14 +++- drivers/net/iavf/iavf_rxtx_vec_avx512.c | 110 +++++++++++++++++++------------- drivers/net/iavf/iavf_rxtx_vec_common.h | 98 ++++++++++++++++++++++++++-- 4 files changed, 210 insertions(+), 69 deletions(-) diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c index bd0b7ee..099ede7 100644 --- a/drivers/net/iavf/iavf_rxtx.c +++ b/drivers/net/iavf/iavf_rxtx.c @@ -160,7 +160,7 @@ static inline bool check_tx_vec_allow(struct iavf_tx_queue *txq) { - if (!(txq->offloads & IAVF_NO_VECTOR_FLAGS) && + if (!(txq->offloads & IAVF_TX_NO_VECTOR_FLAGS) && txq->rs_thresh >= IAVF_VPMD_TX_MAX_BURST && txq->rs_thresh <= IAVF_VPMD_TX_MAX_FREE_BUF) { PMD_INIT_LOG(DEBUG, "Vector tx can be enabled on this txq."); @@ -2498,17 +2498,23 @@ #ifdef RTE_ARCH_X86 struct iavf_tx_queue *txq; int i; + int check_ret; + bool use_sse = false; bool use_avx2 = false; -#ifdef CC_AVX512_SUPPORT bool use_avx512 = false; -#endif - if (!iavf_tx_vec_dev_check(dev) && - rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) { - if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 || - rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) && - rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256) - use_avx2 = true; + check_ret = iavf_tx_vec_dev_check(dev); + + if (check_ret >= 0 && + rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) { + /* SSE and AVX2 not support offload path yet. */ + if (check_ret == IAVF_VECTOR_PATH) { + use_sse = true; + if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 || + rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) && + rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256) + use_avx2 = true; + } #ifdef CC_AVX512_SUPPORT if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 && rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 && @@ -2516,15 +2522,29 @@ use_avx512 = true; #endif - PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).", - use_avx2 ? "avx2 " : "", - dev->data->port_id); - dev->tx_pkt_burst = use_avx2 ? - iavf_xmit_pkts_vec_avx2 : - iavf_xmit_pkts_vec; + if (!use_sse && !use_avx2 && !use_avx512) + goto normal; + + if (!use_avx512) { + PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).", + use_avx2 ? "avx2 " : "", + dev->data->port_id); + dev->tx_pkt_burst = use_avx2 ? + iavf_xmit_pkts_vec_avx2 : + iavf_xmit_pkts_vec; + } #ifdef CC_AVX512_SUPPORT - if (use_avx512) - dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512; + if (use_avx512) { + if (check_ret == IAVF_VECTOR_PATH) { + dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512; + PMD_DRV_LOG(DEBUG, "Using AVX512 Vector Tx (port %d).", + dev->data->port_id); + } else { + dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512_offload; + PMD_DRV_LOG(DEBUG, "Using AVX512 OFFLOAD Vector Tx (port %d).", + dev->data->port_id); + } + } #endif dev->tx_pkt_prepare = NULL; @@ -2544,8 +2564,9 @@ return; } -#endif +normal: +#endif PMD_DRV_LOG(DEBUG, "Using Basic Tx callback (port=%d).", dev->data->port_id); dev->tx_pkt_burst = iavf_xmit_pkts; diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h index f56dd74..bead119 100644 --- a/drivers/net/iavf/iavf_rxtx.h +++ b/drivers/net/iavf/iavf_rxtx.h @@ -23,14 +23,21 @@ #define IAVF_VPMD_DESCS_PER_LOOP 4 #define IAVF_VPMD_TX_MAX_FREE_BUF 64 -#define IAVF_NO_VECTOR_FLAGS ( \ +#define IAVF_TX_NO_VECTOR_FLAGS ( \ DEV_TX_OFFLOAD_MULTI_SEGS | \ + DEV_TX_OFFLOAD_TCP_TSO) + +#define IAVF_TX_VECTOR_OFFLOAD ( \ DEV_TX_OFFLOAD_VLAN_INSERT | \ + DEV_TX_OFFLOAD_QINQ_INSERT | \ + DEV_TX_OFFLOAD_IPV4_CKSUM | \ DEV_TX_OFFLOAD_SCTP_CKSUM | \ DEV_TX_OFFLOAD_UDP_CKSUM | \ - DEV_TX_OFFLOAD_TCP_TSO | \ DEV_TX_OFFLOAD_TCP_CKSUM) +#define IAVF_VECTOR_PATH 0 +#define IAVF_VECTOR_OFFLOAD_PATH 1 + #define DEFAULT_TX_RS_THRESH 32 #define DEFAULT_TX_FREE_THRESH 32 @@ -488,6 +495,9 @@ uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue, uint16_t nb_pkts); uint16_t iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts); +uint16_t iavf_xmit_pkts_vec_avx512_offload(void *tx_queue, + struct rte_mbuf **tx_pkts, + uint16_t nb_pkts); int iavf_txq_vec_setup_avx512(struct iavf_tx_queue *txq); uint8_t iavf_proto_xtr_type_to_rxdid(uint8_t xtr_type); diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c index 385f44e..f4dd222 100644 --- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c +++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c @@ -1518,14 +1518,16 @@ txep[i].mbuf = tx_pkts[i]; } -static inline void +static __rte_always_inline void iavf_vtx1(volatile struct iavf_tx_desc *txdp, - struct rte_mbuf *pkt, uint64_t flags) + struct rte_mbuf *pkt, uint64_t flags, bool offload) { uint64_t high_qw = (IAVF_TX_DESC_DTYPE_DATA | ((uint64_t)flags << IAVF_TXD_QW1_CMD_SHIFT) | ((uint64_t)pkt->data_len << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT)); + if (offload) + iavf_txd_enable_offload(pkt, &high_qw); __m128i descriptor = _mm_set_epi64x(high_qw, pkt->buf_iova + pkt->data_off); @@ -1534,62 +1536,70 @@ #define IAVF_TX_LEN_MASK 0xAA #define IAVF_TX_OFF_MASK 0x55 -static inline void +static __rte_always_inline void iavf_vtx(volatile struct iavf_tx_desc *txdp, - struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags) + struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags, + bool offload) { const uint64_t hi_qw_tmpl = (IAVF_TX_DESC_DTYPE_DATA | ((uint64_t)flags << IAVF_TXD_QW1_CMD_SHIFT)); /* if unaligned on 32-bit boundary, do one to align */ if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) { - iavf_vtx1(txdp, *pkt, flags); + iavf_vtx1(txdp, *pkt, flags, offload); nb_pkts--, txdp++, pkt++; } /* do 4 at a time while possible, in bursts */ for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) { - __m512i desc4 = - _mm512_set_epi64 - ((uint64_t)pkt[3]->data_len, - pkt[3]->buf_iova, - (uint64_t)pkt[2]->data_len, - pkt[2]->buf_iova, - (uint64_t)pkt[1]->data_len, - pkt[1]->buf_iova, - (uint64_t)pkt[0]->data_len, - pkt[0]->buf_iova); - __m512i hi_qw_tmpl_4 = _mm512_set1_epi64(hi_qw_tmpl); - __m512i data_off_4 = + uint64_t hi_qw3 = + hi_qw_tmpl | + ((uint64_t)pkt[3]->data_len << + IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); + if (offload) + iavf_txd_enable_offload(pkt[3], &hi_qw3); + uint64_t hi_qw2 = + hi_qw_tmpl | + ((uint64_t)pkt[2]->data_len << + IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); + if (offload) + iavf_txd_enable_offload(pkt[2], &hi_qw2); + uint64_t hi_qw1 = + hi_qw_tmpl | + ((uint64_t)pkt[1]->data_len << + IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); + if (offload) + iavf_txd_enable_offload(pkt[1], &hi_qw1); + uint64_t hi_qw0 = + hi_qw_tmpl | + ((uint64_t)pkt[0]->data_len << + IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); + if (offload) + iavf_txd_enable_offload(pkt[0], &hi_qw0); + + __m512i desc0_3 = _mm512_set_epi64 - (0, - pkt[3]->data_off, - 0, - pkt[2]->data_off, - 0, - pkt[1]->data_off, - 0, - pkt[0]->data_off); - - desc4 = _mm512_mask_slli_epi64(desc4, IAVF_TX_LEN_MASK, desc4, - IAVF_TXD_QW1_TX_BUF_SZ_SHIFT); - desc4 = _mm512_mask_or_epi64(desc4, IAVF_TX_LEN_MASK, desc4, - hi_qw_tmpl_4); - desc4 = _mm512_mask_add_epi64(desc4, IAVF_TX_OFF_MASK, desc4, - data_off_4); - _mm512_storeu_si512((void *)txdp, desc4); + (hi_qw3, + pkt[3]->buf_iova + pkt[3]->data_off, + hi_qw2, + pkt[2]->buf_iova + pkt[2]->data_off, + hi_qw1, + pkt[1]->buf_iova + pkt[1]->data_off, + hi_qw0, + pkt[0]->buf_iova + pkt[0]->data_off); + _mm512_storeu_si512((void *)txdp, desc0_3); } /* do any last ones */ while (nb_pkts) { - iavf_vtx1(txdp, *pkt, flags); + iavf_vtx1(txdp, *pkt, flags, offload); txdp++, pkt++, nb_pkts--; } } -static inline uint16_t +static __rte_always_inline uint16_t iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, - uint16_t nb_pkts) + uint16_t nb_pkts, bool offload) { struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue; volatile struct iavf_tx_desc *txdp; @@ -1620,11 +1630,11 @@ if (nb_commit >= n) { tx_backlog_entry_avx512(txep, tx_pkts, n); - iavf_vtx(txdp, tx_pkts, n - 1, flags); + iavf_vtx(txdp, tx_pkts, n - 1, flags, offload); tx_pkts += (n - 1); txdp += (n - 1); - iavf_vtx1(txdp, *tx_pkts++, rs); + iavf_vtx1(txdp, *tx_pkts++, rs, offload); nb_commit = (uint16_t)(nb_commit - n); @@ -1639,7 +1649,7 @@ tx_backlog_entry_avx512(txep, tx_pkts, nb_commit); - iavf_vtx(txdp, tx_pkts, nb_commit, flags); + iavf_vtx(txdp, tx_pkts, nb_commit, flags, offload); tx_id = (uint16_t)(tx_id + nb_commit); if (tx_id > txq->next_rs) { @@ -1657,9 +1667,9 @@ return nb_pkts; } -uint16_t -iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, - uint16_t nb_pkts) +static __rte_always_inline uint16_t +iavf_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts, bool offload) { uint16_t nb_tx = 0; struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue; @@ -1669,7 +1679,7 @@ num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh); ret = iavf_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx], - num); + num, offload); nb_tx += ret; nb_pkts -= ret; if (ret < num) @@ -1679,6 +1689,13 @@ return nb_tx; } +uint16_t +iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts) +{ + return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, false); +} + static inline void iavf_tx_queue_release_mbufs_avx512(struct iavf_tx_queue *txq) { @@ -1709,3 +1726,10 @@ txq->ops = &avx512_vec_txq_ops; return 0; } + +uint16_t +iavf_xmit_pkts_vec_avx512_offload(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t nb_pkts) +{ + return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, true); +} diff --git a/drivers/net/iavf/iavf_rxtx_vec_common.h b/drivers/net/iavf/iavf_rxtx_vec_common.h index 816e16a..62a333f 100644 --- a/drivers/net/iavf/iavf_rxtx_vec_common.h +++ b/drivers/net/iavf/iavf_rxtx_vec_common.h @@ -240,14 +240,17 @@ if (!txq) return -1; - if (txq->offloads & IAVF_NO_VECTOR_FLAGS) - return -1; - if (txq->rs_thresh < IAVF_VPMD_TX_MAX_BURST || txq->rs_thresh > IAVF_VPMD_TX_MAX_FREE_BUF) return -1; - return 0; + if (txq->offloads & IAVF_TX_NO_VECTOR_FLAGS) + return -1; + + if (txq->offloads & IAVF_TX_VECTOR_OFFLOAD) + return IAVF_VECTOR_OFFLOAD_PATH; + + return IAVF_VECTOR_PATH; } static inline int @@ -270,14 +273,97 @@ { int i; struct iavf_tx_queue *txq; + int ret; + int result = 0; for (i = 0; i < dev->data->nb_tx_queues; i++) { txq = dev->data->tx_queues[i]; - if (iavf_tx_vec_queue_default(txq)) + ret = iavf_tx_vec_queue_default(txq); + + if (ret < 0) return -1; + if (ret > result) + result = ret; } - return 0; + return result; +} + +/****************************************************************************** + * If user knows a specific offload is not enabled by APP, + * the macro can be commented to save the effort of fast path. + * Currently below 2 features are supported in TX path, + * 1, checksum offload + * 2, VLAN/QINQ insertion + ******************************************************************************/ +#define IAVF_TX_CSUM_OFFLOAD +#define IAVF_TX_VLAN_QINQ_OFFLOAD + +static __rte_always_inline void +iavf_txd_enable_offload(__rte_unused struct rte_mbuf *tx_pkt, + uint64_t *txd_hi) +{ +#if defined(IAVF_TX_CSUM_OFFLOAD) || defined(IAVF_TX_VLAN_QINQ_OFFLOAD) + uint64_t ol_flags = tx_pkt->ol_flags; +#endif + uint32_t td_cmd = 0; +#ifdef IAVF_TX_CSUM_OFFLOAD + uint32_t td_offset = 0; +#endif + +#ifdef IAVF_TX_CSUM_OFFLOAD + /* Set MACLEN */ + td_offset |= (tx_pkt->l2_len >> 1) << + IAVF_TX_DESC_LENGTH_MACLEN_SHIFT; + + /* Enable L3 checksum offloads */ + if (ol_flags & PKT_TX_IP_CKSUM) { + td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4_CSUM; + td_offset |= (tx_pkt->l3_len >> 2) << + IAVF_TX_DESC_LENGTH_IPLEN_SHIFT; + } else if (ol_flags & PKT_TX_IPV4) { + td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4; + td_offset |= (tx_pkt->l3_len >> 2) << + IAVF_TX_DESC_LENGTH_IPLEN_SHIFT; + } else if (ol_flags & PKT_TX_IPV6) { + td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV6; + td_offset |= (tx_pkt->l3_len >> 2) << + IAVF_TX_DESC_LENGTH_IPLEN_SHIFT; + } + + /* Enable L4 checksum offloads */ + switch (ol_flags & PKT_TX_L4_MASK) { + case PKT_TX_TCP_CKSUM: + td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_TCP; + td_offset |= (sizeof(struct rte_tcp_hdr) >> 2) << + IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; + break; + case PKT_TX_SCTP_CKSUM: + td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_SCTP; + td_offset |= (sizeof(struct rte_sctp_hdr) >> 2) << + IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; + break; + case PKT_TX_UDP_CKSUM: + td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_UDP; + td_offset |= (sizeof(struct rte_udp_hdr) >> 2) << + IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT; + break; + default: + break; + } + + *txd_hi |= ((uint64_t)td_offset) << IAVF_TXD_QW1_OFFSET_SHIFT; +#endif + +#ifdef IAVF_TX_VLAN_QINQ_OFFLOAD + if (ol_flags & (PKT_TX_VLAN | PKT_TX_QINQ)) { + td_cmd |= IAVF_TX_DESC_CMD_IL2TAG1; + *txd_hi |= ((uint64_t)tx_pkt->vlan_tci << + IAVF_TXD_QW1_L2TAG1_SHIFT); + } +#endif + + *txd_hi |= ((uint64_t)td_cmd) << IAVF_TXD_QW1_CMD_SHIFT; } #ifdef CC_AVX2_SUPPORT