[v3,8/8] net/ice: support vector AVX2 in TX
Checks
Commit Message
Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
doc/guides/rel_notes/release_19_05.rst | 4 +
drivers/net/ice/ice_rxtx.c | 13 ++-
drivers/net/ice/ice_rxtx.h | 2 +
drivers/net/ice/ice_rxtx_vec_avx2.c | 158 +++++++++++++++++++++++++++++++++
4 files changed, 175 insertions(+), 2 deletions(-)
Comments
On 3/15/2019 6:22 AM, Wenzhuo Lu wrote:
> Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
> ---
> doc/guides/rel_notes/release_19_05.rst | 4 +
> drivers/net/ice/ice_rxtx.c | 13 ++-
> drivers/net/ice/ice_rxtx.h | 2 +
> drivers/net/ice/ice_rxtx_vec_avx2.c | 158 +++++++++++++++++++++++++++++++++
> 4 files changed, 175 insertions(+), 2 deletions(-)
>
> diff --git a/doc/guides/rel_notes/release_19_05.rst b/doc/guides/rel_notes/release_19_05.rst
> index 61a2c73..610c4cd 100644
> --- a/doc/guides/rel_notes/release_19_05.rst
> +++ b/doc/guides/rel_notes/release_19_05.rst
> @@ -91,6 +91,10 @@ New Features
>
> * Added promiscuous mode support.
>
> +* **Added support of vector instructions on ICE.**
> +
> + Added support of SSE and AVX2 instructions in ICE RX and TX path.
> +
ice documentation doesn't have any information about vector path, can you please
update it?
I think it can be good to document when vector path is used? How to decide
scalar, sse or avx to use? What will prevent using vector path, like any offload
or any specific config?
Thanks,
ferruh
Hi Ferruh,
> -----Original Message-----
> From: Yigit, Ferruh
> Sent: Saturday, March 16, 2019 1:55 AM
> To: Lu, Wenzhuo <wenzhuo.lu@intel.com>; dev@dpdk.org; Zhang, Qi Z
> <qi.z.zhang@intel.com>
> Subject: Re: [dpdk-dev] [PATCH v3 8/8] net/ice: support vector AVX2 in TX
>
> On 3/15/2019 6:22 AM, Wenzhuo Lu wrote:
> > Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
> > ---
> > doc/guides/rel_notes/release_19_05.rst | 4 +
> > drivers/net/ice/ice_rxtx.c | 13 ++-
> > drivers/net/ice/ice_rxtx.h | 2 +
> > drivers/net/ice/ice_rxtx_vec_avx2.c | 158
> +++++++++++++++++++++++++++++++++
> > 4 files changed, 175 insertions(+), 2 deletions(-)
> >
> > diff --git a/doc/guides/rel_notes/release_19_05.rst
> > b/doc/guides/rel_notes/release_19_05.rst
> > index 61a2c73..610c4cd 100644
> > --- a/doc/guides/rel_notes/release_19_05.rst
> > +++ b/doc/guides/rel_notes/release_19_05.rst
> > @@ -91,6 +91,10 @@ New Features
> >
> > * Added promiscuous mode support.
> >
> > +* **Added support of vector instructions on ICE.**
> > +
> > + Added support of SSE and AVX2 instructions in ICE RX and TX path.
> > +
>
> ice documentation doesn't have any information about vector path, can you
> please update it?
>
> I think it can be good to document when vector path is used? How to decide
> scalar, sse or avx to use? What will prevent using vector path, like any
> offload or any specific config?
Thanks for the comments. Will add more info here.
>
> Thanks,
> ferruh
@@ -91,6 +91,10 @@ New Features
* Added promiscuous mode support.
+* **Added support of vector instructions on ICE.**
+
+ Added support of SSE and AVX2 instructions in ICE RX and TX path.
+
Removed Items
-------------
@@ -2354,15 +2354,24 @@ void __attribute__((cold))
#ifdef RTE_ARCH_X86
struct ice_tx_queue *txq;
int i;
+ bool use_avx2 = false;
if (!ice_tx_vec_dev_check(dev)) {
for (i = 0; i < dev->data->nb_tx_queues; i++) {
txq = dev->data->tx_queues[i];
(void)ice_txq_vec_setup(txq);
}
- PMD_DRV_LOG(DEBUG, "Using Vector Tx (port %d).",
+
+ if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+ rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1)
+ use_avx2 = true;
+
+ PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
+ use_avx2 ? "avx2 " : "",
dev->data->port_id);
- dev->tx_pkt_burst = ice_xmit_pkts_vec;
+ dev->tx_pkt_burst = use_avx2 ?
+ ice_xmit_pkts_vec_avx2 :
+ ice_xmit_pkts_vec;
dev->tx_pkt_prepare = NULL;
return;
@@ -187,5 +187,7 @@ uint16_t ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
uint16_t ice_recv_scattered_pkts_vec_avx2(void *rx_queue,
struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
+uint16_t ice_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts);
#endif
#endif /* _ICE_RXTX_H_ */
@@ -675,3 +675,161 @@
return retval + ice_recv_scattered_burst_vec_avx2(rx_queue,
rx_pkts + retval, nb_pkts);
}
+
+static inline void
+ice_vtx1(volatile struct ice_tx_desc *txdp,
+ struct rte_mbuf *pkt, uint64_t flags)
+{
+ uint64_t high_qw =
+ (ICE_TX_DESC_DTYPE_DATA |
+ ((uint64_t)flags << ICE_TXD_QW1_CMD_S) |
+ ((uint64_t)pkt->data_len << ICE_TXD_QW1_TX_BUF_SZ_S));
+
+ __m128i descriptor = _mm_set_epi64x(high_qw,
+ pkt->buf_physaddr + pkt->data_off);
+ _mm_store_si128((__m128i *)txdp, descriptor);
+}
+
+static inline void
+ice_vtx(volatile struct ice_tx_desc *txdp,
+ struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
+{
+ const uint64_t hi_qw_tmpl = (ICE_TX_DESC_DTYPE_DATA |
+ ((uint64_t)flags << ICE_TXD_QW1_CMD_S));
+
+ /* if unaligned on 32-bit boundary, do one to align */
+ if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
+ ice_vtx1(txdp, *pkt, flags);
+ nb_pkts--, txdp++, pkt++;
+ }
+
+ /* do two at a time while possible, in bursts */
+ for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
+ uint64_t hi_qw3 =
+ hi_qw_tmpl |
+ ((uint64_t)pkt[3]->data_len <<
+ ICE_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw2 =
+ hi_qw_tmpl |
+ ((uint64_t)pkt[2]->data_len <<
+ ICE_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw1 =
+ hi_qw_tmpl |
+ ((uint64_t)pkt[1]->data_len <<
+ ICE_TXD_QW1_TX_BUF_SZ_S);
+ uint64_t hi_qw0 =
+ hi_qw_tmpl |
+ ((uint64_t)pkt[0]->data_len <<
+ ICE_TXD_QW1_TX_BUF_SZ_S);
+
+ __m256i desc2_3 =
+ _mm256_set_epi64x
+ (hi_qw3,
+ pkt[3]->buf_physaddr + pkt[3]->data_off,
+ hi_qw2,
+ pkt[2]->buf_physaddr + pkt[2]->data_off);
+ __m256i desc0_1 =
+ _mm256_set_epi64x
+ (hi_qw1,
+ pkt[1]->buf_physaddr + pkt[1]->data_off,
+ hi_qw0,
+ pkt[0]->buf_physaddr + pkt[0]->data_off);
+ _mm256_store_si256((void *)(txdp + 2), desc2_3);
+ _mm256_store_si256((void *)txdp, desc0_1);
+ }
+
+ /* do any last ones */
+ while (nb_pkts) {
+ ice_vtx1(txdp, *pkt, flags);
+ txdp++, pkt++, nb_pkts--;
+ }
+}
+
+static inline uint16_t
+ice_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ struct ice_tx_queue *txq = (struct ice_tx_queue *)tx_queue;
+ volatile struct ice_tx_desc *txdp;
+ struct ice_tx_entry *txep;
+ uint16_t n, nb_commit, tx_id;
+ uint64_t flags = ICE_TD_CMD;
+ uint64_t rs = ICE_TX_DESC_CMD_RS | ICE_TD_CMD;
+
+ /* cross rx_thresh boundary is not allowed */
+ nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+
+ if (txq->nb_tx_free < txq->tx_free_thresh)
+ ice_tx_free_bufs(txq);
+
+ nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+ if (unlikely(nb_pkts == 0))
+ return 0;
+
+ tx_id = txq->tx_tail;
+ txdp = &txq->tx_ring[tx_id];
+ txep = &txq->sw_ring[tx_id];
+
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+
+ n = (uint16_t)(txq->nb_tx_desc - tx_id);
+ if (nb_commit >= n) {
+ tx_backlog_entry(txep, tx_pkts, n);
+
+ ice_vtx(txdp, tx_pkts, n - 1, flags);
+ tx_pkts += (n - 1);
+ txdp += (n - 1);
+
+ ice_vtx1(txdp, *tx_pkts++, rs);
+
+ nb_commit = (uint16_t)(nb_commit - n);
+
+ tx_id = 0;
+ txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+
+ /* avoid reach the end of ring */
+ txdp = &txq->tx_ring[tx_id];
+ txep = &txq->sw_ring[tx_id];
+ }
+
+ tx_backlog_entry(txep, tx_pkts, nb_commit);
+
+ ice_vtx(txdp, tx_pkts, nb_commit, flags);
+
+ tx_id = (uint16_t)(tx_id + nb_commit);
+ if (tx_id > txq->tx_next_rs) {
+ txq->tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
+ rte_cpu_to_le_64(((uint64_t)ICE_TX_DESC_CMD_RS) <<
+ ICE_TXD_QW1_CMD_S);
+ txq->tx_next_rs =
+ (uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
+ }
+
+ txq->tx_tail = tx_id;
+
+ ICE_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);
+
+ return nb_pkts;
+}
+
+uint16_t
+ice_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ uint16_t nb_tx = 0;
+ struct ice_tx_queue *txq = (struct ice_tx_queue *)tx_queue;
+
+ while (nb_pkts) {
+ uint16_t ret, num;
+
+ num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
+ ret = ice_xmit_fixed_burst_vec_avx2(tx_queue, &tx_pkts[nb_tx],
+ num);
+ nb_tx += ret;
+ nb_pkts -= ret;
+ if (ret < num)
+ break;
+ }
+
+ return nb_tx;
+}