[v9,7/7] event/cnxk: add Tx event vector fastpath
Commit Message
From: Pavan Nikhilesh <pbhagavatula@marvell.com>
Add the Tx event vector fastpath and integrate the event vector Tx
routine into the Tx burst path.
Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
drivers/common/cnxk/roc_sso.h | 23 ++++++
drivers/event/cnxk/cn10k_eventdev.c | 3 +-
drivers/event/cnxk/cn10k_worker.h | 104 +++++++++++++++++++++++++--
drivers/event/cnxk/cn9k_worker.h | 4 +-
drivers/event/cnxk/cnxk_worker.h | 22 ------
drivers/net/cnxk/cn10k_tx.c | 2 +-
drivers/net/cnxk/cn10k_tx.h | 52 +++++++++-----
drivers/net/cnxk/cn10k_tx_mseg.c | 3 +-
drivers/net/cnxk/cn10k_tx_vec.c | 2 +-
drivers/net/cnxk/cn10k_tx_vec_mseg.c | 2 +-
10 files changed, 165 insertions(+), 52 deletions(-)
Comments
On Wed, Jul 14, 2021 at 2:33 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Add the Tx event vector fastpath and integrate the event vector Tx
> routine into the Tx burst path.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
Series Acked-by: Jerin Jacob <jerinj@marvell.com>
Series v9 applied to dpdk-next-net-eventdev/for-main. Thanks
> ---
> drivers/common/cnxk/roc_sso.h | 23 ++++++
> drivers/event/cnxk/cn10k_eventdev.c | 3 +-
> drivers/event/cnxk/cn10k_worker.h | 104 +++++++++++++++++++++++++--
> drivers/event/cnxk/cn9k_worker.h | 4 +-
> drivers/event/cnxk/cnxk_worker.h | 22 ------
> drivers/net/cnxk/cn10k_tx.c | 2 +-
> drivers/net/cnxk/cn10k_tx.h | 52 +++++++++-----
> drivers/net/cnxk/cn10k_tx_mseg.c | 3 +-
> drivers/net/cnxk/cn10k_tx_vec.c | 2 +-
> drivers/net/cnxk/cn10k_tx_vec_mseg.c | 2 +-
> 10 files changed, 165 insertions(+), 52 deletions(-)
>
> diff --git a/drivers/common/cnxk/roc_sso.h b/drivers/common/cnxk/roc_sso.h
> index a6030e7d8a..b28f6089cc 100644
> --- a/drivers/common/cnxk/roc_sso.h
> +++ b/drivers/common/cnxk/roc_sso.h
> @@ -44,6 +44,29 @@ struct roc_sso {
> uint8_t reserved[ROC_SSO_MEM_SZ] __plt_cache_aligned;
> } __plt_cache_aligned;
>
> +static __plt_always_inline void
> +roc_sso_hws_head_wait(uintptr_t tag_op)
> +{
> +#ifdef RTE_ARCH_ARM64
> + uint64_t tag;
> +
> + asm volatile(PLT_CPU_FEATURE_PREAMBLE
> + " ldr %[tag], [%[tag_op]] \n"
> + " tbnz %[tag], 35, done%= \n"
> + " sevl \n"
> + "rty%=: wfe \n"
> + " ldr %[tag], [%[tag_op]] \n"
> + " tbz %[tag], 35, rty%= \n"
> + "done%=: \n"
> + : [tag] "=&r"(tag)
> + : [tag_op] "r"(tag_op));
> +#else
> + /* Wait for the SWTAG/SWTAG_FULL operation */
> + while (!(plt_read64(tag_op) & BIT_ULL(35)))
> + ;
> +#endif
> +}
> +
> /* SSO device initialization */
> int __roc_api roc_sso_dev_init(struct roc_sso *roc_sso);
> int __roc_api roc_sso_dev_fini(struct roc_sso *roc_sso);
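
The helper above spins (or WFEs on arm64) until bit 35 of the HWS GWS tag
word is set, i.e. until this workslot holds the head of the flow and may
issue its LMTST. A minimal sketch of the calling pattern used later in this
patch (names as in the diff; base, lmt_id and pa are assumed to be prepared
by the caller):

	/* Only ordered stages (sched_type == RTE_SCHED_TYPE_ORDERED == 0)
	 * must reach HEAD before the store to the NIX is triggered.
	 */
	if (!sched_type)
		roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
	roc_lmt_submit_steorl(lmt_id, pa);	/* LMTST: descriptors -> NIX */
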
> diff --git a/drivers/event/cnxk/cn10k_eventdev.c b/drivers/event/cnxk/cn10k_eventdev.c
> index e85fa4785d..6f37c5bd23 100644
> --- a/drivers/event/cnxk/cn10k_eventdev.c
> +++ b/drivers/event/cnxk/cn10k_eventdev.c
> @@ -782,7 +782,8 @@ cn10k_sso_tx_adapter_caps_get(const struct rte_eventdev *dev,
> if (ret)
> *caps = 0;
> else
> - *caps = RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT;
> + *caps = RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT |
> + RTE_EVENT_ETH_TX_ADAPTER_CAP_EVENT_VECTOR;
>
> return 0;
> }
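
With the extra capability bit, applications can probe for Tx-side vector
support through the standard adapter API before enqueuing
RTE_EVENT_TYPE_VECTOR events. A minimal sketch, assuming dev_id and port_id
name an already-initialized eventdev and ethdev:

	uint32_t caps = 0;

	if (rte_event_eth_tx_adapter_caps_get(dev_id, port_id, &caps) == 0 &&
	    (caps & RTE_EVENT_ETH_TX_ADAPTER_CAP_EVENT_VECTOR)) {
		/* Vector events may be enqueued towards this ethdev. */
	}
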
> diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
> index 7a48a6b17d..9cc0992063 100644
> --- a/drivers/event/cnxk/cn10k_worker.h
> +++ b/drivers/event/cnxk/cn10k_worker.h
> @@ -308,29 +308,120 @@ uint16_t __rte_hot cn10k_sso_hws_enq_fwd_burst(void *port,
> NIX_RX_FASTPATH_MODES
> #undef R
>
> -static __rte_always_inline const struct cn10k_eth_txq *
> +static __rte_always_inline struct cn10k_eth_txq *
> cn10k_sso_hws_xtract_meta(struct rte_mbuf *m,
> const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
> {
> - return (const struct cn10k_eth_txq *)
> + return (struct cn10k_eth_txq *)
> txq_data[m->port][rte_event_eth_tx_adapter_txq_get(m)];
> }
>
> +static __rte_always_inline void
> +cn10k_sso_vwqe_split_tx(struct rte_mbuf **mbufs, uint16_t nb_mbufs,
> + uint64_t *cmd, uint16_t lmt_id, uintptr_t lmt_addr,
> + uint8_t sched_type, uintptr_t base,
> + const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
> + const uint32_t flags)
> +{
> + uint16_t port[4], queue[4];
> + struct cn10k_eth_txq *txq;
> + uint16_t i, j;
> + uintptr_t pa;
> +
> + for (i = 0; i < nb_mbufs; i += 4) {
> + port[0] = mbufs[i]->port;
> + port[1] = mbufs[i + 1]->port;
> + port[2] = mbufs[i + 2]->port;
> + port[3] = mbufs[i + 3]->port;
> +
> + queue[0] = rte_event_eth_tx_adapter_txq_get(mbufs[i]);
> + queue[1] = rte_event_eth_tx_adapter_txq_get(mbufs[i + 1]);
> + queue[2] = rte_event_eth_tx_adapter_txq_get(mbufs[i + 2]);
> + queue[3] = rte_event_eth_tx_adapter_txq_get(mbufs[i + 3]);
> +
> + if (((port[0] ^ port[1]) & (port[2] ^ port[3])) ||
> + ((queue[0] ^ queue[1]) & (queue[2] ^ queue[3]))) {
> +
> + for (j = 0; j < 4; j++) {
> + struct rte_mbuf *m = mbufs[i + j];
> +
> + txq = (struct cn10k_eth_txq *)
> + txq_data[port[j]][queue[j]];
> + cn10k_nix_tx_skeleton(txq, cmd, flags);
> + /* Perform header writes before barrier
> + * for TSO
> + */
> + if (flags & NIX_TX_OFFLOAD_TSO_F)
> + cn10k_nix_xmit_prepare_tso(m, flags);
> +
> + cn10k_nix_xmit_prepare(m, cmd, lmt_addr, flags,
> + txq->lso_tun_fmt);
> + if (flags & NIX_TX_MULTI_SEG_F) {
> + const uint16_t segdw =
> + cn10k_nix_prepare_mseg(
> + m, (uint64_t *)lmt_addr,
> + flags);
> + pa = txq->io_addr | ((segdw - 1) << 4);
> + } else {
> + pa = txq->io_addr |
> + (cn10k_nix_tx_ext_subs(flags) + 1)
> + << 4;
> + }
> + if (!sched_type)
> + roc_sso_hws_head_wait(base +
> + SSOW_LF_GWS_TAG);
> +
> + roc_lmt_submit_steorl(lmt_id, pa);
> + }
> + } else {
> + txq = (struct cn10k_eth_txq *)
> + txq_data[port[0]][queue[0]];
> + cn10k_nix_xmit_pkts_vector(txq, &mbufs[i], 4, cmd, base
> + + SSOW_LF_GWS_TAG,
> + flags | NIX_TX_VWQE_F);
> + }
> + }
> +}
> +
> static __rte_always_inline uint16_t
> cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
> uint64_t *cmd,
> const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT],
> const uint32_t flags)
> {
> - const struct cn10k_eth_txq *txq;
> - struct rte_mbuf *m = ev->mbuf;
> - uint16_t ref_cnt = m->refcnt;
> + struct cn10k_eth_txq *txq;
> + struct rte_mbuf *m;
> uintptr_t lmt_addr;
> + uint16_t ref_cnt;
> uint16_t lmt_id;
> uintptr_t pa;
>
> lmt_addr = ws->lmt_base;
> ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
> +
> + if (ev->event_type & RTE_EVENT_TYPE_VECTOR) {
> + struct rte_mbuf **mbufs = ev->vec->mbufs;
> + uint64_t meta = *(uint64_t *)ev->vec;
> +
> + if (meta & BIT(31)) {
> + txq = (struct cn10k_eth_txq *)
> + txq_data[meta >> 32][meta >> 48];
> +
> + cn10k_nix_xmit_pkts_vector(
> + txq, mbufs, meta & 0xFFFF, cmd,
> + ws->tx_base + SSOW_LF_GWS_TAG,
> + flags | NIX_TX_VWQE_F);
> + } else {
> + cn10k_sso_vwqe_split_tx(
> + mbufs, meta & 0xFFFF, cmd, lmt_id, lmt_addr,
> + ev->sched_type, ws->tx_base, txq_data, flags);
> + }
> + rte_mempool_put(rte_mempool_from_obj(ev->vec), ev->vec);
> + return (meta & 0xFFFF);
> + }
> +
> + m = ev->mbuf;
> + ref_cnt = m->refcnt;
> txq = cn10k_sso_hws_xtract_meta(m, txq_data);
> cn10k_nix_tx_skeleton(txq, cmd, flags);
> /* Perform header writes before barrier for TSO */
> @@ -346,7 +437,7 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
> pa = txq->io_addr | (cn10k_nix_tx_ext_subs(flags) + 1) << 4;
> }
> if (!ev->sched_type)
> - cnxk_sso_hws_head_wait(ws->tx_base + SSOW_LF_GWS_TAG);
> + roc_sso_hws_head_wait(ws->tx_base + SSOW_LF_GWS_TAG);
>
> roc_lmt_submit_steorl(lmt_id, pa);
>
> @@ -357,7 +448,6 @@ cn10k_sso_hws_event_tx(struct cn10k_sso_hws *ws, struct rte_event *ev,
>
> cnxk_sso_hws_swtag_flush(ws->tx_base + SSOW_LF_GWS_TAG,
> ws->tx_base + SSOW_LF_GWS_OP_SWTAG_FLUSH);
> -
> return 1;
> }
>
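cn10k_sso_hws_event_tx() above loads the first 64 bits of the
rte_event_vector as a single meta word instead of reading the header fields
one by one. On the little-endian targets this driver supports, the bits line
up with the vector header as follows (a field-wise sketch of the same
decode, per the rte_event_vector layout this series builds on):

	uint64_t meta = *(uint64_t *)ev->vec;

	uint16_t nb_elem = meta & 0xFFFF;	/* vec->nb_elem */
	bool attr_valid = meta & BIT(31);	/* vec->attr_valid */
	uint16_t port = (meta >> 32) & 0xFFFF;	/* vec->port, if valid */
	uint16_t queue = meta >> 48;		/* vec->queue, if valid */

When attr_valid is set, the whole vector targets one (port, queue) pair and
is handed to cn10k_nix_xmit_pkts_vector() in a single call; otherwise
cn10k_sso_vwqe_split_tx() walks the vector in groups of four mbufs and falls
back to per-packet submission whenever a group does not share a single
destination.
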
> diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
> index 3f9751211a..cc1e141957 100644
> --- a/drivers/event/cnxk/cn9k_worker.h
> +++ b/drivers/event/cnxk/cn9k_worker.h
> @@ -466,7 +466,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
> const uint16_t segdw = cn9k_nix_prepare_mseg(m, cmd, flags);
> if (!CNXK_TT_FROM_EVENT(ev->event)) {
> cn9k_nix_xmit_mseg_prep_lmt(cmd, txq->lmt_addr, segdw);
> - cnxk_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
> + roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
> cn9k_sso_txq_fc_wait(txq);
> if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
> cn9k_nix_xmit_mseg_one(cmd, txq->lmt_addr,
> @@ -478,7 +478,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
> } else {
> if (!CNXK_TT_FROM_EVENT(ev->event)) {
> cn9k_nix_xmit_prep_lmt(cmd, txq->lmt_addr, flags);
> - cnxk_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
> + roc_sso_hws_head_wait(base + SSOW_LF_GWS_TAG);
> cn9k_sso_txq_fc_wait(txq);
> if (cn9k_nix_xmit_submit_lmt(txq->io_addr) == 0)
> cn9k_nix_xmit_one(cmd, txq->lmt_addr,
> diff --git a/drivers/event/cnxk/cnxk_worker.h b/drivers/event/cnxk/cnxk_worker.h
> index 7891b749df..9f9ceab8a1 100644
> --- a/drivers/event/cnxk/cnxk_worker.h
> +++ b/drivers/event/cnxk/cnxk_worker.h
> @@ -75,26 +75,4 @@ cnxk_sso_hws_swtag_wait(uintptr_t tag_op)
> #endif
> }
>
> -static __rte_always_inline void
> -cnxk_sso_hws_head_wait(uintptr_t tag_op)
> -{
> -#ifdef RTE_ARCH_ARM64
> - uint64_t tag;
> -
> - asm volatile(" ldr %[tag], [%[tag_op]] \n"
> - " tbnz %[tag], 35, done%= \n"
> - " sevl \n"
> - "rty%=: wfe \n"
> - " ldr %[tag], [%[tag_op]] \n"
> - " tbz %[tag], 35, rty%= \n"
> - "done%=: \n"
> - : [tag] "=&r"(tag)
> - : [tag_op] "r"(tag_op));
> -#else
> - /* Wait for the HEAD to be set */
> - while (!(plt_read64(tag_op) & BIT_ULL(35)))
> - ;
> -#endif
> -}
> -
> #endif
> diff --git a/drivers/net/cnxk/cn10k_tx.c b/drivers/net/cnxk/cn10k_tx.c
> index 1f30bab59a..0e1276c60b 100644
> --- a/drivers/net/cnxk/cn10k_tx.c
> +++ b/drivers/net/cnxk/cn10k_tx.c
> @@ -16,7 +16,7 @@
> !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F)) \
> return 0; \
> return cn10k_nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd, \
> - flags); \
> + 0, flags); \
> }
>
> NIX_TX_FASTPATH_MODES
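
The extra 0 is the new base argument: the ethdev-facing templates never set
NIX_TX_VWQE_F, so the burst routine keeps its flow-control path and base is
never dereferenced. A sketch of the shape of one generated function under
that assumption (hypothetical name and array size; the real body comes from
the T() macro above):

	uint16_t
	cn10k_nix_xmit_pkts_sketch(void *tx_queue, struct rte_mbuf **tx_pkts,
				   uint16_t pkts)
	{
		uint64_t cmd[16];	/* skeleton sized by the template */

		/* base = 0 is ignored: NIX_TX_VWQE_F is not in flags */
		return cn10k_nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd, 0, 0);
	}
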
> diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
> index eb148b8e77..f75cae07ae 100644
> --- a/drivers/net/cnxk/cn10k_tx.h
> +++ b/drivers/net/cnxk/cn10k_tx.h
> @@ -18,6 +18,7 @@
> * Defining it from backwards to denote its been
> * not used as offload flags to pick function
> */
> +#define NIX_TX_VWQE_F BIT(14)
> #define NIX_TX_MULTI_SEG_F BIT(15)
>
> #define NIX_TX_NEED_SEND_HDR_W1 \
> @@ -519,7 +520,7 @@ cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
>
> static __rte_always_inline uint16_t
> cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
> - uint64_t *cmd, const uint16_t flags)
> + uint64_t *cmd, uintptr_t base, const uint16_t flags)
> {
> struct cn10k_eth_txq *txq = tx_queue;
> const rte_iova_t io_addr = txq->io_addr;
> @@ -528,14 +529,15 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
> uint64_t lso_tun_fmt;
> uint64_t data;
>
> - NIX_XMIT_FC_OR_RETURN(txq, pkts);
> + if (!(flags & NIX_TX_VWQE_F)) {
> + NIX_XMIT_FC_OR_RETURN(txq, pkts);
> + /* Reduce the cached count */
> + txq->fc_cache_pkts -= pkts;
> + }
>
> /* Get cmd skeleton */
> cn10k_nix_tx_skeleton(txq, cmd, flags);
>
> - /* Reduce the cached count */
> - txq->fc_cache_pkts -= pkts;
> -
> if (flags & NIX_TX_OFFLOAD_TSO_F)
> lso_tun_fmt = txq->lso_tun_fmt;
>
> @@ -558,6 +560,9 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
> lmt_addr += (1ULL << ROC_LMT_LINE_SIZE_LOG2);
> }
>
> + if (flags & NIX_TX_VWQE_F)
> + roc_sso_hws_head_wait(base);
> +
> /* Trigger LMTST */
> if (burst > 16) {
> data = cn10k_nix_tx_steor_data(flags);
> @@ -604,7 +609,8 @@ cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
>
> static __rte_always_inline uint16_t
> cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
> - uint16_t pkts, uint64_t *cmd, const uint16_t flags)
> + uint16_t pkts, uint64_t *cmd, uintptr_t base,
> + const uint16_t flags)
> {
> struct cn10k_eth_txq *txq = tx_queue;
> uintptr_t pa0, pa1, lmt_addr = txq->lmt_base;
> @@ -652,6 +658,9 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
> shft += 3;
> }
>
> + if (flags & NIX_TX_VWQE_F)
> + roc_sso_hws_head_wait(base);
> +
> data0 = (uint64_t)data128;
> data1 = (uint64_t)(data128 >> 64);
> /* Make data0 similar to data1 */
> @@ -984,7 +993,8 @@ cn10k_nix_prep_lmt_mseg_vector(struct rte_mbuf **mbufs, uint64x2_t *cmd0,
>
> static __rte_always_inline uint16_t
> cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
> - uint16_t pkts, uint64_t *cmd, const uint16_t flags)
> + uint16_t pkts, uint64_t *cmd, uintptr_t base,
> + const uint16_t flags)
> {
> uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
> uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
> @@ -1013,13 +1023,17 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
> uint64_t data[2];
> } wd;
>
> - NIX_XMIT_FC_OR_RETURN(txq, pkts);
> -
> - scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
> - pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
> + if (!(flags & NIX_TX_VWQE_F)) {
> + NIX_XMIT_FC_OR_RETURN(txq, pkts);
> + scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
> + pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
> + /* Reduce the cached count */
> + txq->fc_cache_pkts -= pkts;
> + } else {
> + scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
> + pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
> + }
>
> - /* Reduce the cached count */
> - txq->fc_cache_pkts -= pkts;
> /* Perform header writes before barrier for TSO */
> if (flags & NIX_TX_OFFLOAD_TSO_F) {
> for (i = 0; i < pkts; i++)
> @@ -1973,6 +1987,9 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
> if (flags & NIX_TX_MULTI_SEG_F)
> wd.data[0] >>= 16;
>
> + if (flags & NIX_TX_VWQE_F)
> + roc_sso_hws_head_wait(base);
> +
> /* Trigger LMTST */
> if (lnum > 16) {
> if (!(flags & NIX_TX_MULTI_SEG_F))
> @@ -2029,10 +2046,11 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
> if (unlikely(scalar)) {
> if (flags & NIX_TX_MULTI_SEG_F)
> pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts,
> - scalar, cmd, flags);
> + scalar, cmd, base,
> + flags);
> else
> pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar,
> - cmd, flags);
> + cmd, base, flags);
> }
>
> return pkts;
> @@ -2041,13 +2059,15 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
> #else
> static __rte_always_inline uint16_t
> cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
> - uint16_t pkts, uint64_t *cmd, const uint16_t flags)
> + uint16_t pkts, uint64_t *cmd, uintptr_t base,
> + const uint16_t flags)
> {
> RTE_SET_USED(tx_queue);
> RTE_SET_USED(tx_pkts);
> RTE_SET_USED(pkts);
> RTE_SET_USED(cmd);
> RTE_SET_USED(flags);
> + RTE_SET_USED(base);
> return 0;
> }
> #endif
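
Taken together, the cn10k_tx.h changes let the event path reuse the ethdev
burst code: NIX_TX_VWQE_F is carved out of the top of the flag space (next
to NIX_TX_MULTI_SEG_F) so it cannot collide with the offload bits used to
pick fastpath functions, and when it is set the routines skip the SQ
flow-control check and call roc_sso_hws_head_wait(base) right before the
LMTST trigger. A hedged sketch of the event-side call, as used in
cn10k_worker.h earlier in this patch:

	/* base is the HWS GWS tag address; the VWQE flag tells the burst
	 * routine it is driven by the Tx adapter, not rte_eth_tx_burst().
	 */
	cn10k_nix_xmit_pkts_vector(txq, mbufs, nb_mbufs, cmd,
				   tx_base + SSOW_LF_GWS_TAG,
				   flags | NIX_TX_VWQE_F);
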
> diff --git a/drivers/net/cnxk/cn10k_tx_mseg.c b/drivers/net/cnxk/cn10k_tx_mseg.c
> index 33f6754722..4ea4c8a4e5 100644
> --- a/drivers/net/cnxk/cn10k_tx_mseg.c
> +++ b/drivers/net/cnxk/cn10k_tx_mseg.c
> @@ -18,7 +18,8 @@
> !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F)) \
> return 0; \
> return cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd, \
> - (flags) | NIX_TX_MULTI_SEG_F); \
> + 0, (flags) \
> + | NIX_TX_MULTI_SEG_F); \
> }
>
> NIX_TX_FASTPATH_MODES
> diff --git a/drivers/net/cnxk/cn10k_tx_vec.c b/drivers/net/cnxk/cn10k_tx_vec.c
> index 34e3737501..a0350496ab 100644
> --- a/drivers/net/cnxk/cn10k_tx_vec.c
> +++ b/drivers/net/cnxk/cn10k_tx_vec.c
> @@ -18,7 +18,7 @@
> !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F)) \
> return 0; \
> return cn10k_nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd,\
> - (flags)); \
> + 0, (flags)); \
> }
>
> NIX_TX_FASTPATH_MODES
> diff --git a/drivers/net/cnxk/cn10k_tx_vec_mseg.c b/drivers/net/cnxk/cn10k_tx_vec_mseg.c
> index 1fad81dbad..7f98f79b97 100644
> --- a/drivers/net/cnxk/cn10k_tx_vec_mseg.c
> +++ b/drivers/net/cnxk/cn10k_tx_vec_mseg.c
> @@ -16,7 +16,7 @@
> !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F)) \
> return 0; \
> return cn10k_nix_xmit_pkts_vector( \
> - tx_queue, tx_pkts, pkts, cmd, \
> + tx_queue, tx_pkts, pkts, cmd, 0, \
> (flags) | NIX_TX_MULTI_SEG_F); \
> }
>
> --
> 2.17.1
>