[v2,3/3] event/cnxk: use WFE in Tx fc wait

Message ID 20230613092548.1315-3-pbhagavatula@marvell.com (mailing list archive)
State Accepted, archived
Delegated to: Jerin Jacob
Series [v2,1/3] event/cnxk: align TX queue buffer adjustment

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/loongarch-compilation success Compilation OK
ci/loongarch-unit-testing success Unit Testing PASS
ci/Intel-compilation success Compilation OK
ci/github-robot: build success github build: passed
ci/iol-mellanox-Performance success Performance Testing PASS
ci/intel-Testing success Testing PASS
ci/iol-aarch-unit-testing success Testing PASS
ci/iol-abi-testing success Testing PASS
ci/intel-Functional success Functional PASS
ci/iol-x86_64-compile-testing success Testing PASS
ci/iol-testing success Testing PASS
ci/iol-x86_64-unit-testing success Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-unit-testing fail Testing issues
ci/iol-intel-Functional success Functional Testing PASS

Commit Message

Pavan Nikhilesh Bhagavatula June 13, 2023, 9:25 a.m. UTC
  From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use WFE in the Tx path when waiting for space in the Tx queue.
Depending on the Tx queue contention and size, WFE will
reduce cache pressure and power consumption.
In multi-core scenarios we have observed up to 8 W of power reduction.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/cnxk/cn10k_tx_worker.h |  18 ++++
 drivers/net/cnxk/cn10k_tx.h          | 152 +++++++++++++++++++++++----
 2 files changed, 147 insertions(+), 23 deletions(-)
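
For readers unfamiliar with the idiom, here is a minimal annotated sketch of
the sevl/wfe/ldxr wait loop this series adds (a standalone illustration, not
part of the patch; the function and variable names are placeholders):

/* Wait until *fc_mem drops below threshold without spinning on the cache line. */
static inline void
wait_for_space(const uint64_t *fc_mem, uint64_t threshold)
{
#if defined(__aarch64__)
	uint64_t space;

	asm volatile(
		     /* ldxr arms the exclusive monitor: a store to *fc_mem by
		      * another core generates a wake-up event for wfe. */
		     "		ldxr %[space], [%[addr]]		\n"
		     "		cmp %[thr], %[space]			\n"
		     "		b.hi 1f					\n"
		     /* sevl sets the local event register so the first wfe
		      * falls through and the condition is re-checked once
		      * before the core actually waits. */
		     "		sevl					\n"
		     "0:	wfe					\n"
		     "		ldxr %[space], [%[addr]]		\n"
		     "		cmp %[thr], %[space]			\n"
		     "		b.ls 0b					\n"
		     "1:						\n"
		     : [space] "=&r"(space)
		     : [thr] "r"(threshold), [addr] "r"(fc_mem)
		     : "memory");
#else
	/* Portable fallback: relaxed-load spin, as in the non-ARM path. */
	while (threshold <= __atomic_load_n(fc_mem, __ATOMIC_RELAXED))
		;
#endif
}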
  

Comments

Jerin Jacob June 14, 2023, 10:33 a.m. UTC | #1
On Tue, Jun 13, 2023 at 2:56 PM <pbhagavatula@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Use WFE in the Tx path when waiting for space in the Tx queue.
> Depending on the Tx queue contention and size, WFE will
> reduce cache pressure and power consumption.
> In multi-core scenarios we have observed up to 8 W of power reduction.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>

Series Applied to dpdk-next-net-eventdev/for-main. Thanks

  
Patrick Robb June 14, 2023, 6:27 p.m. UTC | #2
Hello Pavan and Jerin,

The Community Lab's CI testing failed this patch series on the clang compile
for ARM systems. That wasn't properly reported to Patchwork due to issues with
our reporting process, which we are currently resolving. This updated report
shows the failed compile. Apologies for the incomplete initial report.

http://mails.dpdk.org/archives/test-report/2023-June/411303.html

  
Pavan Nikhilesh Bhagavatula June 14, 2023, 8:24 p.m. UTC | #3
Thanks Patrick, 

I have sent out a fix.
https://mails.dpdk.org/archives/dev/2023-June/271209.html

Pavan. 

  
Jerin Jacob Kollanukkaran June 15, 2023, 5:49 a.m. UTC | #4
Fix merged.
  
Stephen Hemminger June 15, 2023, 3:28 p.m. UTC | #5
On Tue, 13 Jun 2023 14:55:48 +0530
<pbhagavatula@marvell.com> wrote:

>  static __rte_always_inline void
>  cn10k_sso_txq_fc_wait(const struct cn10k_eth_txq *txq)
>  {
> +#ifdef RTE_ARCH_ARM64
> +	uint64_t space;
> +
> +	asm volatile(PLT_CPU_FEATURE_PREAMBLE
> +		     "		ldxr %[space], [%[addr]]		\n"
> +		     "		cmp %[adj], %[space] 			\n"
> +		     "		b.hi .Ldne%=				\n"
> +		     "		sevl					\n"
> +		     ".Lrty%=:	wfe					\n"
> +		     "		ldxr %[space], [%[addr]]		\n"
> +		     "		cmp %[adj], %[space]			\n"
> +		     "		b.ls .Lrty%=				\n"
> +		     ".Ldne%=:						\n"
> +		     : [space] "=&r"(space)
> +		     : [adj] "r"(txq->nb_sqb_bufs_adj), [addr] "r"(txq->fc_mem)
> +		     : "memory");
> +#else
>  	while ((uint64_t)txq->nb_sqb_bufs_adj <=
>  	       __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED))
>  		;
> +#endif
>  }

Rather than introduce assembly here, please extend existing rte_pause
functions and then other drivers could benefit and it would fit
existing WFE usage.

Something like:

static __rte_always_inline void
rte_wait_until_le_32(volatile uint32_t *addr, uint32_t expected,
		int memorder);

Direct assembly in drivers is hard to maintain and should be forbidden.
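
A rough sketch of the kind of helper that suggestion points at (hypothetical
code, not an existing DPDK API; it assumes only <rte_common.h>, <rte_pause.h>
and the GCC atomic builtins):

static __rte_always_inline void
rte_wait_until_le_32(volatile uint32_t *addr, uint32_t expected, int memorder)
{
	/* Generic fallback: pause between loads until *addr drops to or
	 * below 'expected'. An aarch64 specialization could reuse the
	 * sevl/wfe pattern from the patch instead of busy-polling. */
	while (__atomic_load_n(addr, memorder) > expected)
		rte_pause();
}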
  
Pavan Nikhilesh Bhagavatula June 16, 2023, 6:46 a.m. UTC | #6
> On Tue, 13 Jun 2023 14:55:48 +0530
> <pbhagavatula@marvell.com> wrote:
> 
> >  static __rte_always_inline void
> >  cn10k_sso_txq_fc_wait(const struct cn10k_eth_txq *txq)
> >  {
> > +#ifdef RTE_ARCH_ARM64
> > +	uint64_t space;
> > +
> > +	asm volatile(PLT_CPU_FEATURE_PREAMBLE
> > +		     "		ldxr %[space], [%[addr]]		\n"
> > +		     "		cmp %[adj], %[space] 			\n"
> > +		     "		b.hi .Ldne%=				\n"
> > +		     "		sevl					\n"
> > +		     ".Lrty%=:	wfe					\n"
> > +		     "		ldxr %[space], [%[addr]]		\n"
> > +		     "		cmp %[adj], %[space]			\n"
> > +		     "		b.ls .Lrty%=				\n"
> > +		     ".Ldne%=:						\n"
> > +		     : [space] "=&r"(space)
> > +		     : [adj] "r"(txq->nb_sqb_bufs_adj), [addr] "r"(txq->fc_mem)
> > +		     : "memory");
> > +#else
> >  	while ((uint64_t)txq->nb_sqb_bufs_adj <=
> >  	       __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED))
> >  		;
> > +#endif
> >  }
> 
> Rather than introduce assembly here, please extend existing rte_pause
> functions and then other drivers could benefit and it would fit
> existing WFE usage.
> 

Yes, a future patch is planned to replace this and other occurrences with
RTE_WAIT_UNTIL_MASKED.
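
For context, using that macro in cn10k_sso_txq_fc_wait() could look roughly
like this (a sketch assuming the RTE_WAIT_UNTIL_MASKED(addr, mask, cond,
expected, memorder) form from rte_pause.h; the txq fields are the ones from
the patch):

	/* Sketch only: block until *fc_mem falls below the adjusted SQB count. */
	RTE_WAIT_UNTIL_MASKED(txq->fc_mem, UINT64_MAX, <,
			      (uint64_t)txq->nb_sqb_bufs_adj, __ATOMIC_RELAXED);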

> Something like:
> 
> static __rte_always_inline void
> rte_wait_until_le_32(volatile uint32_t *addr, uint32_t expected,
> 		int memorder);
> 
> Direct assembly in drivers is hard to maintain and should be forbidden.
  

Patch

diff --git a/drivers/event/cnxk/cn10k_tx_worker.h b/drivers/event/cnxk/cn10k_tx_worker.h
index b6c9bb1d26..dea6cdcde2 100644
--- a/drivers/event/cnxk/cn10k_tx_worker.h
+++ b/drivers/event/cnxk/cn10k_tx_worker.h
@@ -24,9 +24,27 @@  cn10k_sso_hws_xtract_meta(struct rte_mbuf *m, const uint64_t *txq_data)
 static __rte_always_inline void
 cn10k_sso_txq_fc_wait(const struct cn10k_eth_txq *txq)
 {
+#ifdef RTE_ARCH_ARM64
+	uint64_t space;
+
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldxr %[space], [%[addr]]		\n"
+		     "		cmp %[adj], %[space] 			\n"
+		     "		b.hi .Ldne%=				\n"
+		     "		sevl					\n"
+		     ".Lrty%=:	wfe					\n"
+		     "		ldxr %[space], [%[addr]]		\n"
+		     "		cmp %[adj], %[space]			\n"
+		     "		b.ls .Lrty%=				\n"
+		     ".Ldne%=:						\n"
+		     : [space] "=&r"(space)
+		     : [adj] "r"(txq->nb_sqb_bufs_adj), [addr] "r"(txq->fc_mem)
+		     : "memory");
+#else
 	while ((uint64_t)txq->nb_sqb_bufs_adj <=
 	       __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED))
 		;
+#endif
 }
 
 static __rte_always_inline int32_t
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index a365cbe0ee..d0e8350ce2 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -102,27 +102,72 @@  cn10k_nix_tx_mbuf_validate(struct rte_mbuf *m, const uint32_t flags)
 }
 
 static __plt_always_inline void
-cn10k_nix_vwqe_wait_fc(struct cn10k_eth_txq *txq, int64_t req)
+cn10k_nix_vwqe_wait_fc(struct cn10k_eth_txq *txq, uint16_t req)
 {
 	int64_t cached, refill;
+	int64_t pkts;
 
 retry:
+#ifdef RTE_ARCH_ARM64
+
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldxr %[pkts], [%[addr]]			\n"
+		     "		tbz %[pkts], 63, .Ldne%=		\n"
+		     "		sevl					\n"
+		     ".Lrty%=:	wfe					\n"
+		     "		ldxr %[pkts], [%[addr]]			\n"
+		     "		tbnz %[pkts], 63, .Lrty%=		\n"
+		     ".Ldne%=:						\n"
+		     : [pkts] "=&r"(pkts)
+		     : [addr] "r"(&txq->fc_cache_pkts)
+		     : "memory");
+#else
+	RTE_SET_USED(pkts);
 	while (__atomic_load_n(&txq->fc_cache_pkts, __ATOMIC_RELAXED) < 0)
 		;
+#endif
 	cached = __atomic_fetch_sub(&txq->fc_cache_pkts, req, __ATOMIC_ACQUIRE) - req;
 	/* Check if there is enough space, else update and retry. */
-	if (cached < 0) {
-		/* Check if we have space else retry. */
-		do {
-			refill = txq->nb_sqb_bufs_adj -
-				 __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED);
-			refill = (refill << txq->sqes_per_sqb_log2) - refill;
-		} while (refill <= 0);
-		__atomic_compare_exchange(&txq->fc_cache_pkts, &cached, &refill,
-					  0, __ATOMIC_RELEASE,
-					  __ATOMIC_RELAXED);
+	if (cached >= 0)
+		return;
+
+	/* Check if we have space else retry. */
+#ifdef RTE_ARCH_ARM64
+	int64_t val;
+
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldxr %[val], [%[addr]]			\n"
+		     "		sub %[val], %[adj], %[val]		\n"
+		     "		lsl %[refill], %[val], %[shft]		\n"
+		     "		sub %[refill], %[refill], %[val]	\n"
+		     "		sub %[refill], %[refill], %[sub]	\n"
+		     "		cmp %[refill], #0x0			\n"
+		     "		b.ge .Ldne%=				\n"
+		     "		sevl					\n"
+		     ".Lrty%=:	wfe					\n"
+		     "		ldxr %[val], [%[addr]]			\n"
+		     "		sub %[val], %[adj], %[val]		\n"
+		     "		lsl %[refill], %[val], %[shft]		\n"
+		     "		sub %[refill], %[refill], %[val]	\n"
+		     "		sub %[refill], %[refill], %[sub]	\n"
+		     "		cmp %[refill], #0x0			\n"
+		     "		b.lt .Lrty%=				\n"
+		     ".Ldne%=:						\n"
+		     : [refill] "=&r"(refill), [val] "=&r" (val)
+		     : [addr] "r"(txq->fc_mem), [adj] "r"(txq->nb_sqb_bufs_adj),
+		       [shft] "r"(txq->sqes_per_sqb_log2), [sub] "r"(req)
+		     : "memory");
+#else
+	do {
+		refill = (txq->nb_sqb_bufs_adj - __atomic_load_n(txq->fc_mem, __ATOMIC_RELAXED));
+		refill = (refill << txq->sqes_per_sqb_log2) - refill;
+		refill -= req;
+	} while (refill < 0);
+#endif
+	if (!__atomic_compare_exchange(&txq->fc_cache_pkts, &cached, &refill,
+				  0, __ATOMIC_RELEASE,
+				  __ATOMIC_RELAXED))
 		goto retry;
-	}
 }
 
 /* Function to determine no of tx subdesc required in case ext
@@ -283,10 +328,27 @@  static __rte_always_inline void
 cn10k_nix_sec_fc_wait_one(struct cn10k_eth_txq *txq)
 {
 	uint64_t nb_desc = txq->cpt_desc;
-	uint64_t *fc = txq->cpt_fc;
-
-	while (nb_desc <= __atomic_load_n(fc, __ATOMIC_RELAXED))
+	uint64_t fc;
+
+#ifdef RTE_ARCH_ARM64
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldxr %[space], [%[addr]]		\n"
+		     "		cmp %[nb_desc], %[space]		\n"
+		     "		b.hi .Ldne%=				\n"
+		     "		sevl					\n"
+		     ".Lrty%=:	wfe					\n"
+		     "		ldxr %[space], [%[addr]]		\n"
+		     "		cmp %[nb_desc], %[space]		\n"
+		     "		b.ls .Lrty%=				\n"
+		     ".Ldne%=:						\n"
+		     : [space] "=&r"(fc)
+		     : [nb_desc] "r"(nb_desc), [addr] "r"(txq->cpt_fc)
+		     : "memory");
+#else
+	RTE_SET_USED(fc);
+	while (nb_desc <= __atomic_load_n(txq->cpt_fc, __ATOMIC_RELAXED))
 		;
+#endif
 }
 
 static __rte_always_inline void
@@ -294,7 +356,7 @@  cn10k_nix_sec_fc_wait(struct cn10k_eth_txq *txq, uint16_t nb_pkts)
 {
 	int32_t nb_desc, val, newval;
 	int32_t *fc_sw;
-	volatile uint64_t *fc;
+	uint64_t *fc;
 
 	/* Check if there is any CPT instruction to submit */
 	if (!nb_pkts)
@@ -302,21 +364,59 @@  cn10k_nix_sec_fc_wait(struct cn10k_eth_txq *txq, uint16_t nb_pkts)
 
 again:
 	fc_sw = txq->cpt_fc_sw;
-	val = __atomic_fetch_sub(fc_sw, nb_pkts, __ATOMIC_RELAXED) - nb_pkts;
+#ifdef RTE_ARCH_ARM64
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldxr %w[pkts], [%[addr]]		\n"
+		     "		tbz %w[pkts], 31, .Ldne%=		\n"
+		     "		sevl					\n"
+		     ".Lrty%=:	wfe					\n"
+		     "		ldxr %w[pkts], [%[addr]]		\n"
+		     "		tbnz %w[pkts], 31, .Lrty%=		\n"
+		     ".Ldne%=:						\n"
+		     : [pkts] "=&r"(val)
+		     : [addr] "r"(fc_sw)
+		     : "memory");
+#else
+	/* Wait for primary core to refill FC. */
+	while (__atomic_load_n(fc_sw, __ATOMIC_RELAXED) < 0)
+		;
+#endif
+
+	val = __atomic_fetch_sub(fc_sw, nb_pkts, __ATOMIC_ACQUIRE) - nb_pkts;
 	if (likely(val >= 0))
 		return;
 
 	nb_desc = txq->cpt_desc;
 	fc = txq->cpt_fc;
+#ifdef RTE_ARCH_ARM64
+	asm volatile(PLT_CPU_FEATURE_PREAMBLE
+		     "		ldxr %[refill], [%[addr]]		\n"
+		     "		sub %[refill], %[desc], %[refill]	\n"
+		     "		sub %[refill], %[refill], %[pkts]	\n"
+		     "		cmp %[refill], #0x0			\n"
+		     "		b.ge .Ldne%=				\n"
+		     "		sevl					\n"
+		     ".Lrty%=:	wfe					\n"
+		     "		ldxr %[refill], [%[addr]]		\n"
+		     "		sub %[refill], %[desc], %[refill]	\n"
+		     "		sub %[refill], %[refill], %[pkts]	\n"
+		     "		cmp %[refill], #0x0			\n"
+		     "		b.lt .Lrty%=				\n"
+		     ".Ldne%=:						\n"
+		     : [refill] "=&r"(newval)
+		     : [addr] "r"(fc), [desc] "r"(nb_desc), [pkts] "r"(nb_pkts)
+		     : "memory");
+#else
 	while (true) {
 		newval = nb_desc - __atomic_load_n(fc, __ATOMIC_RELAXED);
 		newval -= nb_pkts;
 		if (newval >= 0)
 			break;
 	}
+#endif
 
-	if (!__atomic_compare_exchange_n(fc_sw, &val, newval, false,
-					 __ATOMIC_RELAXED, __ATOMIC_RELAXED))
+	if (!__atomic_compare_exchange_n(fc_sw, &val, newval, false, __ATOMIC_RELEASE,
+					 __ATOMIC_RELAXED))
 		goto again;
 }
 
@@ -3033,10 +3133,16 @@  cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
 		wd.data[1] |= ((uint64_t)(lnum - 17)) << 12;
 		wd.data[1] |= (uint64_t)(lmt_id + 16);
 
-		if (flags & NIX_TX_VWQE_F)
-			cn10k_nix_vwqe_wait_fc(txq,
-				burst - (cn10k_nix_pkts_per_vec_brst(flags) >>
-					 1));
+		if (flags & NIX_TX_VWQE_F) {
+			if (flags & NIX_TX_MULTI_SEG_F) {
+				if (burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1) > 0)
+					cn10k_nix_vwqe_wait_fc(txq,
+						burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1));
+			} else {
+				cn10k_nix_vwqe_wait_fc(txq,
+						burst - (cn10k_nix_pkts_per_vec_brst(flags) >> 1));
+			}
+		}
 		/* STEOR1 */
 		roc_lmt_submit_steorl(wd.data[1], pa);
 	} else if (lnum) {