[2/4] event/octeontx2: improve single flow performance

Message ID 1600196207-31258-2-git-send-email-hkalra@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Jerin Jacob
Headers
Series [1/4] event/octeontx2: add switch tag flush op |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Harman Kalra Sept. 15, 2020, 6:56 p.m. UTC
  From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Improve single flow performance by moving the point of coherence
to the end of transmit sequence.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 drivers/event/octeontx2/otx2_worker.h | 35 +++++++++++++++++----------
 drivers/net/octeontx2/otx2_tx.h       | 18 ++++++++++++++
 2 files changed, 40 insertions(+), 13 deletions(-)
  

Comments

Jerin Jacob Oct. 5, 2020, 9:29 a.m. UTC | #1
On Wed, Sep 16, 2020 at 12:27 AM Harman Kalra <hkalra@marvell.com> wrote:
>
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Improve single flow performance by moving the point of coherence
> to the end of transmit sequence.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
>  drivers/event/octeontx2/otx2_worker.h | 35 +++++++++++++++++----------
>  drivers/net/octeontx2/otx2_tx.h       | 18 ++++++++++++++
>  2 files changed, 40 insertions(+), 13 deletions(-)

This patch failed to apply[1] on dpdk-next-eventdev.
Could you rebase it onto the dpdk-next-eventdev tree and send an
updated version?

[1]
[for-main][dpdk-next-eventdev] $ git am -3
/tmp/r/2-4-event-octeontx2-improve-single-flow-performance
Applying: event/octeontx2: improve single flow performance
error: sha1 information is lacking or useless
(drivers/event/octeontx2/otx2_worker.h).
error: could not build fake ancestor
Patch failed at 0001 event/octeontx2: improve single flow performance
hint: Use 'git am --show-current-patch=diff' to see the failed patch
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort"

>
> diff --git a/drivers/event/octeontx2/otx2_worker.h b/drivers/event/octeontx2/otx2_worker.h
> index 1bf8afedf..32d611458 100644
> --- a/drivers/event/octeontx2/otx2_worker.h
> +++ b/drivers/event/octeontx2/otx2_worker.h
> @@ -247,15 +247,6 @@ otx2_ssogws_head_wait(struct otx2_ssogws *ws)
>  #endif
>  }
>
> -static __rte_always_inline void
> -otx2_ssogws_order(struct otx2_ssogws *ws, const uint8_t wait_flag)
> -{
> -       if (wait_flag)
> -               otx2_ssogws_head_wait(ws);
> -
> -       rte_cio_wmb();
> -}
> -
>  static __rte_always_inline const struct otx2_eth_txq *
>  otx2_ssogws_xtract_meta(struct rte_mbuf *m,
>                         const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
> @@ -287,10 +278,9 @@ otx2_ssogws_event_tx(struct otx2_ssogws *ws, struct rte_event ev[],
>                 return otx2_sec_event_tx(ws, ev, m, txq, flags);
>         }
>
> -       rte_prefetch_non_temporal(&txq_data[m->port][0]);
>         /* Perform header writes before barrier for TSO */
>         otx2_nix_xmit_prepare_tso(m, flags);
> -       otx2_ssogws_order(ws, !ev->sched_type);
> +       rte_cio_wmb();
>         txq = otx2_ssogws_xtract_meta(m, txq_data);
>         otx2_ssogws_prepare_pkt(txq, m, cmd, flags);
>
> @@ -298,12 +288,31 @@ otx2_ssogws_event_tx(struct otx2_ssogws *ws, struct rte_event ev[],
>                 const uint16_t segdw = otx2_nix_prepare_mseg(m, cmd, flags);
>                 otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
>                                              m->ol_flags, segdw, flags);
> -               otx2_nix_xmit_mseg_one(cmd, txq->lmt_addr, txq->io_addr, segdw);
> +               if (!ev->sched_type) {
> +                       otx2_nix_xmit_mseg_prep_lmt(cmd, txq->lmt_addr, segdw);
> +                       otx2_ssogws_head_wait(ws);
> +                       if (otx2_nix_xmit_submit_lmt(txq->io_addr) == 0)
> +                               otx2_nix_xmit_mseg_one(cmd, txq->lmt_addr,
> +                                                      txq->io_addr, segdw);
> +               } else {
> +                       otx2_nix_xmit_mseg_one(cmd, txq->lmt_addr, txq->io_addr,
> +                                              segdw);
> +               }
>         } else {
>                 /* Passing no of segdw as 4: HDR + EXT + SG + SMEM */
>                 otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
>                                              m->ol_flags, 4, flags);
> -               otx2_nix_xmit_one(cmd, txq->lmt_addr, txq->io_addr, flags);
> +
> +               if (!ev->sched_type) {
> +                       otx2_nix_xmit_prep_lmt(cmd, txq->lmt_addr, flags);
> +                       otx2_ssogws_head_wait(ws);
> +                       if (otx2_nix_xmit_submit_lmt(txq->io_addr) == 0)
> +                               otx2_nix_xmit_one(cmd, txq->lmt_addr,
> +                                                 txq->io_addr, flags);
> +               } else {
> +                       otx2_nix_xmit_one(cmd, txq->lmt_addr, txq->io_addr,
> +                                         flags);
> +               }
>         }
>
>         otx2_write64(0, ws->swtag_flush_op);
> diff --git a/drivers/net/octeontx2/otx2_tx.h b/drivers/net/octeontx2/otx2_tx.h
> index 3c4317092..caf170fd1 100644
> --- a/drivers/net/octeontx2/otx2_tx.h
> +++ b/drivers/net/octeontx2/otx2_tx.h
> @@ -383,6 +383,18 @@ otx2_nix_xmit_one(uint64_t *cmd, void *lmt_addr,
>         } while (lmt_status == 0);
>  }
>
> +static __rte_always_inline void
> +otx2_nix_xmit_prep_lmt(uint64_t *cmd, void *lmt_addr, const uint32_t flags)
> +{
> +       otx2_lmt_mov(lmt_addr, cmd, otx2_nix_tx_ext_subs(flags));
> +}
> +
> +static __rte_always_inline uint64_t
> +otx2_nix_xmit_submit_lmt(const rte_iova_t io_addr)
> +{
> +       return otx2_lmt_submit(io_addr);
> +}
> +
>  static __rte_always_inline uint16_t
>  otx2_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
>  {
> @@ -453,6 +465,12 @@ otx2_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
>         return segdw;
>  }
>
> +static __rte_always_inline void
> +otx2_nix_xmit_mseg_prep_lmt(uint64_t *cmd, void *lmt_addr, uint16_t segdw)
> +{
> +       otx2_lmt_mov_seg(lmt_addr, (const void *)cmd, segdw);
> +}
> +
>  static __rte_always_inline void
>  otx2_nix_xmit_mseg_one(uint64_t *cmd, void *lmt_addr,
>                        rte_iova_t io_addr, uint16_t segdw)
> --
> 2.18.0
>
  

Patch

diff --git a/drivers/event/octeontx2/otx2_worker.h b/drivers/event/octeontx2/otx2_worker.h
index 1bf8afedf..32d611458 100644
--- a/drivers/event/octeontx2/otx2_worker.h
+++ b/drivers/event/octeontx2/otx2_worker.h
@@ -247,15 +247,6 @@  otx2_ssogws_head_wait(struct otx2_ssogws *ws)
 #endif
 }
 
-static __rte_always_inline void
-otx2_ssogws_order(struct otx2_ssogws *ws, const uint8_t wait_flag)
-{
-	if (wait_flag)
-		otx2_ssogws_head_wait(ws);
-
-	rte_cio_wmb();
-}
-
 static __rte_always_inline const struct otx2_eth_txq *
 otx2_ssogws_xtract_meta(struct rte_mbuf *m,
 			const uint64_t txq_data[][RTE_MAX_QUEUES_PER_PORT])
@@ -287,10 +278,9 @@  otx2_ssogws_event_tx(struct otx2_ssogws *ws, struct rte_event ev[],
 		return otx2_sec_event_tx(ws, ev, m, txq, flags);
 	}
 
-	rte_prefetch_non_temporal(&txq_data[m->port][0]);
 	/* Perform header writes before barrier for TSO */
 	otx2_nix_xmit_prepare_tso(m, flags);
-	otx2_ssogws_order(ws, !ev->sched_type);
+	rte_cio_wmb();
 	txq = otx2_ssogws_xtract_meta(m, txq_data);
 	otx2_ssogws_prepare_pkt(txq, m, cmd, flags);
 
@@ -298,12 +288,31 @@  otx2_ssogws_event_tx(struct otx2_ssogws *ws, struct rte_event ev[],
 		const uint16_t segdw = otx2_nix_prepare_mseg(m, cmd, flags);
 		otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
 					     m->ol_flags, segdw, flags);
-		otx2_nix_xmit_mseg_one(cmd, txq->lmt_addr, txq->io_addr, segdw);
+		if (!ev->sched_type) {
+			otx2_nix_xmit_mseg_prep_lmt(cmd, txq->lmt_addr, segdw);
+			otx2_ssogws_head_wait(ws);
+			if (otx2_nix_xmit_submit_lmt(txq->io_addr) == 0)
+				otx2_nix_xmit_mseg_one(cmd, txq->lmt_addr,
+						       txq->io_addr, segdw);
+		} else {
+			otx2_nix_xmit_mseg_one(cmd, txq->lmt_addr, txq->io_addr,
+					       segdw);
+		}
 	} else {
 		/* Passing no of segdw as 4: HDR + EXT + SG + SMEM */
 		otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
 					     m->ol_flags, 4, flags);
-		otx2_nix_xmit_one(cmd, txq->lmt_addr, txq->io_addr, flags);
+
+		if (!ev->sched_type) {
+			otx2_nix_xmit_prep_lmt(cmd, txq->lmt_addr, flags);
+			otx2_ssogws_head_wait(ws);
+			if (otx2_nix_xmit_submit_lmt(txq->io_addr) == 0)
+				otx2_nix_xmit_one(cmd, txq->lmt_addr,
+						  txq->io_addr, flags);
+		} else {
+			otx2_nix_xmit_one(cmd, txq->lmt_addr, txq->io_addr,
+					  flags);
+		}
 	}
 
 	otx2_write64(0, ws->swtag_flush_op);
diff --git a/drivers/net/octeontx2/otx2_tx.h b/drivers/net/octeontx2/otx2_tx.h
index 3c4317092..caf170fd1 100644
--- a/drivers/net/octeontx2/otx2_tx.h
+++ b/drivers/net/octeontx2/otx2_tx.h
@@ -383,6 +383,18 @@  otx2_nix_xmit_one(uint64_t *cmd, void *lmt_addr,
 	} while (lmt_status == 0);
 }
 
+static __rte_always_inline void
+otx2_nix_xmit_prep_lmt(uint64_t *cmd, void *lmt_addr, const uint32_t flags)
+{
+	otx2_lmt_mov(lmt_addr, cmd, otx2_nix_tx_ext_subs(flags));
+}
+
+static __rte_always_inline uint64_t
+otx2_nix_xmit_submit_lmt(const rte_iova_t io_addr)
+{
+	return otx2_lmt_submit(io_addr);
+}
+
 static __rte_always_inline uint16_t
 otx2_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 {
@@ -453,6 +465,12 @@  otx2_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
 	return segdw;
 }
 
+static __rte_always_inline void
+otx2_nix_xmit_mseg_prep_lmt(uint64_t *cmd, void *lmt_addr, uint16_t segdw)
+{
+	otx2_lmt_mov_seg(lmt_addr, (const void *)cmd, segdw);
+}
+
 static __rte_always_inline void
 otx2_nix_xmit_mseg_one(uint64_t *cmd, void *lmt_addr,
 		       rte_iova_t io_addr, uint16_t segdw)