[4/4] examples/l3fwd: make data struct to be memory efficient

Message ID 20210318102550.59265-5-ruifeng.wang@arm.com (mailing list archive)
State Superseded, archived
Delegated to: David Marchand
Headers
Series l3fwd improvements |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/travis-robot success travis build: passed
ci/github-robot success github build: passed
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-testing success Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS

Commit Message

Ruifeng Wang March 18, 2021, 10:25 a.m. UTC
  There are some holes in the data struct lcore_conf. The holes are
due to alignment requirements.

For struct lcore_rx_queue, there is no need to make every element
of this type cache line aligned, because the data is not
shared between cores.

Member len of struct mbuf_table can be moved out. So the data can be
packed and there will be no need to load an extra cache line when
the mbuf table is empty.

The change showed slight performance improvement on N1SDP platform.

Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>

Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
---
 examples/l3fwd/l3fwd.h        | 12 ++++++------
 examples/l3fwd/l3fwd_common.h |  4 ++--
 examples/l3fwd/l3fwd_em.c     |  6 +++---
 examples/l3fwd/l3fwd_lpm.c    |  6 +++---
 4 files changed, 14 insertions(+), 14 deletions(-)
  

Comments

Jerin Jacob April 13, 2021, 7:06 p.m. UTC | #1
On Thu, Mar 18, 2021 at 3:56 PM Ruifeng Wang <ruifeng.wang@arm.com> wrote:
>
> There are some holes in data struct lcore_conf. The holes are
> due to alignment requirement.
>
> For struct lcore_rx_queue, there is no need to make every element
> of this type to be cache line aligned, because the data is not
> shared between cores.
>
> Member len of struct mbuf_table can be moved out. So data can be
> packed and there will be no need to load an extra cache line when
> mbuf table is empty.
>
> The change showed slight performance improvement on N1SDP platform.
>
> Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
>
> Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>

This change alone is OK on the octeontx2 platform (no difference in performance).
Combining it with 3/4 shows some regression, probably due to prefetch
or 128B cache line tuning specifics.


> ---
>  examples/l3fwd/l3fwd.h        | 12 ++++++------
>  examples/l3fwd/l3fwd_common.h |  4 ++--
>  examples/l3fwd/l3fwd_em.c     |  6 +++---
>  examples/l3fwd/l3fwd_lpm.c    |  6 +++---
>  4 files changed, 14 insertions(+), 14 deletions(-)
>
> diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
> index 2cf06099e..f3a301e12 100644
> --- a/examples/l3fwd/l3fwd.h
> +++ b/examples/l3fwd/l3fwd.h
> @@ -57,22 +57,22 @@
>  #define HASH_ENTRY_NUMBER_DEFAULT      4
>
>  struct mbuf_table {
> -       uint16_t len;
>         struct rte_mbuf *m_table[MAX_PKT_BURST];
>  };
>
>  struct lcore_rx_queue {
>         uint16_t port_id;
>         uint8_t queue_id;
> -} __rte_cache_aligned;
> +};
>
>  struct lcore_conf {
> -       uint16_t n_rx_queue;
>         struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
> -       uint16_t n_tx_port;
>         uint16_t tx_port_id[RTE_MAX_ETHPORTS];
>         uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
> +       uint16_t tx_mbuf_len[RTE_MAX_ETHPORTS];
>         struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
> +       uint16_t n_rx_queue;
> +       uint16_t n_tx_port;
>         void *ipv4_lookup_struct;
>         void *ipv6_lookup_struct;
>  } __rte_cache_aligned;
> @@ -122,7 +122,7 @@ send_single_packet(struct lcore_conf *qconf,
>  {
>         uint16_t len;
>
> -       len = qconf->tx_mbufs[port].len;
> +       len = qconf->tx_mbuf_len[port];
>         qconf->tx_mbufs[port].m_table[len] = m;
>         len++;
>
> @@ -132,7 +132,7 @@ send_single_packet(struct lcore_conf *qconf,
>                 len = 0;
>         }
>
> -       qconf->tx_mbufs[port].len = len;
> +       qconf->tx_mbuf_len[port] = len;
>         return 0;
>  }
>
> diff --git a/examples/l3fwd/l3fwd_common.h b/examples/l3fwd/l3fwd_common.h
> index 7d83ff641..05e03dbfc 100644
> --- a/examples/l3fwd/l3fwd_common.h
> +++ b/examples/l3fwd/l3fwd_common.h
> @@ -183,7 +183,7 @@ send_packetsx4(struct lcore_conf *qconf, uint16_t port, struct rte_mbuf *m[],
>  {
>         uint32_t len, j, n;
>
> -       len = qconf->tx_mbufs[port].len;
> +       len = qconf->tx_mbuf_len[port];
>
>         /*
>          * If TX buffer for that queue is empty, and we have enough packets,
> @@ -258,7 +258,7 @@ send_packetsx4(struct lcore_conf *qconf, uint16_t port, struct rte_mbuf *m[],
>                 }
>         }
>
> -       qconf->tx_mbufs[port].len = len;
> +       qconf->tx_mbuf_len[port] = len;
>  }
>
>  #endif /* _L3FWD_COMMON_H_ */
> diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
> index 9996bfba3..1970e0376 100644
> --- a/examples/l3fwd/l3fwd_em.c
> +++ b/examples/l3fwd/l3fwd_em.c
> @@ -662,12 +662,12 @@ em_main_loop(__rte_unused void *dummy)
>
>                         for (i = 0; i < qconf->n_tx_port; ++i) {
>                                 portid = qconf->tx_port_id[i];
> -                               if (qconf->tx_mbufs[portid].len == 0)
> +                               if (qconf->tx_mbuf_len[portid] == 0)
>                                         continue;
>                                 send_burst(qconf,
> -                                       qconf->tx_mbufs[portid].len,
> +                                       qconf->tx_mbuf_len[portid],
>                                         portid);
> -                               qconf->tx_mbufs[portid].len = 0;
> +                               qconf->tx_mbuf_len[portid] = 0;
>                         }
>
>                         prev_tsc = cur_tsc;
> diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
> index d338590b9..e62139a0e 100644
> --- a/examples/l3fwd/l3fwd_lpm.c
> +++ b/examples/l3fwd/l3fwd_lpm.c
> @@ -220,12 +220,12 @@ lpm_main_loop(__rte_unused void *dummy)
>
>                         for (i = 0; i < n_tx_p; ++i) {
>                                 portid = qconf->tx_port_id[i];
> -                               if (qconf->tx_mbufs[portid].len == 0)
> +                               if (qconf->tx_mbuf_len[portid] == 0)
>                                         continue;
>                                 send_burst(qconf,
> -                                       qconf->tx_mbufs[portid].len,
> +                                       qconf->tx_mbuf_len[portid],
>                                         portid);
> -                               qconf->tx_mbufs[portid].len = 0;
> +                               qconf->tx_mbuf_len[portid] = 0;
>                         }
>
>                         prev_tsc = cur_tsc;
> --
> 2.25.1
>
  
Hemant Agrawal April 21, 2021, 5:22 a.m. UTC | #2
On 4/14/2021 12:36 AM, Jerin Jacob wrote:
> On Thu, Mar 18, 2021 at 3:56 PM Ruifeng Wang <ruifeng.wang@arm.com> wrote:
>> There are some holes in data struct lcore_conf. The holes are
>> due to alignment requirement.
>>
>> For struct lcore_rx_queue, there is no need to make every element
>> of this type to be cache line aligned, because the data is not
>> shared between cores.
>>
>> Member len of struct mbuf_table can be moved out. So data can be
>> packed and there will be no need to load an extra cache line when
>> mbuf table is empty.
>>
>> The change showed slight performance improvement on N1SDP platform.
>>
>> Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
>>
>> Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> This change alone is OK in the octeontx2 platform.(No difference in performance)
> combining with 3/4 shows some regression. Probably is due to prefetch
> or 128B cache line tuning specifics.

We checked it on the Layerscape LS2088A platform. There is no difference for the
1-2 core case. However, we are observing a ~2% regression for 4-8 cores.

Regards,

Hemant
  
Conor Walsh April 26, 2021, 10:55 a.m. UTC | #3
<snip>
> >> There are some holes in data struct lcore_conf. The holes are
> >> due to alignment requirement.
> >>
> >> For struct lcore_rx_queue, there is no need to make every element
> >> of this type to be cache line aligned, because the data is not
> >> shared between cores.
> >>
> >> Member len of struct mbuf_table can be moved out. So data can be
> >> packed and there will be no need to load an extra cache line when
> >> mbuf table is empty.
> >>
> >> The change showed slight performance improvement on N1SDP platform.
> >>
> >> Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> >>
> >> Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > This change alone is OK in the octeontx2 platform.(No difference in
> performance)
> > combining with 3/4 shows some regression. Probably is due to prefetch
> > or 128B cache line tuning specifics.
> 
> We checked it on Layerscape LS2088A platform. No difference for 1-2 core
> case. However observing ~2% regression for 4-8 cores.
> 
> Regards,
> 
> Hemant
> 

Hi Ruifeng,

l3fwd will no longer build with this patch as you have changed a struct used by the FIB lookup method.
This patch will need to be updated to also update the FIB lookup method as you have done with EM and LPM.

Thanks,
Conor.
  
Ruifeng Wang April 27, 2021, 1:19 a.m. UTC | #4
> -----Original Message-----
> From: Walsh, Conor <conor.walsh@intel.com>
> Sent: Monday, April 26, 2021 6:55 PM
> To: hemant.agrawal@nxp.com; Jerin Jacob <jerinjacobk@gmail.com>;
> Ruifeng Wang <Ruifeng.Wang@arm.com>
> Cc: jerinj@marvell.com; Yigit, Ferruh <ferruh.yigit@intel.com>;
> thomas@monjalon.net; David Marchand <david.marchand@redhat.com>;
> dpdk-dev <dev@dpdk.org>; nd <nd@arm.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>
> Subject: RE: [dpdk-dev] [PATCH 4/4] examples/l3fwd: make data struct to be
> memory efficient
> 
> <snip>
> > >> There are some holes in data struct lcore_conf. The holes are due
> > >> to alignment requirement.
> > >>
> > >> For struct lcore_rx_queue, there is no need to make every element
> > >> of this type to be cache line aligned, because the data is not
> > >> shared between cores.
> > >>
> > >> Member len of struct mbuf_table can be moved out. So data can be
> > >> packed and there will be no need to load an extra cache line when
> > >> mbuf table is empty.
> > >>
> > >> The change showed slight performance improvement on N1SDP
> platform.
> > >>
> > >> Suggested-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> > >>
> > >> Signed-off-by: Ruifeng Wang <ruifeng.wang@arm.com>
> > > This change alone is OK in the octeontx2 platform.(No difference in
> > performance)
> > > combining with 3/4 shows some regression. Probably is due to
> > > prefetch or 128B cache line tuning specifics.
> >
> > We checked it on Layerscape LS2088A platform. No difference for 1-2
> > core case. However observing ~2% regression for 4-8 cores.
> >
> > Regards,
> >
> > Hemant
> >
> 
> Hi Ruifeng,

Hi Conor,
> 
> l3fwd will no longer build with this patch as you have changed a struct used
> by the FIB lookup method.
> This patch will need to be updated to also update the FIB lookup method as
> you have done with EM and LPM.

Thanks for the comments.
I will provide v2 with updates. I'm considering dropping this one from the v2 series.
> 
> Thanks,
> Conor.
  

Patch

diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
index 2cf06099e..f3a301e12 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -57,22 +57,22 @@ 
 #define HASH_ENTRY_NUMBER_DEFAULT	4
 
 struct mbuf_table {
-	uint16_t len;
 	struct rte_mbuf *m_table[MAX_PKT_BURST];
 };
 
 struct lcore_rx_queue {
 	uint16_t port_id;
 	uint8_t queue_id;
-} __rte_cache_aligned;
+};
 
 struct lcore_conf {
-	uint16_t n_rx_queue;
 	struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
-	uint16_t n_tx_port;
 	uint16_t tx_port_id[RTE_MAX_ETHPORTS];
 	uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
+	uint16_t tx_mbuf_len[RTE_MAX_ETHPORTS];
 	struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
+	uint16_t n_rx_queue;
+	uint16_t n_tx_port;
 	void *ipv4_lookup_struct;
 	void *ipv6_lookup_struct;
 } __rte_cache_aligned;
@@ -122,7 +122,7 @@  send_single_packet(struct lcore_conf *qconf,
 {
 	uint16_t len;
 
-	len = qconf->tx_mbufs[port].len;
+	len = qconf->tx_mbuf_len[port];
 	qconf->tx_mbufs[port].m_table[len] = m;
 	len++;
 
@@ -132,7 +132,7 @@  send_single_packet(struct lcore_conf *qconf,
 		len = 0;
 	}
 
-	qconf->tx_mbufs[port].len = len;
+	qconf->tx_mbuf_len[port] = len;
 	return 0;
 }
 
diff --git a/examples/l3fwd/l3fwd_common.h b/examples/l3fwd/l3fwd_common.h
index 7d83ff641..05e03dbfc 100644
--- a/examples/l3fwd/l3fwd_common.h
+++ b/examples/l3fwd/l3fwd_common.h
@@ -183,7 +183,7 @@  send_packetsx4(struct lcore_conf *qconf, uint16_t port, struct rte_mbuf *m[],
 {
 	uint32_t len, j, n;
 
-	len = qconf->tx_mbufs[port].len;
+	len = qconf->tx_mbuf_len[port];
 
 	/*
 	 * If TX buffer for that queue is empty, and we have enough packets,
@@ -258,7 +258,7 @@  send_packetsx4(struct lcore_conf *qconf, uint16_t port, struct rte_mbuf *m[],
 		}
 	}
 
-	qconf->tx_mbufs[port].len = len;
+	qconf->tx_mbuf_len[port] = len;
 }
 
 #endif /* _L3FWD_COMMON_H_ */
diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 9996bfba3..1970e0376 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -662,12 +662,12 @@  em_main_loop(__rte_unused void *dummy)
 
 			for (i = 0; i < qconf->n_tx_port; ++i) {
 				portid = qconf->tx_port_id[i];
-				if (qconf->tx_mbufs[portid].len == 0)
+				if (qconf->tx_mbuf_len[portid] == 0)
 					continue;
 				send_burst(qconf,
-					qconf->tx_mbufs[portid].len,
+					qconf->tx_mbuf_len[portid],
 					portid);
-				qconf->tx_mbufs[portid].len = 0;
+				qconf->tx_mbuf_len[portid] = 0;
 			}
 
 			prev_tsc = cur_tsc;
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index d338590b9..e62139a0e 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -220,12 +220,12 @@  lpm_main_loop(__rte_unused void *dummy)
 
 			for (i = 0; i < n_tx_p; ++i) {
 				portid = qconf->tx_port_id[i];
-				if (qconf->tx_mbufs[portid].len == 0)
+				if (qconf->tx_mbuf_len[portid] == 0)
 					continue;
 				send_burst(qconf,
-					qconf->tx_mbufs[portid].len,
+					qconf->tx_mbuf_len[portid],
 					portid);
-				qconf->tx_mbufs[portid].len = 0;
+				qconf->tx_mbuf_len[portid] = 0;
 			}
 
 			prev_tsc = cur_tsc;