[dpdk-dev,v4] examples/l3fwd: em path performance fix

Message ID 1458293807-2604-1-git-send-email-tomaszx.kulasek@intel.com (mailing list archive)
State Superseded, archived
Headers

Commit Message

Tomasz Kulasek March 18, 2016, 9:36 a.m. UTC
  It seems that for the most use cases, previous hash_multi_lookup provides
better performance, and more, sequential lookup can cause significant
performance drop.

This patch sets previously optional hash_multi_lookup method as default.
It also provides some minor optimizations such as queue drain only on used
tx ports.


This patch should be applied after Maciej Czekaj's patch "l3fwd: Fix
compilation with HASH_MULTI_LOOKUP"


v4 changes:
 - rebased to be applicable after patch "l3fwd: Fix compilation with
   HASH_MULTI_LOOKUP" of Maciej Czekaj

v3 changes:
 - "lpm: extend IPv4 next hop field" patch extends dst_port table from
   uint16_t to uint32_t omiting previously disabled l3fwd_em_hlm_sse.h,
   what causes incompatible pointer type error after turning on this header

v2 changes:
 - fixed copy-paste error causing that not all packets are classified right
   in hash_multi_lookup implementation when burst size is not divisible
   by 8

Fixes: 94c54b4158d5 ("examples/l3fwd: rework exact-match")

Reported-by: Qian Xu <qian.q.xu@intel.com>
Signed-off-by: Tomasz Kulasek <tomaszx.kulasek@intel.com>
---
 examples/l3fwd/l3fwd.h            |    8 ++++++++
 examples/l3fwd/l3fwd_em.c         |    6 +++---
 examples/l3fwd/l3fwd_em_hlm_sse.h |   28 ++++++++++------------------
 examples/l3fwd/l3fwd_em_sse.h     |    9 +++++++++
 examples/l3fwd/l3fwd_lpm.c        |    4 ++--
 examples/l3fwd/main.c             |    7 +++++++
 6 files changed, 39 insertions(+), 23 deletions(-)
  

Comments

Tomasz Kulasek March 18, 2016, 9:43 a.m. UTC | #1
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Tomasz Kulasek
> Sent: Friday, March 18, 2016 10:37
> To: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH v4] examples/l3fwd: em path performance fix
> 
> It seems that for the most use cases, previous hash_multi_lookup provides
> better performance, and more, sequential lookup can cause significant
> performance drop.
> 
> This patch sets previously optional hash_multi_lookup method as default.
> It also provides some minor optimizations such as queue drain only on used
> tx ports.
> 
> 
> This patch should be applied after Maciej Czekaj's patch "l3fwd: Fix
> compilation with HASH_MULTI_LOOKUP"
> 
> 
> v4 changes:
>  - rebased to be applicable after patch "l3fwd: Fix compilation with
>    HASH_MULTI_LOOKUP" of Maciej Czekaj
> 
> v3 changes:
>  - "lpm: extend IPv4 next hop field" patch extends dst_port table from
>    uint16_t to uint32_t omiting previously disabled l3fwd_em_hlm_sse.h,
>    what causes incompatible pointer type error after turning on this
> header
> 
> v2 changes:
>  - fixed copy-paste error causing that not all packets are classified
> right
>    in hash_multi_lookup implementation when burst size is not divisible
>    by 8
> 
> Fixes: 94c54b4158d5 ("examples/l3fwd: rework exact-match")
> 
> Reported-by: Qian Xu <qian.q.xu@intel.com>
> Signed-off-by: Tomasz Kulasek <tomaszx.kulasek@intel.com>
> ---
>  examples/l3fwd/l3fwd.h            |    8 ++++++++
>  examples/l3fwd/l3fwd_em.c         |    6 +++---
>  examples/l3fwd/l3fwd_em_hlm_sse.h |   28 ++++++++++------------------
>  examples/l3fwd/l3fwd_em_sse.h     |    9 +++++++++
>  examples/l3fwd/l3fwd_lpm.c        |    4 ++--
>  examples/l3fwd/main.c             |    7 +++++++
>  6 files changed, 39 insertions(+), 23 deletions(-)
> 

Self NACK - I forgot to remove debug options
  

Patch

diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h
index 7dcc7e5..69dcc17 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -40,6 +40,12 @@ 
 
 #define RTE_LOGTYPE_L3FWD RTE_LOGTYPE_USER1
 
+#define __ARM_NEON 1
+
+#if !defined(NO_HASH_MULTI_LOOKUP) && defined(__ARM_NEON)
+#define NO_HASH_MULTI_LOOKUP 1
+#endif
+
 #define MAX_PKT_BURST     32
 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
 
@@ -86,6 +92,8 @@  struct lcore_rx_queue {
 struct lcore_conf {
 	uint16_t n_rx_queue;
 	struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
+	uint16_t n_tx_port;
+	uint16_t tx_port_id[RTE_MAX_ETHPORTS];
 	uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
 	struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
 	void *ipv4_lookup_struct;
diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 0adf8f4..5a2e7ff 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -320,7 +320,7 @@  em_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
  * buffer optimization i.e. ENABLE_MULTI_BUFFER_OPTIMIZE=1.
  */
 #if defined(__SSE4_1__)
-#ifndef HASH_MULTI_LOOKUP
+#if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sse.h"
 #else
 #include "l3fwd_em_hlm_sse.h"
@@ -568,8 +568,8 @@  em_main_loop(__attribute__((unused)) void *dummy)
 		diff_tsc = cur_tsc - prev_tsc;
 		if (unlikely(diff_tsc > drain_tsc)) {
 
-			for (i = 0; i < qconf->n_rx_queue; i++) {
-				portid = qconf->rx_queue_list[i].port_id;
+			for (i = 0; i < qconf->n_tx_port; ++i) {
+				portid = qconf->tx_port_id[i];
 				if (qconf->tx_mbufs[portid].len == 0)
 					continue;
 				send_burst(qconf,
diff --git a/examples/l3fwd/l3fwd_em_hlm_sse.h b/examples/l3fwd/l3fwd_em_hlm_sse.h
index 891ae2e..7faf04a 100644
--- a/examples/l3fwd/l3fwd_em_hlm_sse.h
+++ b/examples/l3fwd/l3fwd_em_hlm_sse.h
@@ -34,17 +34,9 @@ 
 #ifndef __L3FWD_EM_HLM_SSE_H__
 #define __L3FWD_EM_HLM_SSE_H__
 
-/**
- * @file
- * This is an optional implementation of packet classification in Exact-Match
- * path using rte_hash_lookup_multi method from previous implementation.
- * While sequential classification seems to be faster, it's disabled by default
- * and can be enabled with HASH_LOOKUP_MULTI global define in compilation time.
- */
-
 #include "l3fwd_sse.h"
 
-static inline void
+static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
 		uint8_t portid, uint32_t dst_port[8])
 {
@@ -168,7 +160,7 @@  get_ipv6_5tuple(struct rte_mbuf *m0, __m128i mask0,
 	key->xmm[2] = _mm_and_si128(tmpdata2, mask1);
 }
 
-static inline void
+static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
 		uint8_t portid, uint32_t dst_port[8])
 {
@@ -322,17 +314,17 @@  l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 
 		} else {
 			dst_port[j]   = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j], portid);
-			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j], portid);
+			dst_port[j+1] = em_get_dst_port(qconf, pkts_burst[j+1], portid);
+			dst_port[j+2] = em_get_dst_port(qconf, pkts_burst[j+2], portid);
+			dst_port[j+3] = em_get_dst_port(qconf, pkts_burst[j+3], portid);
+			dst_port[j+4] = em_get_dst_port(qconf, pkts_burst[j+4], portid);
+			dst_port[j+5] = em_get_dst_port(qconf, pkts_burst[j+5], portid);
+			dst_port[j+6] = em_get_dst_port(qconf, pkts_burst[j+6], portid);
+			dst_port[j+7] = em_get_dst_port(qconf, pkts_burst[j+7], portid);
 		}
 	}
 
-	for (; j < n; j++)
+	for (; j < nb_rx; j++)
 		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
 
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
diff --git a/examples/l3fwd/l3fwd_em_sse.h b/examples/l3fwd/l3fwd_em_sse.h
index d4a2a2d..8bd150a 100644
--- a/examples/l3fwd/l3fwd_em_sse.h
+++ b/examples/l3fwd/l3fwd_em_sse.h
@@ -34,6 +34,15 @@ 
 #ifndef __L3FWD_EM_SSE_H__
 #define __L3FWD_EM_SSE_H__
 
+/**
+ * @file
+ * This is an optional implementation of packet classification in Exact-Match
+ * path using sequential packet classification method.
+ * While hash lookup multi seems to provide better performance, it's disabled
+ * by default and can be enabled with NO_HASH_LOOKUP_MULTI global define in
+ * compilation time.
+ */
+
 #include "l3fwd_sse.h"
 
 static inline __attribute__((always_inline)) uint16_t
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index a354797..990a7f1 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -159,8 +159,8 @@  lpm_main_loop(__attribute__((unused)) void *dummy)
 		diff_tsc = cur_tsc - prev_tsc;
 		if (unlikely(diff_tsc > drain_tsc)) {
 
-			for (i = 0; i < qconf->n_rx_queue; i++) {
-				portid = qconf->rx_queue_list[i].port_id;
+			for (i = 0; i < qconf->n_tx_port; ++i) {
+				portid = qconf->tx_port_id[i];
 				if (qconf->tx_mbufs[portid].len == 0)
 					continue;
 				send_burst(qconf,
diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c
index 8520f71..792894f 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -791,6 +791,7 @@  main(int argc, char **argv)
 	unsigned lcore_id;
 	uint32_t n_tx_queue, nb_lcores;
 	uint8_t portid, nb_rx_queue, queue, socketid;
+	uint8_t nb_tx_port;
 
 	/* init EAL */
 	ret = rte_eal_init(argc, argv);
@@ -830,6 +831,7 @@  main(int argc, char **argv)
 		rte_exit(EXIT_FAILURE, "check_port_config failed\n");
 
 	nb_lcores = rte_lcore_count();
+	nb_tx_port = 0;
 
 	/* Setup function pointers for lookup method. */
 	setup_l3fwd_lookup_tables();
@@ -906,8 +908,13 @@  main(int argc, char **argv)
 			qconf = &lcore_conf[lcore_id];
 			qconf->tx_queue_id[portid] = queueid;
 			queueid++;
+
+			qconf->n_tx_port = nb_tx_port;
+			qconf->tx_port_id[qconf->n_tx_port] = portid;
 		}
 		printf("\n");
+
+		nb_tx_port++;
 	}
 
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {