[RFC,1/1] app/testpmd: add l3fwd mode to testpmd

Message ID 20210430213747.41530-2-kathleen.capella@arm.com (mailing list archive)
State Rejected, archived
Delegated to: Ferruh Yigit
Headers
Series app/testpmd: add l3fwd mode to testpmd |

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation fail Compilation issues

Commit Message

Kathleen Capella April 30, 2021, 9:37 p.m. UTC
  Add l3fwd mode to testpmd to speed up debugging and performance analysis.
Implement LPM lookup for single NUMA socket only.

Signed-off-by: Kathleen Capella <kathleen.capella@arm.com>
---
 app/test-pmd/config.c         |  66 +++++++
 app/test-pmd/l3fwd.c          | 356 ++++++++++++++++++++++++++++++++++
 app/test-pmd/l3fwd.h          | 143 ++++++++++++++
 app/test-pmd/l3fwd_common.h   | 268 +++++++++++++++++++++++++
 app/test-pmd/l3fwd_lpm.h      | 107 ++++++++++
 app/test-pmd/l3fwd_lpm_neon.h | 169 ++++++++++++++++
 app/test-pmd/l3fwd_neon.h     | 234 ++++++++++++++++++++++
 app/test-pmd/meson.build      |   3 +-
 app/test-pmd/testpmd.c        |   4 +-
 app/test-pmd/testpmd.h        |  20 ++
 10 files changed, 1368 insertions(+), 2 deletions(-)
 create mode 100644 app/test-pmd/l3fwd.c
 create mode 100644 app/test-pmd/l3fwd.h
 create mode 100644 app/test-pmd/l3fwd_common.h
 create mode 100644 app/test-pmd/l3fwd_lpm.h
 create mode 100644 app/test-pmd/l3fwd_lpm_neon.h
 create mode 100644 app/test-pmd/l3fwd_neon.h
  

Patch

diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index e189062ef..6ea742fda 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -65,6 +65,9 @@ 
 
 #define NS_PER_SEC 1E9
 
+/* Used in l3 fwd mode to ensure only one lookup table is created per socket */
+static uint8_t lkp_per_socket[NB_SOCKETS];
+
 static char *flowtype_to_str(uint16_t flow_type);
 
 static const struct {
@@ -2947,6 +2950,9 @@  simple_fwd_config_setup(void)
 			(lcoreid_t) cur_fwd_config.nb_fwd_ports;
 	setup_fwd_config_of_each_lcore(&cur_fwd_config);
 
+	printf("nb_fwd_ports: %d\n", cur_fwd_config.nb_fwd_ports);
+	printf("nb_fwd_streams: %d\n", cur_fwd_config.nb_fwd_streams);
+
 	for (i = 0; i < cur_fwd_config.nb_fwd_ports; i++) {
 		fwd_streams[i]->rx_port   = fwd_ports_ids[i];
 		fwd_streams[i]->rx_queue  = 0;
@@ -3151,6 +3157,61 @@  icmp_echo_config_setup(void)
 	}
 }
 
+static void
+l3_fwd_config_setup(void)
+{
+	portid_t rxp;
+	queueid_t rxq;
+	queueid_t nb_q;
+	streamid_t sm_id;
+	int socketid = 0;
+
+	nb_q = nb_rxq;
+	if (nb_q > nb_txq)
+		nb_q = nb_txq;
+
+	cur_fwd_config.nb_fwd_lcores = (lcoreid_t) nb_fwd_lcores;
+	cur_fwd_config.nb_fwd_ports = (portid_t) nb_fwd_ports;
+	cur_fwd_config.nb_fwd_streams =
+		(streamid_t) (nb_q * cur_fwd_config.nb_fwd_ports);
+
+	/*TODO check if want this behavior */
+	if (cur_fwd_config.nb_fwd_streams < cur_fwd_config.nb_fwd_lcores)
+		cur_fwd_config.nb_fwd_lcores =
+			(lcoreid_t)cur_fwd_config.nb_fwd_streams;
+
+	/* reinitialize forwarding streams */
+	init_fwd_streams();
+
+	setup_fwd_config_of_each_lcore(&cur_fwd_config);
+	rxp = 0; rxq = 0;
+	for (sm_id = 0; sm_id < cur_fwd_config.nb_fwd_streams; sm_id++) {
+		struct fwd_stream *fs;
+
+		fs = fwd_streams[sm_id];
+		fs->rx_port   = fwd_ports_ids[rxp];
+		fs->rx_queue  = rxq;
+		fs->tx_queue  = rxq;
+		fs->retry_enabled = retry_enabled;
+		fs->drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) /
+			US_PER_S * BURST_TX_DRAIN_US;
+		fs->cur_tsc = rte_rdtsc();
+		fs->prev_tsc = fs->cur_tsc;
+		rxp++;
+		if (rxp < nb_fwd_ports)
+			continue;
+		rxp = 0;
+		rxq++;
+	}
+
+	/*Set up lpm table */
+	/*TODO  Making the assumption that there is one socket*/
+	if (!lkp_per_socket[socketid]) {
+		setup_lpm(socketid);
+		lkp_per_socket[socketid] = 1;
+	}
+}
+
 void
 fwd_config_setup(void)
 {
@@ -3160,6 +3221,11 @@  fwd_config_setup(void)
 		return;
 	}
 
+	if (strcmp(cur_fwd_eng->fwd_mode_name, "l3") == 0) {
+		l3_fwd_config_setup();
+		return;
+	}
+
 	if ((nb_rxq > 1) && (nb_txq > 1)){
 		if (dcb_config)
 			dcb_fwd_config_setup();
diff --git a/app/test-pmd/l3fwd.c b/app/test-pmd/l3fwd.c
new file mode 100644
index 000000000..d0822ed13
--- /dev/null
+++ b/app/test-pmd/l3fwd.c
@@ -0,0 +1,356 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <arpa/inet.h>
+
+#include <sys/queue.h>
+#include <sys/stat.h>
+
+#include <rte_common.h>
+#include <rte_byteorder.h>
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_cycles.h>
+#include <rte_memory.h>
+#include <rte_memcpy.h>
+#include <rte_launch.h>
+#include <rte_eal.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_atomic.h>
+#include <rte_branch_prediction.h>
+#include <rte_mempool.h>
+#include <rte_mbuf.h>
+#include <rte_interrupts.h>
+#include <rte_pci.h>
+#include <rte_ether.h>
+#include <rte_ethdev.h>
+#include <rte_ip.h>
+#include <rte_string_fns.h>
+#include <rte_flow.h>
+#include <rte_lpm.h>
+#include <rte_lpm6.h>
+
+#include "testpmd.h"
+#include "l3fwd.h"
+
+extern portid_t nb_fwd_ports;
+
+/* TODO determine usefulnesss/redundancy with other testpmd vars */
+uint64_t dest_eth_addr[RTE_MAX_ETHPORTS];
+struct rte_ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
+
+xmm_t val_eth[RTE_MAX_ETHPORTS];
+
+struct ipv4_l3fwd_lpm_route {
+	uint32_t ip;
+	uint8_t  depth;
+	uint8_t  if_out;
+};
+
+struct ipv6_l3fwd_lpm_route {
+	uint8_t ip[16];
+	uint8_t  depth;
+	uint8_t  if_out;
+};
+
+/* 198.18.0.0/16 are set aside for RFC2544 benchmarking (RFC5735). */
+static const struct ipv4_l3fwd_lpm_route ipv4_l3fwd_lpm_route_array[] = {
+	{RTE_IPV4(198, 18, 0, 0), 24, 0},
+	{RTE_IPV4(198, 18, 1, 0), 24, 1},
+	{RTE_IPV4(198, 18, 2, 0), 24, 2},
+	{RTE_IPV4(198, 18, 3, 0), 24, 3},
+	{RTE_IPV4(198, 18, 4, 0), 24, 4},
+	{RTE_IPV4(198, 18, 5, 0), 24, 5},
+	{RTE_IPV4(198, 18, 6, 0), 24, 6},
+	{RTE_IPV4(198, 18, 7, 0), 24, 7},
+};
+
+/* 2001:0200::/48 is IANA reserved range for IPv6 benchmarking (RFC5180) */
+static const struct ipv6_l3fwd_lpm_route ipv6_l3fwd_lpm_route_array[] = {
+	{{32, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, 48, 0},
+	{{32, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}, 48, 1},
+	{{32, 1, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0}, 48, 2},
+	{{32, 1, 2, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0}, 48, 3},
+	{{32, 1, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0}, 48, 4},
+	{{32, 1, 2, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0}, 48, 5},
+	{{32, 1, 2, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0}, 48, 6},
+	{{32, 1, 2, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0}, 48, 7},
+};
+
+#define IPV4_L3FWD_LPM_MAX_RULES	 1024
+#define IPV4_L3FWD_LPM_NUMBER_TBL8S (1 << 8)
+#define IPV6_L3FWD_LPM_MAX_RULES	 1024
+#define IPV6_L3FWD_LPM_NUMBER_TBL8S (1 << 16)
+
+struct rte_lpm *ipv4_l3fwd_lpm_lookup_struct[NB_SOCKETS];
+struct rte_lpm6 *ipv6_l3fwd_lpm_lookup_struct[NB_SOCKETS];
+
+static inline uint16_t
+lpm_get_ipv4_dst_port(const struct rte_ipv4_hdr *ipv4_hdr,
+		      uint16_t portid,
+		      struct rte_lpm *ipv4_l3fwd_lookup_struct)
+{
+	uint32_t dst_ip = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
+	uint32_t next_hop;
+
+	if (rte_lpm_lookup(ipv4_l3fwd_lookup_struct, dst_ip, &next_hop) == 0)
+		return next_hop;
+	else
+		return portid;
+}
+
+static inline uint16_t
+lpm_get_ipv6_dst_port(const struct rte_ipv6_hdr *ipv6_hdr,
+		      uint16_t portid,
+		      struct rte_lpm6 *ipv6_l3fwd_lookup_struct)
+{
+	const uint8_t *dst_ip = ipv6_hdr->dst_addr;
+	uint32_t next_hop;
+
+	if (rte_lpm6_lookup(ipv6_l3fwd_lookup_struct, dst_ip, &next_hop) == 0)
+		return next_hop;
+	else
+		return portid;
+}
+
+static __rte_always_inline uint16_t
+lpm_get_dst_port(const int socketid, struct rte_mbuf *pkt,
+		uint16_t portid)
+{
+	struct rte_ipv6_hdr *ipv6_hdr;
+	struct rte_ipv4_hdr *ipv4_hdr;
+	struct rte_ether_hdr *eth_hdr;
+
+	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+
+		eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+
+		return lpm_get_ipv4_dst_port(ipv4_hdr, portid,
+					     ipv4_l3fwd_lpm_lookup_struct[socketid]);
+	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+		eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+		ipv6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1);
+
+		return lpm_get_ipv6_dst_port(ipv6_hdr, portid,
+					     ipv6_l3fwd_lpm_lookup_struct[socketid]);
+	}
+
+	return portid;
+}
+
+/*
+ * lpm_get_dst_port optimized routine for packets where dst_ipv4 is already
+ * precalculated. If packet is ipv6 dst_addr is taken directly from packet
+ * header and dst_ipv4 value is not used.
+ */
+static __rte_always_inline uint16_t
+lpm_get_dst_port_with_ipv4(const int socketid, struct rte_mbuf *pkt,
+	uint32_t dst_ipv4, uint16_t portid)
+{
+	uint32_t next_hop;
+	struct rte_ipv6_hdr *ipv6_hdr;
+	struct rte_ether_hdr *eth_hdr;
+
+	if (RTE_ETH_IS_IPV4_HDR(pkt->packet_type)) {
+		return (uint16_t) ((rte_lpm_lookup(ipv4_l3fwd_lpm_lookup_struct[socketid],
+						   dst_ipv4, &next_hop) == 0)
+				   ? next_hop : portid);
+
+	} else if (RTE_ETH_IS_IPV6_HDR(pkt->packet_type)) {
+
+		eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+		ipv6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1);
+
+		return (uint16_t) ((rte_lpm6_lookup(ipv6_l3fwd_lpm_lookup_struct[socketid],
+				ipv6_hdr->dst_addr, &next_hop) == 0)
+				? next_hop : portid);
+
+	}
+
+	return portid;
+}
+
+void
+setup_lpm(const int socketid)
+{
+	struct rte_lpm6_config config;
+	struct rte_lpm_config config_ipv4;
+	unsigned i;
+	int ret;
+	char s[64];
+	char abuf[INET6_ADDRSTRLEN];
+
+	/* create the LPM table */
+	config_ipv4.max_rules = IPV4_L3FWD_LPM_MAX_RULES;
+	config_ipv4.number_tbl8s = IPV4_L3FWD_LPM_NUMBER_TBL8S;
+	config_ipv4.flags = 0;
+	snprintf(s, sizeof(s), "IPV4_L3FWD_LPM_%d", socketid);
+	ipv4_l3fwd_lpm_lookup_struct[socketid] =
+			rte_lpm_create(s, socketid, &config_ipv4);
+	if (ipv4_l3fwd_lpm_lookup_struct[socketid] == NULL)
+		rte_exit(EXIT_FAILURE,
+			"Unable to create the l3fwd LPM table on socket %d\n",
+			socketid);
+
+	/* populate the LPM table */
+	for (i = 0; i < RTE_DIM(ipv4_l3fwd_lpm_route_array); i++) {
+		struct in_addr in;
+
+		/*skip unused ports */
+		if (ipv4_l3fwd_lpm_route_array[i].if_out >= nb_fwd_ports)
+			continue;
+
+		ret = rte_lpm_add(ipv4_l3fwd_lpm_lookup_struct[socketid],
+			ipv4_l3fwd_lpm_route_array[i].ip,
+			ipv4_l3fwd_lpm_route_array[i].depth,
+			ipv4_l3fwd_lpm_route_array[i].if_out);
+
+		if (ret < 0) {
+			rte_exit(EXIT_FAILURE,
+				"Unable to add entry %u to the l3fwd LPM table on socket %d\n",
+				i, socketid);
+		}
+
+		in.s_addr = htonl(ipv4_l3fwd_lpm_route_array[i].ip);
+		printf("LPM: Adding route %s / %d (%d)\n",
+		       inet_ntop(AF_INET, &in, abuf, sizeof(abuf)),
+			ipv4_l3fwd_lpm_route_array[i].depth,
+			ipv4_l3fwd_lpm_route_array[i].if_out);
+	}
+
+	/* create the LPM6 table */
+	snprintf(s, sizeof(s), "IPV6_L3FWD_LPM_%d", socketid);
+
+	config.max_rules = IPV6_L3FWD_LPM_MAX_RULES;
+	config.number_tbl8s = IPV6_L3FWD_LPM_NUMBER_TBL8S;
+	config.flags = 0;
+	ipv6_l3fwd_lpm_lookup_struct[socketid] = rte_lpm6_create(s, socketid,
+				&config);
+	if (ipv6_l3fwd_lpm_lookup_struct[socketid] == NULL)
+		rte_exit(EXIT_FAILURE,
+			"Unable to create the l3fwd LPM table on socket %d\n",
+			socketid);
+
+	/* populate the LPM table */
+	for (i = 0; i < RTE_DIM(ipv6_l3fwd_lpm_route_array); i++) {
+
+
+		/*skip unused ports */
+		if (ipv6_l3fwd_lpm_route_array[i].if_out >= nb_fwd_ports)
+			continue;
+
+		ret = rte_lpm6_add(ipv6_l3fwd_lpm_lookup_struct[socketid],
+			ipv6_l3fwd_lpm_route_array[i].ip,
+			ipv6_l3fwd_lpm_route_array[i].depth,
+			ipv6_l3fwd_lpm_route_array[i].if_out);
+
+		if (ret < 0) {
+			rte_exit(EXIT_FAILURE,
+				"Unable to add entry %u to the l3fwd LPM table on socket %d\n",
+				i, socketid);
+		}
+
+		printf("LPM: Adding route %s / %d (%d)\n",
+		       inet_ntop(AF_INET6, ipv6_l3fwd_lpm_route_array[i].ip,
+				 abuf, sizeof(abuf)),
+		       ipv6_l3fwd_lpm_route_array[i].depth,
+		       ipv6_l3fwd_lpm_route_array[i].if_out);
+	}
+}
+
+/*TODO implement for SSE, AltiVec */
+#if defined __ARM_NEON
+#include "l3fwd_lpm_neon.h"
+#else
+#include "l3fwd_lpm.h"
+#endif
+
+/*
+ * Forwarding of packets in l3 mode.
+ */
+static void
+pkt_burst_l3_forward(struct fwd_stream *fs)
+{
+	struct rte_mbuf  *pkts_burst[MAX_PKT_BURST];
+	uint16_t nb_rx;
+	uint64_t start_tsc = 0;
+	uint64_t diff_tsc;
+	int i;
+
+
+	/*TODO change hardcoded value (assuming single socket) */
+	uint16_t portid = fs->rx_port;
+	uint16_t socketid = 0;
+
+	/* TODO insert code to skip current core if no rx_queue defined */
+
+	get_start_cycles(&start_tsc);
+
+	/*
+	 * TX burst queue drain
+	*/
+	diff_tsc = fs->cur_tsc - fs->prev_tsc;
+	if (unlikely(diff_tsc > fs->drain_tsc)) {
+
+		for (i = 0; i < nb_fwd_ports; i++) {
+			if (fs->tx_mbufs[i].len == 0)
+				continue;
+			send_burst(fs, fs->tx_mbufs[i].len, i);
+			fs->tx_mbufs[i].len = 0;
+		}
+
+		fs->prev_tsc = fs->cur_tsc;
+	}
+
+	fs->cur_tsc = rte_rdtsc();
+       /*
+	 * Receive a burst of packets and forward them.
+	 */
+	nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
+				 nb_pkt_per_burst);
+	inc_rx_burst_stats(fs, nb_rx);
+	if (unlikely(nb_rx == 0))
+		return;
+
+	fs->rx_packets += nb_rx;
+
+#if defined __ARM_NEON
+		l3fwd_lpm_send_packets(nb_rx, pkts_burst, portid, fs, socketid);
+#else
+		l3fwd_lpm_no_opt_send_packets(nb_rx, pkts_burst, portid, fs);
+#endif /* X86 */
+
+	get_end_cycles(fs, start_tsc);
+
+}
+
+static void
+l3fwd_begin(portid_t pi)
+{
+	/* Set dest MAC for port to 02:00:00:00:00:xx */
+	dest_eth_addr[pi] = RTE_ETHER_LOCAL_ADMIN_ADDR + ((uint64_t)pi << 40);
+
+	*(uint64_t *)(val_eth + pi) = dest_eth_addr[pi];
+	/* TODO check l3fwd_poll_resource_setup to see what must be done on
+		a port level, esp init ports_eth_addr[] */
+
+	/*TODO maybe here, enable promiscuous mode */
+}
+
+struct fwd_engine l3_fwd_engine = {
+	.fwd_mode_name	= "l3",
+	.port_fwd_begin = l3fwd_begin,
+	.port_fwd_end	= NULL,
+	.packet_fwd	= pkt_burst_l3_forward,
+};
diff --git a/app/test-pmd/l3fwd.h b/app/test-pmd/l3fwd.h
new file mode 100644
index 000000000..643f30abf
--- /dev/null
+++ b/app/test-pmd/l3fwd.h
@@ -0,0 +1,143 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ */
+
+#ifndef __L3_FWD_H__
+#define __L3_FWD_H__
+
+#include <rte_ethdev.h>
+#include <rte_vect.h>
+
+#define DO_RFC_1812_CHECKS
+
+#define RTE_LOGTYPE_L3FWD RTE_LOGTYPE_USER1
+
+#define MAX_TX_BURST	(nb_pkt_per_burst / 2)
+
+/* Configure how many packets ahead to prefetch, when reading packets */
+#define PREFETCH_OFFSET	  3
+
+/* Used to mark destination port as 'invalid'. */
+#define BAD_PORT ((uint16_t)-1)
+
+#define FWDSTEP	4
+
+/* replace first 12B of the ethernet header. */
+#define	MASK_ETH 0x3f
+
+/* ethernet addresses of ports */
+extern uint64_t dest_eth_addr[RTE_MAX_ETHPORTS];
+extern struct rte_ether_addr ports_eth_addr[RTE_MAX_ETHPORTS];
+
+extern xmm_t val_eth[RTE_MAX_ETHPORTS];
+
+/* Send burst of packets on an output interface */
+static inline int
+send_burst(struct fwd_stream *fs, uint16_t n, uint16_t port)
+{
+	struct rte_mbuf **m_table;
+	int ret;
+	uint16_t queueid;
+
+	queueid = fs->tx_queue;
+	m_table = (struct rte_mbuf **)fs->tx_mbufs[port].m_table;
+
+	ret = rte_eth_tx_burst(port, queueid, m_table, n);
+	fs->tx_packets += ret;
+	inc_tx_burst_stats(fs, ret);
+	if (unlikely(ret < n)) {
+		fs->fwd_dropped += (n - ret);
+		do {
+			rte_pktmbuf_free(m_table[ret]);
+		} while (++ret < n);
+	}
+
+	return 0;
+}
+
+/* Enqueue a single packet, and send burst if queue is filled */
+static inline int
+send_single_packet(struct fwd_stream *fs,
+		   struct rte_mbuf *m, uint16_t port)
+{
+	uint16_t len;
+
+	len = fs->tx_mbufs[port].len;
+	fs->tx_mbufs[port].m_table[len] = m;
+	len++;
+
+	/* enough pkts to be sent */
+	if (unlikely(len == nb_pkt_per_burst)) {
+		send_burst(fs, nb_pkt_per_burst, port);
+		len = 0;
+	}
+
+	fs->tx_mbufs[port].len = len;
+	return 0;
+}
+
+
+#ifdef DO_RFC_1812_CHECKS
+static inline int
+is_valid_ipv4_pkt(struct rte_ipv4_hdr *pkt, uint32_t link_len)
+{
+	/* From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2 */
+	/*
+	 * 1. The packet length reported by the Link Layer must be large
+	 * enough to hold the minimum length legal IP datagram (20 bytes).
+	 */
+	if (link_len < sizeof(struct rte_ipv4_hdr))
+		return -1;
+
+	/* 2. The IP checksum must be correct. */
+	/* this is checked in H/W */
+
+	/*
+	 * 3. The IP version number must be 4. If the version number is not 4
+	 * then the packet may be another version of IP, such as IPng or
+	 * ST-II.
+	 */
+	if (((pkt->version_ihl) >> 4) != 4)
+		return -3;
+	/*
+	 * 4. The IP header length field must be large enough to hold the
+	 * minimum length legal IP datagram (20 bytes = 5 words).
+	 */
+	if ((pkt->version_ihl & 0xf) < 5)
+		return -4;
+
+	/*
+	 * 5. The IP total length field must be large enough to hold the IP
+	 * datagram header, whose length is specified in the IP header length
+	 * field.
+	 */
+	if (rte_cpu_to_be_16(pkt->total_length) < sizeof(struct rte_ipv4_hdr))
+		return -5;
+
+	return 0;
+}
+#endif /* DO_RFC_1812_CHECKS */
+
+int
+init_mem(uint16_t portid, unsigned int nb_mbuf);
+
+/* Function pointers for LPM or EM functionality. */
+void
+setup_lpm(const int socketid);
+
+int
+lpm_check_ptype(int portid);
+
+uint16_t
+lpm_cb_parse_ptype(uint16_t port, uint16_t queue, struct rte_mbuf *pkts[],
+		   uint16_t nb_pkts, uint16_t max_pkts, void *user_param);
+
+
+/* Return ipv4/ipv6 fwd lookup struct for LPM or EM. */
+void *
+lpm_get_ipv4_l3fwd_lookup_struct(const int socketid);
+
+void *
+lpm_get_ipv6_l3fwd_lookup_struct(const int socketid);
+
+#endif	/* __L3_FWD_H__ */
diff --git a/app/test-pmd/l3fwd_common.h b/app/test-pmd/l3fwd_common.h
new file mode 100644
index 000000000..f3970e0b4
--- /dev/null
+++ b/app/test-pmd/l3fwd_common.h
@@ -0,0 +1,268 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2018 Intel Corporation.
+ * Copyright(c) 2017-2018 Linaro Limited.
+ */
+
+
+#ifndef _L3FWD_COMMON_H_
+#define _L3FWD_COMMON_H_
+
+#ifdef DO_RFC_1812_CHECKS
+
+#define	IPV4_MIN_VER_IHL	0x45
+#define	IPV4_MAX_VER_IHL	0x4f
+#define	IPV4_MAX_VER_IHL_DIFF	(IPV4_MAX_VER_IHL - IPV4_MIN_VER_IHL)
+
+/* Minimum value of IPV4 total length (20B) in network byte order. */
+#define	IPV4_MIN_LEN_BE	(sizeof(struct rte_ipv4_hdr) << 8)
+
+/*
+ * From http://www.rfc-editor.org/rfc/rfc1812.txt section 5.2.2:
+ * - The IP version number must be 4.
+ * - The IP header length field must be large enough to hold the
+ *    minimum length legal IP datagram (20 bytes = 5 words).
+ * - The IP total length field must be large enough to hold the IP
+ *   datagram header, whose length is specified in the IP header length
+ *   field.
+ * If we encounter invalid IPV4 packet, then set destination port for it
+ * to BAD_PORT value.
+ */
+static __rte_always_inline void
+rfc1812_process(struct rte_ipv4_hdr *ipv4_hdr, uint16_t *dp, uint32_t ptype)
+{
+	uint8_t ihl;
+
+	if (RTE_ETH_IS_IPV4_HDR(ptype)) {
+		ihl = ipv4_hdr->version_ihl - IPV4_MIN_VER_IHL;
+
+		ipv4_hdr->time_to_live--;
+		ipv4_hdr->hdr_checksum++;
+
+		if (ihl > IPV4_MAX_VER_IHL_DIFF ||
+				((uint8_t)ipv4_hdr->total_length == 0 &&
+				ipv4_hdr->total_length < IPV4_MIN_LEN_BE))
+			dp[0] = BAD_PORT;
+
+	}
+}
+
+#else
+#define	rfc1812_process(mb, dp, ptype)	do { } while (0)
+#endif /* DO_RFC_1812_CHECKS */
+
+/*
+ * We group consecutive packets with the same destionation port into one burst.
+ * To avoid extra latency this is done together with some other packet
+ * processing, but after we made a final decision about packet's destination.
+ * To do this we maintain:
+ * pnum - array of number of consecutive packets with the same dest port for
+ * each packet in the input burst.
+ * lp - pointer to the last updated element in the pnum.
+ * dlp - dest port value lp corresponds to.
+ */
+
+#define	GRPSZ	(1 << FWDSTEP)
+#define	GRPMSK	(GRPSZ - 1)
+
+#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
+	if (likely((dlp) == (dcp)[(idx)])) {             \
+		(lp)[0]++;                                   \
+	} else {                                         \
+		(dlp) = (dcp)[idx];                          \
+		(lp) = (pn) + (idx);                         \
+		(lp)[0] = 1;                                 \
+	}                                                \
+} while (0)
+
+static const struct {
+	uint64_t pnum; /* prebuild 4 values for pnum[]. */
+	int32_t  idx;  /* index for new last updated elemnet. */
+	uint16_t lpv;  /* add value to the last updated element. */
+} gptbl[GRPSZ] = {
+	{
+		/* 0: a != b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 1: a == b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 2: a != b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 3: a == b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020003),
+		.idx = 4,
+		.lpv = 2,
+	},
+	{
+		/* 4: a != b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 5: a == b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 6: a != b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 7: a == b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030004),
+		.idx = 4,
+		.lpv = 3,
+	},
+	{
+		/* 8: a != b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 9: a == b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010002),
+		.idx = 3,
+		.lpv = 1,
+	},
+	{
+		/* 0xa: a != b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 0xb: a == b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020003),
+		.idx = 3,
+		.lpv = 2,
+	},
+	{
+		/* 0xc: a != b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010001),
+		.idx = 2,
+		.lpv = 0,
+	},
+	{
+		/* 0xd: a == b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010002),
+		.idx = 2,
+		.lpv = 1,
+	},
+	{
+		/* 0xe: a != b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040001),
+		.idx = 1,
+		.lpv = 0,
+	},
+	{
+		/* 0xf: a == b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040005),
+		.idx = 0,
+		.lpv = 4,
+	},
+};
+
+static __rte_always_inline void
+send_packetsx4(struct fwd_stream *fs, uint16_t port, struct rte_mbuf *m[],
+		uint32_t num)
+{
+	uint32_t len, j, n;
+
+	len = fs->tx_mbufs[port].len;
+
+	/*
+	 * If TX buffer for that queue is empty, and we have enough packets,
+	 * then send them straightway.
+	 */
+	if (num >= MAX_TX_BURST && len == 0) {
+		n = rte_eth_tx_burst(port, fs->tx_queue, m, num);
+		fs->tx_packets += n;
+		inc_tx_burst_stats(fs, n);
+		if (unlikely(n < num)) {
+			fs->fwd_dropped += (num - n);
+			do {
+				rte_pktmbuf_free(m[n]);
+			} while (++n < num);
+		}
+
+		return;
+	}
+
+	/*
+	 * Put packets into TX buffer for that queue.
+	 */
+
+	n = len + num;
+	n = (n > nb_pkt_per_burst) ? nb_pkt_per_burst - len : num;
+
+	j = 0;
+	switch (n % FWDSTEP) {
+	while (j < n) {
+	case 0:
+		fs->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+		/* fallthrough */
+	case 3:
+		fs->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+		/* fallthrough */
+	case 2:
+		fs->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+		/* fallthrough */
+	case 1:
+		fs->tx_mbufs[port].m_table[len + j] = m[j];
+		j++;
+	}
+	}
+
+	len += n;
+
+	/* enough pkts to be sent */
+	if (unlikely(len == nb_pkt_per_burst)) {
+
+		send_burst(fs, nb_pkt_per_burst, port);
+
+		/* copy rest of the packets into the TX buffer. */
+		len = num - n;
+		j = 0;
+		switch (len % FWDSTEP) {
+		while (j < len) {
+		case 0:
+			fs->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+			/* fallthrough */
+		case 3:
+			fs->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+			/* fallthrough */
+		case 2:
+			fs->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+			/* fallthrough */
+		case 1:
+			fs->tx_mbufs[port].m_table[j] = m[n + j];
+			j++;
+		}
+		}
+	}
+
+	fs->tx_mbufs[port].len = len;
+}
+
+#endif /* _L3FWD_COMMON_H_ */
diff --git a/app/test-pmd/l3fwd_lpm.h b/app/test-pmd/l3fwd_lpm.h
new file mode 100644
index 000000000..7c3959fbe
--- /dev/null
+++ b/app/test-pmd/l3fwd_lpm.h
@@ -0,0 +1,107 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2016 Intel Corporation
+ */
+
+#ifndef __L3FWD_LPM_H__
+#define __L3FWD_LPM_H__
+
+static __rte_always_inline void
+l3fwd_lpm_simple_forward(struct rte_mbuf *m, uint16_t portid,
+		struct fwd_stream *fs)
+{
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv4_hdr *ipv4_hdr;
+	uint16_t dst_port;
+
+
+	struct rte_lpm *ipv4_lookup_struct = ipv4_l3fwd_lpm_lookup_struct[0];
+	struct rte_lpm6 *ipv6_lookup_struct = ipv6_l3fwd_lpm_lookup_struct[0];
+
+	eth_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
+
+	if (RTE_ETH_IS_IPV4_HDR(m->packet_type)) {
+		/* Handle IPv4 headers.*/
+		ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct rte_ipv4_hdr *,
+						sizeof(struct rte_ether_hdr));
+
+#ifdef DO_RFC_1812_CHECKS
+		/* Check to make sure the packet is valid (RFC1812) */
+		if (is_valid_ipv4_pkt(ipv4_hdr, m->pkt_len) < 0) {
+			rte_pktmbuf_free(m);
+			return;
+		}
+#endif
+		 dst_port = lpm_get_ipv4_dst_port(ipv4_hdr, portid,
+						ipv4_lookup_struct);
+
+		/*TODO check if port is enabled */
+	       /* if (dst_port >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port) == 0)
+			dst_port = portid;
+		*/
+#ifdef DO_RFC_1812_CHECKS
+		/* Update time to live and header checksum */
+		--(ipv4_hdr->time_to_live);
+		++(ipv4_hdr->hdr_checksum);
+#endif
+		/* dst addr */
+		*(uint64_t *)&eth_hdr->d_addr = dest_eth_addr[dst_port];
+
+		/* src addr */
+		rte_ether_addr_copy(&ports_eth_addr[dst_port],
+				&eth_hdr->s_addr);
+
+		send_single_packet(fs, m, dst_port);
+	} else if (RTE_ETH_IS_IPV6_HDR(m->packet_type)) {
+		/* Handle IPv6 headers.*/
+		struct rte_ipv6_hdr *ipv6_hdr;
+
+		ipv6_hdr = rte_pktmbuf_mtod_offset(m, struct rte_ipv6_hdr *,
+						sizeof(struct rte_ether_hdr));
+
+		dst_port = lpm_get_ipv6_dst_port(ipv6_hdr, portid,
+					ipv6_lookup_struct);
+
+		/*TODO check if port is enabled
+		if (dst_port >= RTE_MAX_ETHPORTS ||
+			(enabled_port_mask & 1 << dst_port) == 0)
+			dst_port = portid;
+		*/
+
+		/* dst addr */
+		*(uint64_t *)&eth_hdr->d_addr = dest_eth_addr[dst_port];
+
+		/* src addr */
+		rte_ether_addr_copy(&ports_eth_addr[dst_port],
+				&eth_hdr->s_addr);
+
+		send_single_packet(fs, m, dst_port);
+	} else {
+		/* Free the mbuf that contains non-IPV4/IPV6 packet */
+		rte_pktmbuf_free(m);
+	}
+}
+
+static inline void
+l3fwd_lpm_no_opt_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+				uint16_t portid, struct fwd_stream *fs)
+{
+	int32_t j;
+
+	/* Prefetch first packets */
+	for (j = 0; j < PREFETCH_OFFSET && j < nb_rx; j++)
+		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j], void *));
+
+	/* Prefetch and forward already prefetched packets. */
+	for (j = 0; j < (nb_rx - PREFETCH_OFFSET); j++) {
+		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[
+				j + PREFETCH_OFFSET], void *));
+		l3fwd_lpm_simple_forward(pkts_burst[j], portid, fs);
+	}
+
+	/* Forward remaining prefetched packets */
+	for (; j < nb_rx; j++)
+		l3fwd_lpm_simple_forward(pkts_burst[j], portid, fs);
+}
+
+#endif /* __L3FWD_LPM_H__ */
diff --git a/app/test-pmd/l3fwd_lpm_neon.h b/app/test-pmd/l3fwd_lpm_neon.h
new file mode 100644
index 000000000..c3d36e014
--- /dev/null
+++ b/app/test-pmd/l3fwd_lpm_neon.h
@@ -0,0 +1,169 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2018 Intel Corporation.
+ * Copyright(c) 2017-2018 Linaro Limited.
+ */
+
+#ifndef __L3FWD_LPM_NEON_H__
+#define __L3FWD_LPM_NEON_H__
+
+#include <arm_neon.h>
+
+#include "l3fwd_neon.h"
+
+extern struct rte_lpm *ipv4_l3fwd_lpm_lookup_struct[NB_SOCKETS];
+extern struct rte_lpm6 *ipv6_l3fwd_lpm_lookup_struct[NB_SOCKETS];
+
+/*
+ * Read packet_type and destination IPV4 addresses from 4 mbufs.
+ */
+static inline void
+processx4_step1(struct rte_mbuf *pkt[FWDSTEP],
+		int32x4_t *dip,
+		uint32_t *ipv4_flag)
+{
+	struct rte_ipv4_hdr *ipv4_hdr;
+	struct rte_ether_hdr *eth_hdr;
+	int32_t dst[FWDSTEP];
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[0], struct rte_ether_hdr *);
+	ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+	dst[0] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] = pkt[0]->packet_type & RTE_PTYPE_L3_IPV4;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[1], struct rte_ether_hdr *);
+	ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+	dst[1] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[1]->packet_type;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[2], struct rte_ether_hdr *);
+	ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+	dst[2] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[2]->packet_type;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[3], struct rte_ether_hdr *);
+	ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+	dst[3] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[3]->packet_type;
+
+	dip[0] = vld1q_s32(dst);
+}
+
+/*
+ * Lookup into LPM for destination port.
+ * If lookup fails, use incoming port (portid) as destination port.
+ */
+static inline void
+processx4_step2(const int socketid,
+		int32x4_t dip,
+		uint32_t ipv4_flag,
+		uint16_t portid,
+		struct rte_mbuf *pkt[FWDSTEP],
+		uint16_t dprt[FWDSTEP])
+{
+	rte_xmm_t dst;
+
+	dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
+
+	/* if all 4 packets are IPV4. */
+	if (likely(ipv4_flag)) {
+		rte_lpm_lookupx4(ipv4_l3fwd_lpm_lookup_struct[0], dip, dst.u32,
+			portid);
+		/* get rid of unused upper 16 bit for each dport. */
+		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
+	} else {
+		dst.x = dip;
+		dprt[0] = lpm_get_dst_port_with_ipv4(socketid, pkt[0],
+						     dst.u32[0], portid);
+		dprt[1] = lpm_get_dst_port_with_ipv4(socketid, pkt[1],
+						     dst.u32[1], portid);
+		dprt[2] = lpm_get_dst_port_with_ipv4(socketid, pkt[2],
+						     dst.u32[2], portid);
+		dprt[3] = lpm_get_dst_port_with_ipv4(socketid, pkt[3],
+						     dst.u32[3], portid);
+	}
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			uint16_t portid, struct fwd_stream *fs,
+			const int socketid)
+{
+	int32_t i = 0, j = 0;
+	uint16_t dst_port[MAX_PKT_BURST];
+	int32x4_t dip;
+	uint32_t ipv4_flag;
+	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	const int32_t m = nb_rx % FWDSTEP;
+
+
+	if (k) {
+		for (i = 0; i < FWDSTEP; i++) {
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i],
+						struct rte_ether_hdr *) + 1);
+		}
+
+		for (j = 0; j != k - FWDSTEP; j += FWDSTEP) {
+			for (i = 0; i < FWDSTEP; i++) {
+				rte_prefetch0(rte_pktmbuf_mtod(
+						pkts_burst[j + i + FWDSTEP],
+						struct rte_ether_hdr *) + 1);
+			}
+
+			processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
+			processx4_step2(socketid, dip, ipv4_flag, portid,
+					&pkts_burst[j], &dst_port[j]);
+		}
+
+		processx4_step1(&pkts_burst[j], &dip, &ipv4_flag);
+		processx4_step2(socketid, dip, ipv4_flag, portid, &pkts_burst[j],
+				&dst_port[j]);
+
+		j += FWDSTEP;
+	}
+
+	if (m) {
+		/* Prefetch last up to 3 packets one by one */
+		switch (m) {
+		case 3:
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+						struct rte_ether_hdr *) + 1);
+			j++;
+			/* fallthrough */
+		case 2:
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+						struct rte_ether_hdr *) + 1);
+			j++;
+			/* fallthrough */
+		case 1:
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+						struct rte_ether_hdr *) + 1);
+			j++;
+		}
+
+		j -= m;
+		/* Classify last up to 3 packets one by one */
+		switch (m) {
+		case 3:
+			dst_port[j] = lpm_get_dst_port(socketid, pkts_burst[j],
+						       portid);
+			j++;
+			/* fallthrough */
+		case 2:
+			dst_port[j] = lpm_get_dst_port(socketid, pkts_burst[j],
+						       portid);
+			j++;
+			/* fallthrough */
+		case 1:
+			dst_port[j] = lpm_get_dst_port(socketid, pkts_burst[j],
+						       portid);
+		}
+	}
+
+	send_packets_multi(fs, pkts_burst, dst_port, nb_rx);
+}
+
+#endif /* __L3FWD_LPM_NEON_H__ */
diff --git a/app/test-pmd/l3fwd_neon.h b/app/test-pmd/l3fwd_neon.h
new file mode 100644
index 000000000..72091e542
--- /dev/null
+++ b/app/test-pmd/l3fwd_neon.h
@@ -0,0 +1,234 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2016-2018 Intel Corporation.
+ * Copyright(c) 2017-2018 Linaro Limited.
+ */
+
+#ifndef _L3FWD_NEON_H_
+#define _L3FWD_NEON_H_
+
+#include "l3fwd.h"
+#include "l3fwd_common.h"
+
+/*
+ * Update source and destination MAC addresses in the ethernet header.
+ * Perform RFC1812 checks and updates for IPV4 packets.
+ */
+static inline void
+processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
+{
+	uint32x4_t te[FWDSTEP];
+	uint32x4_t ve[FWDSTEP];
+	uint32_t *p[FWDSTEP];
+
+	p[0] = rte_pktmbuf_mtod(pkt[0], uint32_t *);
+	p[1] = rte_pktmbuf_mtod(pkt[1], uint32_t *);
+	p[2] = rte_pktmbuf_mtod(pkt[2], uint32_t *);
+	p[3] = rte_pktmbuf_mtod(pkt[3], uint32_t *);
+
+	ve[0] = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+	te[0] = vld1q_u32(p[0]);
+
+	ve[1] = vreinterpretq_u32_s32(val_eth[dst_port[1]]);
+	te[1] = vld1q_u32(p[1]);
+
+	ve[2] = vreinterpretq_u32_s32(val_eth[dst_port[2]]);
+	te[2] = vld1q_u32(p[2]);
+
+	ve[3] = vreinterpretq_u32_s32(val_eth[dst_port[3]]);
+	te[3] = vld1q_u32(p[3]);
+
+	/* Update last 4 bytes */
+	ve[0] = vsetq_lane_u32(vgetq_lane_u32(te[0], 3), ve[0], 3);
+	ve[1] = vsetq_lane_u32(vgetq_lane_u32(te[1], 3), ve[1], 3);
+	ve[2] = vsetq_lane_u32(vgetq_lane_u32(te[2], 3), ve[2], 3);
+	ve[3] = vsetq_lane_u32(vgetq_lane_u32(te[3], 3), ve[3], 3);
+
+	vst1q_u32(p[0], ve[0]);
+	vst1q_u32(p[1], ve[1]);
+	vst1q_u32(p[2], ve[2]);
+	vst1q_u32(p[3], ve[3]);
+
+	rfc1812_process((struct rte_ipv4_hdr *)
+			((struct rte_ether_hdr *)p[0] + 1),
+			&dst_port[0], pkt[0]->packet_type);
+	rfc1812_process((struct rte_ipv4_hdr *)
+			((struct rte_ether_hdr *)p[1] + 1),
+			&dst_port[1], pkt[1]->packet_type);
+	rfc1812_process((struct rte_ipv4_hdr *)
+			((struct rte_ether_hdr *)p[2] + 1),
+			&dst_port[2], pkt[2]->packet_type);
+	rfc1812_process((struct rte_ipv4_hdr *)
+			((struct rte_ether_hdr *)p[3] + 1),
+			&dst_port[3], pkt[3]->packet_type);
+}
+
+/*
+ * Group consecutive packets with the same destination port in bursts of 4.
+ * Suppose we have array of destionation ports:
+ * dst_port[] = {a, b, c, d,, e, ... }
+ * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
+ * We doing 4 comparisons at once and the result is 4 bit mask.
+ * This mask is used as an index into prebuild array of pnum values.
+ */
+static inline uint16_t *
+port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
+	     uint16x8_t dp2)
+{
+	union {
+		uint16_t u16[FWDSTEP + 1];
+		uint64_t u64;
+	} *pnum = (void *)pn;
+
+	int32_t v;
+	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
+
+	dp1 = vceqq_u16(dp1, dp2);
+	dp1 = vandq_u16(dp1, mask);
+	v = vaddvq_u16(dp1);
+
+	/* update last port counter. */
+	lp[0] += gptbl[v].lpv;
+	rte_compiler_barrier();
+
+	/* if dest port value has changed. */
+	if (v != GRPMSK) {
+		pnum->u64 = gptbl[v].pnum;
+		pnum->u16[FWDSTEP] = 1;
+		lp = pnum->u16 + gptbl[v].idx;
+	}
+
+	return lp;
+}
+
+/**
+ * Process one packet:
+ * Update source and destination MAC addresses in the ethernet header.
+ * Perform RFC1812 checks and updates for IPV4 packets.
+ */
+static inline void
+process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
+{
+	struct rte_ether_hdr *eth_hdr;
+	uint32x4_t te, ve;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+
+	te = vld1q_u32((uint32_t *)eth_hdr);
+	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+
+
+	rfc1812_process((struct rte_ipv4_hdr *)(eth_hdr + 1), dst_port,
+			pkt->packet_type);
+
+	ve = vcopyq_laneq_u32(ve, 3, te, 3);
+	vst1q_u32((uint32_t *)eth_hdr, ve);
+}
+
+/**
+ * Send packets burst from pkts_burst to the ports in dst_port array
+ */
+static __rte_always_inline void
+send_packets_multi(struct fwd_stream *fs, struct rte_mbuf **pkts_burst,
+		uint16_t dst_port[MAX_PKT_BURST], int nb_rx)
+{
+	int32_t k;
+	int j = 0;
+	uint16_t dlp;
+	uint16_t *lp;
+	uint16_t pnum[MAX_PKT_BURST + 1];
+
+	/*
+	 * Finish packet processing and group consecutive
+	 * packets with the same destination port.
+	 */
+	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	if (k != 0) {
+		uint16x8_t dp1, dp2;
+
+		lp = pnum;
+		lp[0] = 1;
+
+		processx4_step3(pkts_burst, dst_port);
+
+		/* dp1: <d[0], d[1], d[2], d[3], ... > */
+		dp1 = vld1q_u16(dst_port);
+
+		for (j = FWDSTEP; j != k; j += FWDSTEP) {
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
+			/*
+			 * dp2:
+			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
+			 */
+			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
+			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+			/*
+			 * dp1:
+			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
+			 */
+			dp1 = vextq_u16(dp2, dp1, FWDSTEP - 1);
+		}
+
+		/*
+		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
+		 */
+		dp2 = vextq_u16(dp1, dp1, 1);
+		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
+		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+		/*
+		 * remove values added by the last repeated
+		 * dst port.
+		 */
+		lp[0]--;
+		dlp = dst_port[j - 1];
+	} else {
+		/* set dlp and lp to the never used values. */
+		dlp = BAD_PORT - 1;
+		lp = pnum + MAX_PKT_BURST;
+	}
+
+	/* Process up to last 3 packets one by one. */
+	switch (nb_rx % FWDSTEP) {
+	case 3:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+		/* fallthrough */
+	case 2:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+		/* fallthrough */
+	case 1:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+	}
+
+	/*
+	 * Send packets out, through destination port.
+	 * Consecutive packets with the same destination port
+	 * are already grouped together.
+	 * If destination port for the packet equals BAD_PORT,
+	 * then free the packet without sending it out.
+	 */
+	for (j = 0; j < nb_rx; j += k) {
+
+		int32_t m;
+		uint16_t pn;
+
+		pn = dst_port[j];
+		k = pnum[j];
+
+		if (likely(pn != BAD_PORT))
+			send_packetsx4(fs, pn, pkts_burst + j, k);
+		else
+			for (m = j; m != j + k; m++)
+				rte_pktmbuf_free(pkts_burst[m]);
+
+	}
+}
+
+#endif /* _L3FWD_NEON_H_ */
diff --git a/app/test-pmd/meson.build b/app/test-pmd/meson.build
index 98f3289bd..933738e62 100644
--- a/app/test-pmd/meson.build
+++ b/app/test-pmd/meson.build
@@ -16,6 +16,7 @@  sources = files(
         'icmpecho.c',
         'ieee1588fwd.c',
         'iofwd.c',
+	'l3fwd.c',
         'macfwd.c',
         'macswap.c',
         'noisy_vnf.c',
@@ -26,7 +27,7 @@  sources = files(
         'util.c',
 )
 
-deps += ['ethdev', 'gro', 'gso', 'cmdline', 'metrics', 'meter', 'bus_pci']
+deps += ['ethdev', 'gro', 'gso', 'cmdline', 'metrics', 'meter', 'bus_pci', 'lpm']
 if dpdk_conf.has('RTE_LIB_BITRATESTATS')
     deps += 'bitratestats'
 endif
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index d4be23f8f..d9c614dd6 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -172,6 +172,7 @@  streamid_t nb_fwd_streams;       /**< Is equal to (nb_ports * nb_rxq). */
 struct fwd_engine * fwd_engines[] = {
 	&io_fwd_engine,
 	&mac_fwd_engine,
+	&l3_fwd_engine,
 	&mac_swap_engine,
 	&flow_gen_engine,
 	&rx_only_engine,
@@ -2116,6 +2117,7 @@  launch_packet_forwarding(lcore_function_t *pkt_fwd_on_lcore)
 	int diag;
 
 	port_fwd_begin = cur_fwd_config.fwd_eng->port_fwd_begin;
+
 	if (port_fwd_begin != NULL) {
 		for (i = 0; i < cur_fwd_config.nb_fwd_ports; i++)
 			(*port_fwd_begin)(fwd_ports_ids[i]);
@@ -2132,7 +2134,6 @@  launch_packet_forwarding(lcore_function_t *pkt_fwd_on_lcore)
 		}
 	}
 }
-
 /*
  * Launch packet forwarding configuration.
  */
@@ -2212,6 +2213,7 @@  start_packet_forwarding(int with_tx_first)
 				(*port_fwd_end)(fwd_ports_ids[i]);
 		}
 	}
+
 	launch_packet_forwarding(start_pkt_forward_on_core);
 }
 
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 6ca872db8..80c014276 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -24,6 +24,9 @@ 
 #define RTE_PORT_CLOSED         (uint16_t)2
 #define RTE_PORT_HANDLING       (uint16_t)3
 
+#define NB_SOCKETS	8
+#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us in l3 mode*/
+
 /*
  * It is used to allocate the memory for hash key.
  * The hash key size is NIC dependent.
@@ -115,6 +118,12 @@  extern const struct rss_type_info rss_type_table[];
  */
 extern char dynf_names[64][RTE_MBUF_DYN_NAMESIZE];
 
+/*Used for buffering tx packets to send in a burst in l3 fwd mode */
+struct mbuf_table {
+	uint16_t len;
+	struct rte_mbuf *m_table[MAX_PKT_BURST];
+};
+
 /**
  * The data structure associated with a forwarding stream between a receive
  * port/queue and a transmit port/queue.
@@ -143,6 +152,13 @@  struct fwd_stream {
 	uint64_t     core_cycles; /**< used for RX and TX processing */
 	struct pkt_burst_stats rx_burst_stats;
 	struct pkt_burst_stats tx_burst_stats;
+
+	/* l3 fwd mode data */
+	struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
+	/**< Table for buffering packets before sending in a burst in l3 mode */
+	uint64_t prev_tsc; /**< timestamp of previous iteration of fwding loop */
+	uint64_t cur_tsc; /**< timestamp of current iteration of fwding loop */
+	uint64_t drain_tsc; /**< timeout for draining tx queue */
 };
 
 /**
@@ -286,6 +302,7 @@  extern uint32_t burst_tx_retry_num;
 
 extern struct fwd_engine io_fwd_engine;
 extern struct fwd_engine mac_fwd_engine;
+extern struct fwd_engine l3_fwd_engine;
 extern struct fwd_engine mac_swap_engine;
 extern struct fwd_engine flow_gen_engine;
 extern struct fwd_engine rx_only_engine;
@@ -1014,6 +1031,9 @@  void add_tx_dynf_callback(portid_t portid);
 void remove_tx_dynf_callback(portid_t portid);
 int update_jumbo_frame_offload(portid_t portid);
 
+/*Functions for l3 forwarding*/
+void setup_lpm(const int socketid);
+
 /*
  * Work-around of a compilation error with ICC on invocations of the
  * rte_be_to_cpu_16() function.