[v2,2/2] examples/ipsec-secgw: add support of NEON with poll mode

Message ID 20220617074241.3260496-2-rbhansali@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: akhil goyal
Series [v2,1/2] examples/l3fwd: common packet group functionality

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation success Compilation OK
ci/intel-Testing success Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-aarch64-unit-testing success Testing PASS
ci/iol-aarch64-compile-testing success Testing PASS
ci/iol-x86_64-unit-testing success Testing PASS
ci/github-robot: build success github build: passed
ci/iol-x86_64-compile-testing success Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-abi-testing success Testing PASS

Commit Message

Rahul Bhansali June 17, 2022, 7:42 a.m. UTC
  This adds support for NEON-based LPM lookup along with
multi-packet processing for burst send in packet routing.

Performance impact:
On cn10k, with poll mode inline protocol, outbound performance
increased by up to ~8% and inbound performance increased by
up to ~6%.

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
Changes in v2: Removed the NEON packet grouping function and used
the common one.

 examples/ipsec-secgw/Makefile         |   5 +-
 examples/ipsec-secgw/ipsec-secgw.c    |  25 ++
 examples/ipsec-secgw/ipsec_lpm_neon.h | 213 +++++++++++++++++
 examples/ipsec-secgw/ipsec_neon.h     | 321 ++++++++++++++++++++++++++
 examples/ipsec-secgw/ipsec_worker.c   |   9 +
 5 files changed, 571 insertions(+), 2 deletions(-)
 create mode 100644 examples/ipsec-secgw/ipsec_lpm_neon.h
 create mode 100644 examples/ipsec-secgw/ipsec_neon.h

--
2.25.1
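
The NEON fast path is built around DPDK's four-wide LPM lookup, rte_lpm_lookupx4(). As a point of reference, here is a minimal sketch of that building block, assuming an already-populated rte_lpm table; the helper name lookup4() and the UINT16_MAX "bad port" default are illustrative, not taken from the patch:

#include <stdint.h>
#include <arm_neon.h>
#include <rte_lpm.h>

/* Illustrative only: resolve four host-byte-order IPv4 destinations with a
 * single vector LPM lookup; misses come back as the default value. */
static inline void
lookup4(const struct rte_lpm *lpm, const int32_t ip[4], uint16_t dport[4])
{
	uint32_t hop[4];
	int i;

	rte_lpm_lookupx4(lpm, vld1q_s32(ip), hop, UINT16_MAX);

	for (i = 0; i < 4; i++)
		dport[i] = (uint16_t)hop[i];
}

The patch's processx4_step2() follows the same shape: it first byte-swaps the addresses with vrev32q_u8() (they are read in network order), uses rte_lpm_lookupx4() when none of the four packets carries an inline security offload, and otherwise falls back to per-packet lookups.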
  

Comments

Rahul Bhansali June 17, 2022, 7:51 a.m. UTC | #1
CC: Konstantin Ananyev

> -----Original Message-----
> From: Rahul Bhansali <rbhansali@marvell.com>
> Sent: Friday, June 17, 2022 1:13 PM
> To: dev@dpdk.org; Radu Nicolau <radu.nicolau@intel.com>; Akhil Goyal
> <gakhil@marvell.com>; Ruifeng Wang <ruifeng.wang@arm.com>
> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Rahul Bhansali
> <rbhansali@marvell.com>
> Subject: [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with poll
> mode
> 
> This adds the support of NEON based lpm lookup along with multi packet
> processing for burst send in packets routing.
> 
> Performance impact:
> On cn10k, with poll mode inline protocol, outbound performance increased by
> up to ~8% and inbound performance increased by up to ~6%.
> 
> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> ---
> Changes in v2: Removed Neon packet grouping function and used the common
> one.
> 
>  examples/ipsec-secgw/Makefile         |   5 +-
>  examples/ipsec-secgw/ipsec-secgw.c    |  25 ++
>  examples/ipsec-secgw/ipsec_lpm_neon.h | 213 +++++++++++++++++
>  examples/ipsec-secgw/ipsec_neon.h     | 321 ++++++++++++++++++++++++++
>  examples/ipsec-secgw/ipsec_worker.c   |   9 +
>  5 files changed, 571 insertions(+), 2 deletions(-)
>  create mode 100644 examples/ipsec-secgw/ipsec_lpm_neon.h
>  create mode 100644 examples/ipsec-secgw/ipsec_neon.h
> 
--snip--
  
Akhil Goyal June 21, 2022, 12:55 p.m. UTC | #2
> This adds the support of NEON based lpm lookup along with
> multi packet processing for burst send in packets routing.
> 
> Performance impact:
> On cn10k, with poll mode inline protocol, outbound performance
> increased by up to ~8% and inbound performance increased by
> up to ~6%.
> 
> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> ---
Acked-by: Akhil Goyal <gakhil@marvell.com>
  
Fan Zhang June 23, 2022, 8:46 a.m. UTC | #3
Hi Rahul

> -----Original Message-----
> From: Rahul Bhansali <rbhansali@marvell.com>
> Sent: Friday, June 17, 2022 8:43 AM
> To: dev@dpdk.org; Nicolau, Radu <radu.nicolau@intel.com>; Akhil Goyal
> <gakhil@marvell.com>; Ruifeng Wang <ruifeng.wang@arm.com>
> Cc: jerinj@marvell.com; Rahul Bhansali <rbhansali@marvell.com>
> Subject: [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with poll
> mode
> 
> This adds the support of NEON based lpm lookup along with
> multi packet processing for burst send in packets routing.
> 
> Performance impact:
> On cn10k, with poll mode inline protocol, outbound performance
> increased by up to ~8% and inbound performance increased by up to ~6%.
> upto ~6%.
> 
> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> ---
--snip--
> 
>  static inline void
> @@ -1403,6 +1420,8 @@ add_dst_ethaddr(uint16_t port, const struct
> rte_ether_addr *addr)
>  		return -EINVAL;
> 

Fan: I failed to understand why we need to overwrite the address to do an address
copy here. Was it a bug?

>  	ethaddr_tbl[port].dst = ETHADDR_TO_UINT64(addr);
> +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[port].dst,
> +			    (struct rte_ether_addr *)(val_eth + port));
>  	return 0;
>  }
> 
> @@ -1865,6 +1884,12 @@ port_init(uint16_t portid, uint64_t req_rx_offloads,
> uint64_t req_tx_offloads)
>  			portid, rte_strerror(-ret));
> 
>  	ethaddr_tbl[portid].src = ETHADDR_TO_UINT64(&ethaddr);

Fan: Same here 

> +
> +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].dst,
> +			    (struct rte_ether_addr *)(val_eth + portid));
> +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].src,
> +			    (struct rte_ether_addr *)(val_eth + portid) + 1);
> +
>  	print_ethaddr("Address: ", &ethaddr);
>  	printf("\n");
  
Rahul Bhansali June 23, 2022, 9:37 a.m. UTC | #4
> -----Original Message-----
> From: Zhang, Roy Fan <roy.fan.zhang@intel.com>
> Sent: Thursday, June 23, 2022 2:17 PM
> To: Rahul Bhansali <rbhansali@marvell.com>; dev@dpdk.org; Nicolau, Radu
> <radu.nicolau@intel.com>; Akhil Goyal <gakhil@marvell.com>; Ruifeng Wang
> <ruifeng.wang@arm.com>
> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Subject: [EXT] RE: [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON
> with poll mode
> 
> Hi Rahul
> 
> > -----Original Message-----
> > From: Rahul Bhansali <rbhansali@marvell.com>
> > Sent: Friday, June 17, 2022 8:43 AM
> > To: dev@dpdk.org; Nicolau, Radu <radu.nicolau@intel.com>; Akhil Goyal
> > <gakhil@marvell.com>; Ruifeng Wang <ruifeng.wang@arm.com>
> > Cc: jerinj@marvell.com; Rahul Bhansali <rbhansali@marvell.com>
> > Subject: [PATCH v2 2/2] examples/ipsec-secgw: add support of NEON with
> > poll mode
> >
> > This adds the support of NEON based lpm lookup along with multi packet
> > processing for burst send in packets routing.
> >
> > Performance impact:
> > On cn10k, with poll mode inline protocol, outbound performance
> > increased by up to ~8% and inbound performance increased by up to ~6%.
> >
> > Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> > ---
> --snip--
> >
> >  static inline void
> > @@ -1403,6 +1420,8 @@ add_dst_ethaddr(uint16_t port, const struct
> > rte_ether_addr *addr)
> >  		return -EINVAL;
> >
> 
> Fan: I failed to understand why we need to overwrite the address to do an
> address copy here. Was it a bug?

It is not overwriting the ethaddr_tbl[port].dst address; instead, it copies dst into the xmm_t val_eth entry, in the format required by the NEON-based packet processing path for routing.
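
To make the layout concrete, here is a small standalone sketch (not part of the patch; the MAC values are made up) of what those two copies leave in one 16-byte val_eth[] slot:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <rte_ether.h>
#include <rte_vect.h>

int main(void)
{
	/* One val_eth[] slot, as declared per port in the patch. */
	xmm_t slot;
	/* Made-up addresses, for illustration only. */
	struct rte_ether_addr dst = { .addr_bytes = {0x02, 0x00, 0x00, 0x00, 0x00, 0x01} };
	struct rte_ether_addr src = { .addr_bytes = {0x02, 0x00, 0x00, 0x00, 0x00, 0x02} };
	uint8_t *b = (uint8_t *)&slot;
	int i;

	memset(&slot, 0, sizeof(slot));

	/* Same two copies as add_dst_ethaddr()/port_init() in the patch:
	 * dst MAC lands in bytes 0-5, src MAC in bytes 6-11. */
	rte_ether_addr_copy(&dst, (struct rte_ether_addr *)&slot);
	rte_ether_addr_copy(&src, (struct rte_ether_addr *)&slot + 1);

	/* Bytes 12-15 are don't-care: processx4_step3()/process_packet()
	 * keep the packet's own ether_type (and the two bytes after it)
	 * by copying lane 3 of the original header before the store. */
	for (i = 0; i < 16; i++)
		printf("%02x%c", b[i], i == 15 ? '\n' : ' ');

	return 0;
}

processx4_step3()/process_packet() then load the slot for the destination port, splice the packet's own last four header bytes back in (the lane-3 copy, preserving the ether_type), and write the whole L2 header with one 16-byte store.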

> 
> >  	ethaddr_tbl[port].dst = ETHADDR_TO_UINT64(addr);
> > +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[port].dst,
> > +			    (struct rte_ether_addr *)(val_eth + port));
> >  	return 0;
> >  }
> >
> > @@ -1865,6 +1884,12 @@ port_init(uint16_t portid, uint64_t
> > req_rx_offloads, uint64_t req_tx_offloads)
> >  			portid, rte_strerror(-ret));
> >
> >  	ethaddr_tbl[portid].src = ETHADDR_TO_UINT64(&ethaddr);
> 
> Fan: Same here
> 
> > +
> > +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].dst,
> > +			    (struct rte_ether_addr *)(val_eth + portid));
> > +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].src,
> > +			    (struct rte_ether_addr *)(val_eth + portid) + 1);
> > +
> >  	print_ethaddr("Address: ", &ethaddr);
> >  	printf("\n");
  

Patch

diff --git a/examples/ipsec-secgw/Makefile b/examples/ipsec-secgw/Makefile
index 89af54bd37..ffe232774d 100644
--- a/examples/ipsec-secgw/Makefile
+++ b/examples/ipsec-secgw/Makefile
@@ -36,6 +36,7 @@  shared: build/$(APP)-shared
 static: build/$(APP)-static
 	ln -sf $(APP)-static build/$(APP)

+INCLUDES =-I../common
 PC_FILE := $(shell $(PKGCONF) --path libdpdk 2>/dev/null)
 CFLAGS += -O3 $(shell $(PKGCONF) --cflags libdpdk)
 LDFLAGS_SHARED = $(shell $(PKGCONF) --libs libdpdk)
@@ -53,10 +54,10 @@  CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += -Wno-address-of-packed-member

 build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
-	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
+	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)

 build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
-	$(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
+	$(CC) $(CFLAGS) $(SRCS-y) $(INCLUDES) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)

 build:
 	@mkdir -p $@
diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
index 4d8a4a71b8..b650668305 100644
--- a/examples/ipsec-secgw/ipsec-secgw.c
+++ b/examples/ipsec-secgw/ipsec-secgw.c
@@ -56,6 +56,10 @@ 
 #include "parser.h"
 #include "sad.h"

+#if defined(__ARM_NEON)
+#include "ipsec_lpm_neon.h"
+#endif
+
 volatile bool force_quit;

 #define MAX_JUMBO_PKT_LEN  9600
@@ -100,6 +104,12 @@  struct ethaddr_info ethaddr_tbl[RTE_MAX_ETHPORTS] = {
 	{ 0, ETHADDR(0x00, 0x16, 0x3e, 0x49, 0x9e, 0xdd) }
 };

+/*
+ * To hold ethernet header per port, which will be applied
+ * to outgoing packets.
+ */
+xmm_t val_eth[RTE_MAX_ETHPORTS];
+
 struct flow_info flow_info_tbl[RTE_MAX_ETHPORTS];

 #define CMD_LINE_OPT_CONFIG		"config"
@@ -568,9 +578,16 @@  process_pkts(struct lcore_conf *qconf, struct rte_mbuf **pkts,
 			process_pkts_outbound(&qconf->outbound, &traffic);
 	}

+#if defined __ARM_NEON
+	/* Neon optimized packet routing */
+	route4_pkts_neon(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
+			 qconf->outbound.ipv4_offloads, true);
+	route6_pkts_neon(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
+#else
 	route4_pkts(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
 		    qconf->outbound.ipv4_offloads, true);
 	route6_pkts(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
+#endif
 }

 static inline void
@@ -1403,6 +1420,8 @@  add_dst_ethaddr(uint16_t port, const struct rte_ether_addr *addr)
 		return -EINVAL;

 	ethaddr_tbl[port].dst = ETHADDR_TO_UINT64(addr);
+	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[port].dst,
+			    (struct rte_ether_addr *)(val_eth + port));
 	return 0;
 }

@@ -1865,6 +1884,12 @@  port_init(uint16_t portid, uint64_t req_rx_offloads, uint64_t req_tx_offloads)
 			portid, rte_strerror(-ret));

 	ethaddr_tbl[portid].src = ETHADDR_TO_UINT64(&ethaddr);
+
+	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].dst,
+			    (struct rte_ether_addr *)(val_eth + portid));
+	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].src,
+			    (struct rte_ether_addr *)(val_eth + portid) + 1);
+
 	print_ethaddr("Address: ", &ethaddr);
 	printf("\n");

diff --git a/examples/ipsec-secgw/ipsec_lpm_neon.h b/examples/ipsec-secgw/ipsec_lpm_neon.h
new file mode 100644
index 0000000000..959a5a8666
--- /dev/null
+++ b/examples/ipsec-secgw/ipsec_lpm_neon.h
@@ -0,0 +1,213 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef __IPSEC_LPM_NEON_H__
+#define __IPSEC_LPM_NEON_H__
+
+#include <arm_neon.h>
+#include "ipsec_neon.h"
+
+/*
+ * Append ethernet header and read destination IPV4 addresses from 4 mbufs.
+ */
+static inline void
+processx4_step1(struct rte_mbuf *pkt[FWDSTEP], int32x4_t *dip,
+		uint64_t *inline_flag)
+{
+	struct rte_ipv4_hdr *ipv4_hdr;
+	struct rte_ether_hdr *eth_hdr;
+	int32_t dst[FWDSTEP];
+	int i;
+
+	for (i = 0; i < FWDSTEP; i++) {
+		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt[i],
+							RTE_ETHER_HDR_LEN);
+		pkt[i]->ol_flags |= RTE_MBUF_F_TX_IPV4;
+		pkt[i]->l2_len = RTE_ETHER_HDR_LEN;
+
+		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+
+		/* Fetch destination IPv4 address */
+		dst[i] = ipv4_hdr->dst_addr;
+		*inline_flag |= pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD;
+	}
+
+	dip[0] = vld1q_s32(dst);
+}
+
+/*
+ * Lookup into LPM for destination port.
+ */
+static inline void
+processx4_step2(struct rt_ctx *rt_ctx, int32x4_t dip, uint64_t inline_flag,
+		struct rte_mbuf *pkt[FWDSTEP], uint16_t dprt[FWDSTEP])
+{
+	uint32_t next_hop;
+	rte_xmm_t dst;
+	uint8_t i;
+
+	dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
+
+	/* If all 4 packets are non-inline */
+	if (!inline_flag) {
+		rte_lpm_lookupx4((struct rte_lpm *)rt_ctx, dip, dst.u32,
+				 BAD_PORT);
+		/* get rid of unused upper 16 bit for each dport. */
+		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
+		return;
+	}
+
+	/* Inline and non-inline packets */
+	dst.x = dip;
+	for (i = 0; i < FWDSTEP; i++) {
+		if (pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+			next_hop = get_hop_for_offload_pkt(pkt[i], 0);
+			dprt[i] = (uint16_t) (((next_hop &
+						RTE_LPM_LOOKUP_SUCCESS) != 0)
+						? next_hop : BAD_PORT);
+
+		} else {
+			dprt[i] = (uint16_t) ((rte_lpm_lookup(
+						(struct rte_lpm *)rt_ctx,
+						 dst.u32[i], &next_hop) == 0)
+						? next_hop : BAD_PORT);
+		}
+	}
+}
+
+/*
+ * Process single packets for destination port.
+ */
+static inline void
+process_single_pkt(struct rt_ctx *rt_ctx, struct rte_mbuf *pkt,
+		   uint16_t *dst_port)
+{
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv4_hdr *ipv4_hdr;
+	uint32_t next_hop;
+	uint32_t dst_ip;
+
+	eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
+							RTE_ETHER_HDR_LEN);
+	pkt->ol_flags |= RTE_MBUF_F_TX_IPV4;
+	pkt->l2_len = RTE_ETHER_HDR_LEN;
+
+	if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+		next_hop = get_hop_for_offload_pkt(pkt, 0);
+		*dst_port = (uint16_t) (((next_hop &
+					  RTE_LPM_LOOKUP_SUCCESS) != 0)
+					  ? next_hop : BAD_PORT);
+	} else {
+		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+		dst_ip = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
+		*dst_port = (uint16_t) ((rte_lpm_lookup(
+						(struct rte_lpm *)rt_ctx,
+						dst_ip, &next_hop) == 0)
+						? next_hop : BAD_PORT);
+	}
+}
+
+/*
+ * Buffer optimized handling of IPv6 packets.
+ */
+static inline void
+route6_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx)
+{
+	uint8_t dst_ip6[MAX_PKT_BURST][16];
+	int32_t dst_port[MAX_PKT_BURST];
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv6_hdr *ipv6_hdr;
+	int32_t hop[MAX_PKT_BURST];
+	struct rte_mbuf *pkt;
+	uint8_t lpm_pkts = 0;
+	int32_t i;
+
+	if (nb_rx == 0)
+		return;
+
+	/* Need to do an LPM lookup for non-inline packets. Inline packets will
+	 * have port ID in the SA
+	 */
+
+	for (i = 0; i < nb_rx; i++) {
+		pkt = pkts[i];
+		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
+							RTE_ETHER_HDR_LEN);
+		pkt->l2_len = RTE_ETHER_HDR_LEN;
+		pkt->ol_flags |= RTE_MBUF_F_TX_IPV6;
+
+		if (!(pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD)) {
+			/* Security offload not enabled. So an LPM lookup is
+			 * required to get the hop
+			 */
+			ipv6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1);
+			memcpy(&dst_ip6[lpm_pkts][0],
+					ipv6_hdr->dst_addr, 16);
+			lpm_pkts++;
+		}
+	}
+
+	rte_lpm6_lookup_bulk_func((struct rte_lpm6 *)rt_ctx, dst_ip6,
+				  hop, lpm_pkts);
+
+	lpm_pkts = 0;
+
+	for (i = 0; i < nb_rx; i++) {
+		pkt = pkts[i];
+		if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+			/* Read hop from the SA */
+			dst_port[i] = get_hop_for_offload_pkt(pkt, 1);
+		} else {
+			/* Need to use hop returned by lookup */
+			dst_port[i] = hop[lpm_pkts++];
+		}
+		if (dst_port[i] == -1)
+			dst_port[i] = BAD_PORT;
+	}
+
+	/* Send packets */
+	send_multi_pkts(pkts, (uint16_t *)dst_port, nb_rx, 0, 0, false);
+}
+
+/*
+ * Buffer optimized handling of IPv4 packets.
+ */
+static inline void
+route4_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx,
+		 uint64_t tx_offloads, bool ip_cksum)
+{
+	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	const int32_t m = nb_rx % FWDSTEP;
+	uint16_t dst_port[MAX_PKT_BURST];
+	uint64_t inline_flag = 0;
+	int32x4_t dip;
+	int32_t i;
+
+	if (nb_rx == 0)
+		return;
+
+	for (i = 0; i != k; i += FWDSTEP) {
+		processx4_step1(&pkts[i], &dip, &inline_flag);
+		processx4_step2(rt_ctx, dip, inline_flag, &pkts[i],
+				&dst_port[i]);
+	}
+
+	/* Classify last up to 3 packets one by one */
+	switch (m) {
+	case 3:
+		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+		i++;
+		/* fallthrough */
+	case 2:
+		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+		i++;
+		/* fallthrough */
+	case 1:
+		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+	}
+
+	send_multi_pkts(pkts, dst_port, nb_rx, tx_offloads, ip_cksum, true);
+}
+
+#endif /* __IPSEC_LPM_NEON_H__ */
diff --git a/examples/ipsec-secgw/ipsec_neon.h b/examples/ipsec-secgw/ipsec_neon.h
new file mode 100644
index 0000000000..0f72219ed0
--- /dev/null
+++ b/examples/ipsec-secgw/ipsec_neon.h
@@ -0,0 +1,321 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef _IPSEC_NEON_H_
+#define _IPSEC_NEON_H_
+
+#include "ipsec.h"
+#include "neon_common.h"
+
+#define MAX_TX_BURST	(MAX_PKT_BURST / 2)
+#define BAD_PORT	((uint16_t)-1)
+
+extern xmm_t val_eth[RTE_MAX_ETHPORTS];
+
+/*
+ * Update source and destination MAC addresses in the ethernet header.
+ */
+static inline void
+processx4_step3(struct rte_mbuf *pkts[FWDSTEP], uint16_t dst_port[FWDSTEP],
+		uint64_t tx_offloads, bool ip_cksum, uint8_t *l_pkt)
+{
+	uint32x4_t te[FWDSTEP];
+	uint32x4_t ve[FWDSTEP];
+	uint32_t *p[FWDSTEP];
+	struct rte_mbuf *pkt;
+	uint8_t i;
+
+	for (i = 0; i < FWDSTEP; i++) {
+		pkt = pkts[i];
+
+		/* Check if it is a large packet */
+		if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
+			*l_pkt |= 1;
+
+		p[i] = rte_pktmbuf_mtod(pkt, uint32_t *);
+		ve[i] = vreinterpretq_u32_s32(val_eth[dst_port[i]]);
+		te[i] = vld1q_u32(p[i]);
+
+		/* Update last 4 bytes */
+		ve[i] = vsetq_lane_u32(vgetq_lane_u32(te[i], 3), ve[i], 3);
+		vst1q_u32(p[i], ve[i]);
+
+		if (ip_cksum) {
+			struct rte_ipv4_hdr *ip;
+
+			pkt->ol_flags |= tx_offloads;
+
+			ip = (struct rte_ipv4_hdr *)
+				(p[i] + RTE_ETHER_HDR_LEN + 1);
+			ip->hdr_checksum = 0;
+
+			/* calculate IPv4 cksum in SW */
+			if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
+				ip->hdr_checksum = rte_ipv4_cksum(ip);
+		}
+
+	}
+}
+
+/**
+ * Process single packet:
+ * Update source and destination MAC addresses in the ethernet header.
+ */
+static inline void
+process_packet(struct rte_mbuf *pkt, uint16_t *dst_port, uint64_t tx_offloads,
+	       bool ip_cksum, uint8_t *l_pkt)
+{
+	struct rte_ether_hdr *eth_hdr;
+	uint32x4_t te, ve;
+
+	/* Check if it is a large packet */
+	if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
+		*l_pkt |= 1;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+
+	te = vld1q_u32((uint32_t *)eth_hdr);
+	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+
+	ve = vcopyq_laneq_u32(ve, 3, te, 3);
+	vst1q_u32((uint32_t *)eth_hdr, ve);
+
+	if (ip_cksum) {
+		struct rte_ipv4_hdr *ip;
+
+		pkt->ol_flags |= tx_offloads;
+
+		ip = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+		ip->hdr_checksum = 0;
+
+		/* calculate IPv4 cksum in SW */
+		if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
+			ip->hdr_checksum = rte_ipv4_cksum(ip);
+	}
+}
+
+static inline void
+send_packets(struct rte_mbuf *m[], uint16_t port, uint32_t num, bool is_ipv4)
+{
+	uint8_t proto;
+	uint32_t i;
+
+	proto = is_ipv4 ? IPPROTO_IP : IPPROTO_IPV6;
+	for (i = 0; i < num; i++)
+		send_single_packet(m[i], port, proto);
+}
+
+static inline void
+send_packetsx4(struct rte_mbuf *m[], uint16_t port, uint32_t num)
+{
+	unsigned int lcoreid = rte_lcore_id();
+	struct lcore_conf *qconf;
+	uint32_t len, j, n;
+
+	qconf = &lcore_conf[lcoreid];
+
+	len = qconf->tx_mbufs[port].len;
+
+	/*
+	 * If TX buffer for that queue is empty, and we have enough packets,
+	 * then send them straightway.
+	 */
+	if (num >= MAX_TX_BURST && len == 0) {
+		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
+		core_stats_update_tx(n);
+		if (unlikely(n < num)) {
+			do {
+				rte_pktmbuf_free(m[n]);
+			} while (++n < num);
+		}
+		return;
+	}
+
+	/*
+	 * Put packets into TX buffer for that queue.
+	 */
+
+	n = len + num;
+	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
+
+	j = 0;
+	switch (n % FWDSTEP) {
+	while (j < n) {
+		case 0:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+			/* fallthrough */
+		case 3:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+			/* fallthrough */
+		case 2:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+			/* fallthrough */
+		case 1:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+		}
+	}
+
+	len += n;
+
+	/* enough pkts to be sent */
+	if (unlikely(len == MAX_PKT_BURST)) {
+
+		send_burst(qconf, MAX_PKT_BURST, port);
+
+		/* copy rest of the packets into the TX buffer. */
+		len = num - n;
+		if (len == 0)
+			goto exit;
+
+		j = 0;
+		switch (len % FWDSTEP) {
+		while (j < len) {
+			case 0:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+				/* fallthrough */
+			case 3:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+				/* fallthrough */
+			case 2:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+				/* fallthrough */
+			case 1:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+		}
+		}
+	}
+
+exit:
+	qconf->tx_mbufs[port].len = len;
+}
+
+/**
+ * Send packets burst to the ports in dst_port array
+ */
+static __rte_always_inline void
+send_multi_pkts(struct rte_mbuf **pkts, uint16_t dst_port[MAX_PKT_BURST],
+		int nb_rx, uint64_t tx_offloads, bool ip_cksum, bool is_ipv4)
+{
+	unsigned int lcoreid = rte_lcore_id();
+	uint16_t pnum[MAX_PKT_BURST + 1];
+	uint8_t l_pkt = 0;
+	uint16_t dlp, *lp;
+	int i = 0, k;
+
+	/*
+	 * Finish packet processing and group consecutive
+	 * packets with the same destination port.
+	 */
+	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+
+	if (k != 0) {
+		uint16x8_t dp1, dp2;
+
+		lp = pnum;
+		lp[0] = 1;
+
+		processx4_step3(pkts, dst_port, tx_offloads, ip_cksum, &l_pkt);
+
+		/* dp1: <d[0], d[1], d[2], d[3], ... > */
+		dp1 = vld1q_u16(dst_port);
+
+		for (i = FWDSTEP; i != k; i += FWDSTEP) {
+			processx4_step3(&pkts[i], &dst_port[i], tx_offloads,
+					ip_cksum, &l_pkt);
+
+			/*
+			 * dp2:
+			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
+			 */
+			dp2 = vld1q_u16(&dst_port[i - FWDSTEP + 1]);
+			lp  = neon_port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
+
+			/*
+			 * dp1:
+			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
+			 */
+			dp1 = vextq_u16(dp2, dp1, FWDSTEP - 1);
+		}
+
+		/*
+		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
+		 */
+		dp2 = vextq_u16(dp1, dp1, 1);
+		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
+		lp  = neon_port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
+
+		/*
+		 * remove values added by the last repeated
+		 * dst port.
+		 */
+		lp[0]--;
+		dlp = dst_port[i - 1];
+	} else {
+		/* set dlp and lp to the never used values. */
+		dlp = BAD_PORT - 1;
+		lp = pnum + MAX_PKT_BURST;
+	}
+
+	/* Process up to last 3 packets one by one. */
+	switch (nb_rx % FWDSTEP) {
+	case 3:
+		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+			       &l_pkt);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+		i++;
+		/* fallthrough */
+	case 2:
+		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+			       &l_pkt);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+		i++;
+		/* fallthrough */
+	case 1:
+		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+			       &l_pkt);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+	}
+
+	/*
+	 * Send packets out, through destination port.
+	 * Consecutive packets with the same destination port
+	 * are already grouped together.
+	 * If destination port for the packet equals BAD_PORT,
+	 * then free the packet without sending it out.
+	 */
+	for (i = 0; i < nb_rx; i += k) {
+
+		uint16_t pn;
+
+		pn = dst_port[i];
+		k = pnum[i];
+
+		if (likely(pn != BAD_PORT)) {
+			if (l_pkt)
+				/* Large packet is present, need to send
+				 * individual packets with fragment
+				 */
+				send_packets(pkts + i, pn, k, is_ipv4);
+			else
+				send_packetsx4(pkts + i, pn, k);
+
+		} else {
+			free_pkts(&pkts[i], k);
+			if (is_ipv4)
+				core_statistics[lcoreid].lpm4.miss++;
+			else
+				core_statistics[lcoreid].lpm6.miss++;
+		}
+	}
+}
+
+#endif /* _IPSEC_NEON_H_ */
diff --git a/examples/ipsec-secgw/ipsec_worker.c b/examples/ipsec-secgw/ipsec_worker.c
index e1d4e3d864..803157d8ee 100644
--- a/examples/ipsec-secgw/ipsec_worker.c
+++ b/examples/ipsec-secgw/ipsec_worker.c
@@ -12,6 +12,10 @@ 
 #include "ipsec-secgw.h"
 #include "ipsec_worker.h"

+#if defined(__ARM_NEON)
+#include "ipsec_lpm_neon.h"
+#endif
+
 struct port_drv_mode_data {
 	struct rte_security_session *sess;
 	struct rte_security_ctx *ctx;
@@ -1248,8 +1252,13 @@  ipsec_poll_mode_wrkr_inl_pr(void)
 				v6_num = ip6.num;
 			}

+#if defined __ARM_NEON
+			route4_pkts_neon(rt4_ctx, v4, v4_num, 0, false);
+			route6_pkts_neon(rt6_ctx, v6, v6_num);
+#else
 			route4_pkts(rt4_ctx, v4, v4_num, 0, false);
 			route6_pkts(rt6_ctx, v6, v6_num);
+#endif
 		}
 	}
 }