examples/ipsec-secgw: add support of NEON with poll mode

Message ID 20220524095717.3875284-1-rbhansali@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: akhil goyal
Series examples/ipsec-secgw: add support of NEON with poll mode

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation warning apply issues
ci/iol-testing warning apply patch failure

Commit Message

Rahul Bhansali May 24, 2022, 9:57 a.m. UTC
This adds support for NEON-based LPM lookup along with
multi-packet processing for burst send in packet routing.

Performance impact:
On cn10k, with poll mode inline protocol, outbound performance
increased by up to ~8% and inbound performance increased by
up to ~6%.

Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
---
 examples/ipsec-secgw/ipsec-secgw.c    |  25 ++
 examples/ipsec-secgw/ipsec_lpm_neon.h | 213 +++++++++++
 examples/ipsec-secgw/ipsec_neon.h     | 487 ++++++++++++++++++++++++++
 examples/ipsec-secgw/ipsec_worker.c   |   9 +
 4 files changed, 734 insertions(+)
 create mode 100644 examples/ipsec-secgw/ipsec_lpm_neon.h
 create mode 100644 examples/ipsec-secgw/ipsec_neon.h
  

Comments

Konstantin Ananyev May 24, 2022, 11 p.m. UTC | #1
24/05/2022 10:57, Rahul Bhansali wrote:
> This adds the support of NEON based lpm lookup along with
> multi packet processing for burst send in packets routing.
> 
> Performance impact:
> On cn10k, with poll mode inline protocol, outbound performance
> increased by upto ~8% and inbound performance increased by
> upto ~6%.


Interesting, a good bunch of the code looks like a dup from l3fwd:
grouping, processx4_step?, etc.
Would it be possible to move the duplicated code into some common place,
so it can be used by both examples?

> 
> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> ---
>   examples/ipsec-secgw/ipsec-secgw.c    |  25 ++
>   examples/ipsec-secgw/ipsec_lpm_neon.h | 213 +++++++++++
>   examples/ipsec-secgw/ipsec_neon.h     | 487 ++++++++++++++++++++++++++
>   examples/ipsec-secgw/ipsec_worker.c   |   9 +
>   4 files changed, 734 insertions(+)
>   create mode 100644 examples/ipsec-secgw/ipsec_lpm_neon.h
>   create mode 100644 examples/ipsec-secgw/ipsec_neon.h
> 
> diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
> index 25255e053c..038c4669f5 100644
> --- a/examples/ipsec-secgw/ipsec-secgw.c
> +++ b/examples/ipsec-secgw/ipsec-secgw.c
> @@ -56,6 +56,10 @@
>   #include "parser.h"
>   #include "sad.h"
>   
> +#if defined(__ARM_NEON)
> +#include "ipsec_lpm_neon.h"
> +#endif
> +
>   volatile bool force_quit;
>   
>   #define MAX_JUMBO_PKT_LEN  9600
> @@ -96,6 +100,12 @@ struct ethaddr_info ethaddr_tbl[RTE_MAX_ETHPORTS] = {
>   	{ 0, ETHADDR(0x00, 0x16, 0x3e, 0x49, 0x9e, 0xdd) }
>   };
>   
> +/*
> + * To hold ethernet header per port, which will be applied
> + * to outgoing packets.
> + */
> +xmm_t val_eth[RTE_MAX_ETHPORTS];
> +
>   struct flow_info flow_info_tbl[RTE_MAX_ETHPORTS];
>   
>   #define CMD_LINE_OPT_CONFIG		"config"
> @@ -561,9 +571,16 @@ process_pkts(struct lcore_conf *qconf, struct rte_mbuf **pkts,
>   			process_pkts_outbound(&qconf->outbound, &traffic);
>   	}
>   
> +#if defined __ARM_NEON
> +	/* Neon optimized packet routing */
> +	route4_pkts_neon(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
> +			 qconf->outbound.ipv4_offloads, true);
> +	route6_pkts_neon(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
> +#else
>   	route4_pkts(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
>   		    qconf->outbound.ipv4_offloads, true);
>   	route6_pkts(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
> +#endif
>   }
>   
>   static inline void
> @@ -1390,6 +1407,8 @@ add_dst_ethaddr(uint16_t port, const struct rte_ether_addr *addr)
>   		return -EINVAL;
>   
>   	ethaddr_tbl[port].dst = ETHADDR_TO_UINT64(addr);
> +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[port].dst,
> +			    (struct rte_ether_addr *)(val_eth + port));
>   	return 0;
>   }
>   
> @@ -1852,6 +1871,12 @@ port_init(uint16_t portid, uint64_t req_rx_offloads, uint64_t req_tx_offloads)
>   			portid, rte_strerror(-ret));
>   
>   	ethaddr_tbl[portid].src = ETHADDR_TO_UINT64(&ethaddr);
> +
> +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].dst,
> +			    (struct rte_ether_addr *)(val_eth + portid));
> +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].src,
> +			    (struct rte_ether_addr *)(val_eth + portid) + 1);
> +
>   	print_ethaddr("Address: ", &ethaddr);
>   	printf("\n");
>   
> diff --git a/examples/ipsec-secgw/ipsec_lpm_neon.h b/examples/ipsec-secgw/ipsec_lpm_neon.h
> new file mode 100644
> index 0000000000..959a5a8666
> --- /dev/null
> +++ b/examples/ipsec-secgw/ipsec_lpm_neon.h
> @@ -0,0 +1,213 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2022 Marvell.
> + */
> +
> +#ifndef __IPSEC_LPM_NEON_H__
> +#define __IPSEC_LPM_NEON_H__
> +
> +#include <arm_neon.h>
> +#include "ipsec_neon.h"
> +
> +/*
> + * Append ethernet header and read destination IPV4 addresses from 4 mbufs.
> + */
> +static inline void
> +processx4_step1(struct rte_mbuf *pkt[FWDSTEP], int32x4_t *dip,
> +		uint64_t *inline_flag)
> +{
> +	struct rte_ipv4_hdr *ipv4_hdr;
> +	struct rte_ether_hdr *eth_hdr;
> +	int32_t dst[FWDSTEP];
> +	int i;
> +
> +	for (i = 0; i < FWDSTEP; i++) {
> +		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt[i],
> +							RTE_ETHER_HDR_LEN);
> +		pkt[i]->ol_flags |= RTE_MBUF_F_TX_IPV4;
> +		pkt[i]->l2_len = RTE_ETHER_HDR_LEN;
> +
> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> +
> +		/* Fetch destination IPv4 address */
> +		dst[i] = ipv4_hdr->dst_addr;
> +		*inline_flag |= pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD;
> +	}
> +
> +	dip[0] = vld1q_s32(dst);
> +}
> +
> +/*
> + * Lookup into LPM for destination port.
> + */
> +static inline void
> +processx4_step2(struct rt_ctx *rt_ctx, int32x4_t dip, uint64_t inline_flag,
> +		struct rte_mbuf *pkt[FWDSTEP], uint16_t dprt[FWDSTEP])
> +{
> +	uint32_t next_hop;
> +	rte_xmm_t dst;
> +	uint8_t i;
> +
> +	dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
> +
> +	/* If all 4 packets are non-inline */
> +	if (!inline_flag) {
> +		rte_lpm_lookupx4((struct rte_lpm *)rt_ctx, dip, dst.u32,
> +				 BAD_PORT);
> +		/* get rid of unused upper 16 bit for each dport. */
> +		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
> +		return;
> +	}
> +
> +	/* Inline and non-inline packets */
> +	dst.x = dip;
> +	for (i = 0; i < FWDSTEP; i++) {
> +		if (pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
> +			next_hop = get_hop_for_offload_pkt(pkt[i], 0);
> +			dprt[i] = (uint16_t) (((next_hop &
> +						RTE_LPM_LOOKUP_SUCCESS) != 0)
> +						? next_hop : BAD_PORT);
> +
> +		} else {
> +			dprt[i] = (uint16_t) ((rte_lpm_lookup(
> +						(struct rte_lpm *)rt_ctx,
> +						 dst.u32[i], &next_hop) == 0)
> +						? next_hop : BAD_PORT);
> +		}
> +	}
> +}
> +
> +/*
> + * Process single packets for destination port.
> + */
> +static inline void
> +process_single_pkt(struct rt_ctx *rt_ctx, struct rte_mbuf *pkt,
> +		   uint16_t *dst_port)
> +{
> +	struct rte_ether_hdr *eth_hdr;
> +	struct rte_ipv4_hdr *ipv4_hdr;
> +	uint32_t next_hop;
> +	uint32_t dst_ip;
> +
> +	eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
> +							RTE_ETHER_HDR_LEN);
> +	pkt->ol_flags |= RTE_MBUF_F_TX_IPV4;
> +	pkt->l2_len = RTE_ETHER_HDR_LEN;
> +
> +	if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
> +		next_hop = get_hop_for_offload_pkt(pkt, 0);
> +		*dst_port = (uint16_t) (((next_hop &
> +					  RTE_LPM_LOOKUP_SUCCESS) != 0)
> +					  ? next_hop : BAD_PORT);
> +	} else {
> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> +		dst_ip = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
> +		*dst_port = (uint16_t) ((rte_lpm_lookup(
> +						(struct rte_lpm *)rt_ctx,
> +						dst_ip, &next_hop) == 0)
> +						? next_hop : BAD_PORT);
> +	}
> +}
> +
> +/*
> + * Buffer optimized handling of IPv6 packets.
> + */
> +static inline void
> +route6_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx)
> +{
> +	uint8_t dst_ip6[MAX_PKT_BURST][16];
> +	int32_t dst_port[MAX_PKT_BURST];
> +	struct rte_ether_hdr *eth_hdr;
> +	struct rte_ipv6_hdr *ipv6_hdr;
> +	int32_t hop[MAX_PKT_BURST];
> +	struct rte_mbuf *pkt;
> +	uint8_t lpm_pkts = 0;
> +	int32_t i;
> +
> +	if (nb_rx == 0)
> +		return;
> +
> +	/* Need to do an LPM lookup for non-inline packets. Inline packets will
> +	 * have port ID in the SA
> +	 */
> +
> +	for (i = 0; i < nb_rx; i++) {
> +		pkt = pkts[i];
> +		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
> +							RTE_ETHER_HDR_LEN);
> +		pkt->l2_len = RTE_ETHER_HDR_LEN;
> +		pkt->ol_flags |= RTE_MBUF_F_TX_IPV6;
> +
> +		if (!(pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD)) {
> +			/* Security offload not enabled. So an LPM lookup is
> +			 * required to get the hop
> +			 */
> +			ipv6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1);
> +			memcpy(&dst_ip6[lpm_pkts][0],
> +					ipv6_hdr->dst_addr, 16);
> +			lpm_pkts++;
> +		}
> +	}
> +
> +	rte_lpm6_lookup_bulk_func((struct rte_lpm6 *)rt_ctx, dst_ip6,
> +				  hop, lpm_pkts);
> +
> +	lpm_pkts = 0;
> +
> +	for (i = 0; i < nb_rx; i++) {
> +		pkt = pkts[i];
> +		if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
> +			/* Read hop from the SA */
> +			dst_port[i] = get_hop_for_offload_pkt(pkt, 1);
> +		} else {
> +			/* Need to use hop returned by lookup */
> +			dst_port[i] = hop[lpm_pkts++];
> +		}
> +		if (dst_port[i] == -1)
> +			dst_port[i] = BAD_PORT;
> +	}
> +
> +	/* Send packets */
> +	send_multi_pkts(pkts, (uint16_t *)dst_port, nb_rx, 0, 0, false);
> +}
> +
> +/*
> + * Buffer optimized handling of IPv4 packets.
> + */
> +static inline void
> +route4_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx,
> +		 uint64_t tx_offloads, bool ip_cksum)
> +{
> +	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
> +	const int32_t m = nb_rx % FWDSTEP;
> +	uint16_t dst_port[MAX_PKT_BURST];
> +	uint64_t inline_flag = 0;
> +	int32x4_t dip;
> +	int32_t i;
> +
> +	if (nb_rx == 0)
> +		return;
> +
> +	for (i = 0; i != k; i += FWDSTEP) {
> +		processx4_step1(&pkts[i], &dip, &inline_flag);
> +		processx4_step2(rt_ctx, dip, inline_flag, &pkts[i],
> +				&dst_port[i]);
> +	}
> +
> +	/* Classify last up to 3 packets one by one */
> +	switch (m) {
> +	case 3:
> +		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
> +		i++;
> +		/* fallthrough */
> +	case 2:
> +		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
> +		i++;
> +		/* fallthrough */
> +	case 1:
> +		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
> +	}
> +
> +	send_multi_pkts(pkts, dst_port, nb_rx, tx_offloads, ip_cksum, true);
> +}
> +
> +#endif /* __IPSEC_LPM_NEON_H__ */
> diff --git a/examples/ipsec-secgw/ipsec_neon.h b/examples/ipsec-secgw/ipsec_neon.h
> new file mode 100644
> index 0000000000..39dddcd1e3
> --- /dev/null
> +++ b/examples/ipsec-secgw/ipsec_neon.h
> @@ -0,0 +1,487 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2022 Marvell.
> + */
> +
> +#ifndef _IPSEC_NEON_H_
> +#define _IPSEC_NEON_H_
> +
> +#include "ipsec.h"
> +
> +#define FWDSTEP		4
> +#define MAX_TX_BURST	(MAX_PKT_BURST / 2)
> +#define BAD_PORT	((uint16_t)-1)
> +
> +extern xmm_t val_eth[RTE_MAX_ETHPORTS];
> +
> +/*
> + * Group consecutive packets with the same destination port into one burst.
> + * To avoid extra latency this is done together with some other packet
> + * processing, but after we made a final decision about packet's destination.
> + * To do this we maintain:
> + * pnum - array of number of consecutive packets with the same dest port for
> + * each packet in the input burst.
> + * lp - pointer to the last updated element in the pnum.
> + * dlp - dest port value lp corresponds to.
> + */
> +
> +#define	GRPSZ	(1 << FWDSTEP)
> +#define	GRPMSK	(GRPSZ - 1)
> +
> +#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
> +	if (likely((dlp) == (dcp)[(idx)])) {         \
> +		(lp)[0]++;                           \
> +	} else {                                     \
> +		(dlp) = (dcp)[idx];                  \
> +		(lp) = (pn) + (idx);                 \
> +		(lp)[0] = 1;                         \
> +	}                                            \
> +} while (0)
> +
> +static const struct {
> +	uint64_t pnum; /* prebuilt 4 values for pnum[]. */
> +	int32_t  idx;  /* index for new last updated element. */
> +	uint16_t lpv;  /* add value to the last updated element. */
> +} gptbl[GRPSZ] = {
> +	{
> +		/* 0: a != b, b != c, c != d, d != e */
> +		.pnum = UINT64_C(0x0001000100010001),
> +		.idx = 4,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 1: a == b, b != c, c != d, d != e */
> +		.pnum = UINT64_C(0x0001000100010002),
> +		.idx = 4,
> +		.lpv = 1,
> +	},
> +	{
> +		/* 2: a != b, b == c, c != d, d != e */
> +		.pnum = UINT64_C(0x0001000100020001),
> +		.idx = 4,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 3: a == b, b == c, c != d, d != e */
> +		.pnum = UINT64_C(0x0001000100020003),
> +		.idx = 4,
> +		.lpv = 2,
> +	},
> +	{
> +		/* 4: a != b, b != c, c == d, d != e */
> +		.pnum = UINT64_C(0x0001000200010001),
> +		.idx = 4,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 5: a == b, b != c, c == d, d != e */
> +		.pnum = UINT64_C(0x0001000200010002),
> +		.idx = 4,
> +		.lpv = 1,
> +	},
> +	{
> +		/* 6: a != b, b == c, c == d, d != e */
> +		.pnum = UINT64_C(0x0001000200030001),
> +		.idx = 4,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 7: a == b, b == c, c == d, d != e */
> +		.pnum = UINT64_C(0x0001000200030004),
> +		.idx = 4,
> +		.lpv = 3,
> +	},
> +	{
> +		/* 8: a != b, b != c, c != d, d == e */
> +		.pnum = UINT64_C(0x0002000100010001),
> +		.idx = 3,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 9: a == b, b != c, c != d, d == e */
> +		.pnum = UINT64_C(0x0002000100010002),
> +		.idx = 3,
> +		.lpv = 1,
> +	},
> +	{
> +		/* 0xa: a != b, b == c, c != d, d == e */
> +		.pnum = UINT64_C(0x0002000100020001),
> +		.idx = 3,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 0xb: a == b, b == c, c != d, d == e */
> +		.pnum = UINT64_C(0x0002000100020003),
> +		.idx = 3,
> +		.lpv = 2,
> +	},
> +	{
> +		/* 0xc: a != b, b != c, c == d, d == e */
> +		.pnum = UINT64_C(0x0002000300010001),
> +		.idx = 2,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 0xd: a == b, b != c, c == d, d == e */
> +		.pnum = UINT64_C(0x0002000300010002),
> +		.idx = 2,
> +		.lpv = 1,
> +	},
> +	{
> +		/* 0xe: a != b, b == c, c == d, d == e */
> +		.pnum = UINT64_C(0x0002000300040001),
> +		.idx = 1,
> +		.lpv = 0,
> +	},
> +	{
> +		/* 0xf: a == b, b == c, c == d, d == e */
> +		.pnum = UINT64_C(0x0002000300040005),
> +		.idx = 0,
> +		.lpv = 4,
> +	},
> +};
> +
> +
> +/*
> + * Update source and destination MAC addresses in the ethernet header.
> + */
> +static inline void
> +processx4_step3(struct rte_mbuf *pkts[FWDSTEP], uint16_t dst_port[FWDSTEP],
> +		uint64_t tx_offloads, bool ip_cksum, uint8_t *l_pkt)
> +{
> +	uint32x4_t te[FWDSTEP];
> +	uint32x4_t ve[FWDSTEP];
> +	uint32_t *p[FWDSTEP];
> +	struct rte_mbuf *pkt;
> +	uint8_t i;
> +
> +	for (i = 0; i < FWDSTEP; i++) {
> +		pkt = pkts[i];
> +
> +		/* Check if it is a large packet */
> +		if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
> +			*l_pkt |= 1;
> +
> +		p[i] = rte_pktmbuf_mtod(pkt, uint32_t *);
> +		ve[i] = vreinterpretq_u32_s32(val_eth[dst_port[i]]);
> +		te[i] = vld1q_u32(p[i]);
> +
> +		/* Update last 4 bytes */
> +		ve[i] = vsetq_lane_u32(vgetq_lane_u32(te[i], 3), ve[i], 3);
> +		vst1q_u32(p[i], ve[i]);
> +
> +		if (ip_cksum) {
> +			struct rte_ipv4_hdr *ip;
> +
> +			pkt->ol_flags |= tx_offloads;
> +
> +			ip = (struct rte_ipv4_hdr *)
> +				(p[i] + RTE_ETHER_HDR_LEN + 1);
> +			ip->hdr_checksum = 0;
> +
> +			/* calculate IPv4 cksum in SW */
> +			if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
> +				ip->hdr_checksum = rte_ipv4_cksum(ip);
> +		}
> +
> +	}
> +}
> +
> +/*
> + * Group consecutive packets with the same destination port in bursts of 4.
> + * Suppose we have array of destination ports:
> + * dst_port[] = {a, b, c, d, e, ... }
> + * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
> + * We do 4 comparisons at once and the result is a 4-bit mask.
> + * This mask is used as an index into a prebuilt array of pnum values.
> + */
> +static inline uint16_t *
> +port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
> +	     uint16x8_t dp2)
> +{
> +	union {
> +		uint16_t u16[FWDSTEP + 1];
> +		uint64_t u64;
> +	} *pnum = (void *)pn;
> +
> +	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
> +	int32_t v;
> +
> +	dp1 = vceqq_u16(dp1, dp2);
> +	dp1 = vandq_u16(dp1, mask);
> +	v = vaddvq_u16(dp1);
> +
> +	/* update last port counter. */
> +	lp[0] += gptbl[v].lpv;
> +	rte_compiler_barrier();
> +
> +	/* if dest port value has changed. */
> +	if (v != GRPMSK) {
> +		pnum->u64 = gptbl[v].pnum;
> +		pnum->u16[FWDSTEP] = 1;
> +		lp = pnum->u16 + gptbl[v].idx;
> +	}
> +
> +	return lp;
> +}
> +
> +/**
> + * Process single packet:
> + * Update source and destination MAC addresses in the ethernet header.
> + */
> +static inline void
> +process_packet(struct rte_mbuf *pkt, uint16_t *dst_port, uint64_t tx_offloads,
> +	       bool ip_cksum, uint8_t *l_pkt)
> +{
> +	struct rte_ether_hdr *eth_hdr;
> +	uint32x4_t te, ve;
> +
> +	/* Check if it is a large packet */
> +	if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
> +		*l_pkt |= 1;
> +
> +	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
> +
> +	te = vld1q_u32((uint32_t *)eth_hdr);
> +	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
> +
> +	ve = vcopyq_laneq_u32(ve, 3, te, 3);
> +	vst1q_u32((uint32_t *)eth_hdr, ve);
> +
> +	if (ip_cksum) {
> +		struct rte_ipv4_hdr *ip;
> +
> +		pkt->ol_flags |= tx_offloads;
> +
> +		ip = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> +		ip->hdr_checksum = 0;
> +
> +		/* calculate IPv4 cksum in SW */
> +		if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
> +			ip->hdr_checksum = rte_ipv4_cksum(ip);
> +	}
> +}
> +
> +static inline void
> +send_packets(struct rte_mbuf *m[], uint16_t port, uint32_t num, bool is_ipv4)
> +{
> +	uint8_t proto;
> +	uint32_t i;
> +
> +	proto = is_ipv4 ? IPPROTO_IP : IPPROTO_IPV6;
> +	for (i = 0; i < num; i++)
> +		send_single_packet(m[i], port, proto);
> +}
> +
> +static inline void
> +send_packetsx4(struct rte_mbuf *m[], uint16_t port, uint32_t num)
> +{
> +	unsigned int lcoreid = rte_lcore_id();
> +	struct lcore_conf *qconf;
> +	uint32_t len, j, n;
> +
> +	qconf = &lcore_conf[lcoreid];
> +
> +	len = qconf->tx_mbufs[port].len;
> +
> +	/*
> +	 * If TX buffer for that queue is empty, and we have enough packets,
> +	 * then send them straightway.
> +	 */
> +	if (num >= MAX_TX_BURST && len == 0) {
> +		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
> +		core_stats_update_tx(n);
> +		if (unlikely(n < num)) {
> +			do {
> +				rte_pktmbuf_free(m[n]);
> +			} while (++n < num);
> +		}
> +		return;
> +	}
> +
> +	/*
> +	 * Put packets into TX buffer for that queue.
> +	 */
> +
> +	n = len + num;
> +	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
> +
> +	j = 0;
> +	switch (n % FWDSTEP) {
> +	while (j < n) {
> +		case 0:
> +			qconf->tx_mbufs[port].m_table[len + j] = m[j];
> +			j++;
> +			/* fallthrough */
> +		case 3:
> +			qconf->tx_mbufs[port].m_table[len + j] = m[j];
> +			j++;
> +			/* fallthrough */
> +		case 2:
> +			qconf->tx_mbufs[port].m_table[len + j] = m[j];
> +			j++;
> +			/* fallthrough */
> +		case 1:
> +			qconf->tx_mbufs[port].m_table[len + j] = m[j];
> +			j++;
> +		}
> +	}
> +
> +	len += n;
> +
> +	/* enough pkts to be sent */
> +	if (unlikely(len == MAX_PKT_BURST)) {
> +
> +		send_burst(qconf, MAX_PKT_BURST, port);
> +
> +		/* copy rest of the packets into the TX buffer. */
> +		len = num - n;
> +		if (len == 0)
> +			goto exit;
> +
> +		j = 0;
> +		switch (len % FWDSTEP) {
> +		while (j < len) {
> +			case 0:
> +				qconf->tx_mbufs[port].m_table[j] = m[n + j];
> +				j++;
> +				/* fallthrough */
> +			case 3:
> +				qconf->tx_mbufs[port].m_table[j] = m[n + j];
> +				j++;
> +				/* fallthrough */
> +			case 2:
> +				qconf->tx_mbufs[port].m_table[j] = m[n + j];
> +				j++;
> +				/* fallthrough */
> +			case 1:
> +				qconf->tx_mbufs[port].m_table[j] = m[n + j];
> +				j++;
> +		}
> +		}
> +	}
> +
> +exit:
> +	qconf->tx_mbufs[port].len = len;
> +}
> +
> +/**
> + * Send packets burst to the ports in dst_port array
> + */
> +static __rte_always_inline void
> +send_multi_pkts(struct rte_mbuf **pkts, uint16_t dst_port[MAX_PKT_BURST],
> +		int nb_rx, uint64_t tx_offloads, bool ip_cksum, bool is_ipv4)
> +{
> +	unsigned int lcoreid = rte_lcore_id();
> +	uint16_t pnum[MAX_PKT_BURST + 1];
> +	uint8_t l_pkt = 0;
> +	uint16_t dlp, *lp;
> +	int i = 0, k;
> +
> +	/*
> +	 * Finish packet processing and group consecutive
> +	 * packets with the same destination port.
> +	 */
> +	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
> +
> +	if (k != 0) {
> +		uint16x8_t dp1, dp2;
> +
> +		lp = pnum;
> +		lp[0] = 1;
> +
> +		processx4_step3(pkts, dst_port, tx_offloads, ip_cksum, &l_pkt);
> +
> +		/* dp1: <d[0], d[1], d[2], d[3], ... > */
> +		dp1 = vld1q_u16(dst_port);
> +
> +		for (i = FWDSTEP; i != k; i += FWDSTEP) {
> +			processx4_step3(&pkts[i], &dst_port[i], tx_offloads,
> +					ip_cksum, &l_pkt);
> +
> +			/*
> +			 * dp2:
> +			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
> +			 */
> +			dp2 = vld1q_u16(&dst_port[i - FWDSTEP + 1]);
> +			lp  = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
> +
> +			/*
> +			 * dp1:
> +			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
> +			 */
> +			dp1 = vextq_u16(dp2, dp1, FWDSTEP - 1);
> +		}
> +
> +		/*
> +		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
> +		 */
> +		dp2 = vextq_u16(dp1, dp1, 1);
> +		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
> +		lp  = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
> +
> +		/*
> +		 * remove values added by the last repeated
> +		 * dst port.
> +		 */
> +		lp[0]--;
> +		dlp = dst_port[i - 1];
> +	} else {
> +		/* set dlp and lp to the never used values. */
> +		dlp = BAD_PORT - 1;
> +		lp = pnum + MAX_PKT_BURST;
> +	}
> +
> +	/* Process up to last 3 packets one by one. */
> +	switch (nb_rx % FWDSTEP) {
> +	case 3:
> +		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
> +			       &l_pkt);
> +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
> +		i++;
> +		/* fallthrough */
> +	case 2:
> +		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
> +			       &l_pkt);
> +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
> +		i++;
> +		/* fallthrough */
> +	case 1:
> +		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
> +			       &l_pkt);
> +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
> +	}
> +
> +	/*
> +	 * Send packets out, through destination port.
> +	 * Consecutive packets with the same destination port
> +	 * are already grouped together.
> +	 * If destination port for the packet equals BAD_PORT,
> +	 * then free the packet without sending it out.
> +	 */
> +	for (i = 0; i < nb_rx; i += k) {
> +
> +		uint16_t pn;
> +
> +		pn = dst_port[i];
> +		k = pnum[i];
> +
> +		if (likely(pn != BAD_PORT)) {
> +			if (l_pkt)
> +				/* Large packet is present, need to send
> +				 * individual packets with fragment
> +				 */
> +				send_packets(pkts + i, pn, k, is_ipv4);
> +			else
> +				send_packetsx4(pkts + i, pn, k);
> +
> +		} else {
> +			free_pkts(&pkts[i], k);
> +			if (is_ipv4)
> +				core_statistics[lcoreid].lpm4.miss++;
> +			else
> +				core_statistics[lcoreid].lpm6.miss++;
> +		}
> +	}
> +}
> +
> +#endif /* _IPSEC_NEON_H_ */
> diff --git a/examples/ipsec-secgw/ipsec_worker.c b/examples/ipsec-secgw/ipsec_worker.c
> index e1d4e3d864..803157d8ee 100644
> --- a/examples/ipsec-secgw/ipsec_worker.c
> +++ b/examples/ipsec-secgw/ipsec_worker.c
> @@ -12,6 +12,10 @@
>   #include "ipsec-secgw.h"
>   #include "ipsec_worker.h"
>   
> +#if defined(__ARM_NEON)
> +#include "ipsec_lpm_neon.h"
> +#endif
> +
>   struct port_drv_mode_data {
>   	struct rte_security_session *sess;
>   	struct rte_security_ctx *ctx;
> @@ -1248,8 +1252,13 @@ ipsec_poll_mode_wrkr_inl_pr(void)
>   				v6_num = ip6.num;
>   			}
>   
> +#if defined __ARM_NEON
> +			route4_pkts_neon(rt4_ctx, v4, v4_num, 0, false);
> +			route6_pkts_neon(rt6_ctx, v6, v6_num);
> +#else
>   			route4_pkts(rt4_ctx, v4, v4_num, 0, false);
>   			route6_pkts(rt6_ctx, v6, v6_num);
> +#endif
>   		}
>   	}
>   }
  
Rahul Bhansali May 25, 2022, 11:03 a.m. UTC | #2
> -----Original Message-----
> From: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
> Sent: Wednesday, May 25, 2022 4:30 AM
> To: Rahul Bhansali <rbhansali@marvell.com>; dev@dpdk.org; Radu Nicolau
> <radu.nicolau@intel.com>; Akhil Goyal <gakhil@marvell.com>; Ruifeng Wang
> <ruifeng.wang@arm.com>
> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Subject: [EXT] Re: [PATCH] examples/ipsec-secgw: add support of NEON with
> poll mode
> 
> External Email
> 
> ----------------------------------------------------------------------
> 24/05/2022 10:57, Rahul Bhansali wrote:
> > This adds the support of NEON based lpm lookup along with multi packet
> > processing for burst send in packets routing.
> >
> > Performance impact:
> > On cn10k, with poll mode inline protocol, outbound performance
> > increased by upto ~8% and inbound performance increased by upto ~6%.
> 
> 
> Interesting, good bunch of code looks like a dup from l3fwd:
> grouping, precessx4_step?, etc.

Yes, the NEON logic is taken as a reference from l3fwd, with some modifications as per
the requirements of the ipsec example.

> Would it be possible to move dup code into some common place,
> so it can be used by both examples?
processx4_step... has some additional processing (Ethernet header prepend, inline vs non-inline
packet LPM lookup, IP checksum, etc.), and even if we separate it out into common code with
l3fwd, we get less performance, as those additional things then have to be done separately
under certain conditions for individual packets.

For the grouping-specific port_groupx4() only, we can have it in a common place (a rough
sketch of the idea is below). If it is worth it, I can make changes accordingly. Do let me know.
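
For illustration only, a minimal, architecture-neutral sketch of the grouping contract such a
common helper would expose. The helper name group_ports_scalar() and this simplified scalar
form are hypothetical and not part of the patch; the NEON port_groupx4() above fills the same
pnum[] run lengths, just four comparisons at a time via the gptbl lookup.

#include <stdint.h>
#include <stdio.h>

/* Scalar illustration only: for the first packet of every run of
 * consecutive equal destination ports, record the length of that run.
 */
static void
group_ports_scalar(const uint16_t *dst_port, uint16_t *pnum, int nb)
{
	int i, j;

	for (i = 0; i < nb; i = j) {
		/* Walk to the end of the current run of equal ports. */
		for (j = i + 1; j < nb && dst_port[j] == dst_port[i]; j++)
			;
		pnum[i] = (uint16_t)(j - i);
	}
}

int
main(void)
{
	const uint16_t dst_port[] = { 1, 1, 1, 2, 2, 7 };
	uint16_t pnum[6] = { 0 };
	int i;

	group_ports_scalar(dst_port, pnum, 6);

	/* A caller would then TX pnum[i] packets at a time to dst_port[i]. */
	for (i = 0; i < 6; i += pnum[i])
		printf("port %u: burst of %u packets\n",
		       (unsigned int)dst_port[i], (unsigned int)pnum[i]);

	return 0;
}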

> 
> >
> > Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
> > ---
> >   examples/ipsec-secgw/ipsec-secgw.c    |  25 ++
> >   examples/ipsec-secgw/ipsec_lpm_neon.h | 213 +++++++++++
> >   examples/ipsec-secgw/ipsec_neon.h     | 487 ++++++++++++++++++++++++++
> >   examples/ipsec-secgw/ipsec_worker.c   |   9 +
> >   4 files changed, 734 insertions(+)
> >   create mode 100644 examples/ipsec-secgw/ipsec_lpm_neon.h
> >   create mode 100644 examples/ipsec-secgw/ipsec_neon.h
> >
> > diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-
> secgw/ipsec-secgw.c
> > index 25255e053c..038c4669f5 100644
> > --- a/examples/ipsec-secgw/ipsec-secgw.c
> > +++ b/examples/ipsec-secgw/ipsec-secgw.c
> > @@ -56,6 +56,10 @@
> >   #include "parser.h"
> >   #include "sad.h"
> >
> > +#if defined(__ARM_NEON)
> > +#include "ipsec_lpm_neon.h"
> > +#endif
> > +
> >   volatile bool force_quit;
> >
> >   #define MAX_JUMBO_PKT_LEN  9600
> > @@ -96,6 +100,12 @@ struct ethaddr_info
> ethaddr_tbl[RTE_MAX_ETHPORTS] = {
> >   	{ 0, ETHADDR(0x00, 0x16, 0x3e, 0x49, 0x9e, 0xdd) }
> >   };
> >
> > +/*
> > + * To hold ethernet header per port, which will be applied
> > + * to outgoing packets.
> > + */
> > +xmm_t val_eth[RTE_MAX_ETHPORTS];
> > +
> >   struct flow_info flow_info_tbl[RTE_MAX_ETHPORTS];
> >
> >   #define CMD_LINE_OPT_CONFIG		"config"
> > @@ -561,9 +571,16 @@ process_pkts(struct lcore_conf *qconf, struct
> rte_mbuf **pkts,
> >   			process_pkts_outbound(&qconf->outbound, &traffic);
> >   	}
> >
> > +#if defined __ARM_NEON
> > +	/* Neon optimized packet routing */
> > +	route4_pkts_neon(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
> > +			 qconf->outbound.ipv4_offloads, true);
> > +	route6_pkts_neon(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
> > +#else
> >   	route4_pkts(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
> >   		    qconf->outbound.ipv4_offloads, true);
> >   	route6_pkts(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
> > +#endif
> >   }
> >
> >   static inline void
> > @@ -1390,6 +1407,8 @@ add_dst_ethaddr(uint16_t port, const struct
> rte_ether_addr *addr)
> >   		return -EINVAL;
> >
> >   	ethaddr_tbl[port].dst = ETHADDR_TO_UINT64(addr);
> > +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[port].dst,
> > +			    (struct rte_ether_addr *)(val_eth + port));
> >   	return 0;
> >   }
> >
> > @@ -1852,6 +1871,12 @@ port_init(uint16_t portid, uint64_t
> req_rx_offloads, uint64_t req_tx_offloads)
> >   			portid, rte_strerror(-ret));
> >
> >   	ethaddr_tbl[portid].src = ETHADDR_TO_UINT64(&ethaddr);
> > +
> > +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].dst,
> > +			    (struct rte_ether_addr *)(val_eth + portid));
> > +	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].src,
> > +			    (struct rte_ether_addr *)(val_eth + portid) + 1);
> > +
> >   	print_ethaddr("Address: ", &ethaddr);
> >   	printf("\n");
> >
> > diff --git a/examples/ipsec-secgw/ipsec_lpm_neon.h b/examples/ipsec-
> secgw/ipsec_lpm_neon.h
> > new file mode 100644
> > index 0000000000..959a5a8666
> > --- /dev/null
> > +++ b/examples/ipsec-secgw/ipsec_lpm_neon.h
> > @@ -0,0 +1,213 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(C) 2022 Marvell.
> > + */
> > +
> > +#ifndef __IPSEC_LPM_NEON_H__
> > +#define __IPSEC_LPM_NEON_H__
> > +
> > +#include <arm_neon.h>
> > +#include "ipsec_neon.h"
> > +
> > +/*
> > + * Append ethernet header and read destination IPV4 addresses from 4
> mbufs.
> > + */
> > +static inline void
> > +processx4_step1(struct rte_mbuf *pkt[FWDSTEP], int32x4_t *dip,
> > +		uint64_t *inline_flag)
> > +{
> > +	struct rte_ipv4_hdr *ipv4_hdr;
> > +	struct rte_ether_hdr *eth_hdr;
> > +	int32_t dst[FWDSTEP];
> > +	int i;
> > +
> > +	for (i = 0; i < FWDSTEP; i++) {
> > +		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt[i],
> > +
> 	RTE_ETHER_HDR_LEN);
> > +		pkt[i]->ol_flags |= RTE_MBUF_F_TX_IPV4;
> > +		pkt[i]->l2_len = RTE_ETHER_HDR_LEN;
> > +
> > +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> > +
> > +		/* Fetch destination IPv4 address */
> > +		dst[i] = ipv4_hdr->dst_addr;
> > +		*inline_flag |= pkt[i]->ol_flags &
> RTE_MBUF_F_TX_SEC_OFFLOAD;
> > +	}
> > +
> > +	dip[0] = vld1q_s32(dst);
> > +}
> > +
> > +/*
> > + * Lookup into LPM for destination port.
> > + */
> > +static inline void
> > +processx4_step2(struct rt_ctx *rt_ctx, int32x4_t dip, uint64_t inline_flag,
> > +		struct rte_mbuf *pkt[FWDSTEP], uint16_t dprt[FWDSTEP])
> > +{
> > +	uint32_t next_hop;
> > +	rte_xmm_t dst;
> > +	uint8_t i;
> > +
> > +	dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
> > +
> > +	/* If all 4 packets are non-inline */
> > +	if (!inline_flag) {
> > +		rte_lpm_lookupx4((struct rte_lpm *)rt_ctx, dip, dst.u32,
> > +				 BAD_PORT);
> > +		/* get rid of unused upper 16 bit for each dport. */
> > +		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
> > +		return;
> > +	}
> > +
> > +	/* Inline and non-inline packets */
> > +	dst.x = dip;
> > +	for (i = 0; i < FWDSTEP; i++) {
> > +		if (pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
> > +			next_hop = get_hop_for_offload_pkt(pkt[i], 0);
> > +			dprt[i] = (uint16_t) (((next_hop &
> > +						RTE_LPM_LOOKUP_SUCCESS)
> != 0)
> > +						? next_hop : BAD_PORT);
> > +
> > +		} else {
> > +			dprt[i] = (uint16_t) ((rte_lpm_lookup(
> > +						(struct rte_lpm *)rt_ctx,
> > +						 dst.u32[i], &next_hop) == 0)
> > +						? next_hop : BAD_PORT);
> > +		}
> > +	}
> > +}
> > +
> > +/*
> > + * Process single packets for destination port.
> > + */
> > +static inline void
> > +process_single_pkt(struct rt_ctx *rt_ctx, struct rte_mbuf *pkt,
> > +		   uint16_t *dst_port)
> > +{
> > +	struct rte_ether_hdr *eth_hdr;
> > +	struct rte_ipv4_hdr *ipv4_hdr;
> > +	uint32_t next_hop;
> > +	uint32_t dst_ip;
> > +
> > +	eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
> > +
> 	RTE_ETHER_HDR_LEN);
> > +	pkt->ol_flags |= RTE_MBUF_F_TX_IPV4;
> > +	pkt->l2_len = RTE_ETHER_HDR_LEN;
> > +
> > +	if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
> > +		next_hop = get_hop_for_offload_pkt(pkt, 0);
> > +		*dst_port = (uint16_t) (((next_hop &
> > +					  RTE_LPM_LOOKUP_SUCCESS) != 0)
> > +					  ? next_hop : BAD_PORT);
> > +	} else {
> > +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> > +		dst_ip = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
> > +		*dst_port = (uint16_t) ((rte_lpm_lookup(
> > +						(struct rte_lpm *)rt_ctx,
> > +						dst_ip, &next_hop) == 0)
> > +						? next_hop : BAD_PORT);
> > +	}
> > +}
> > +
> > +/*
> > + * Buffer optimized handling of IPv6 packets.
> > + */
> > +static inline void
> > +route6_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx)
> > +{
> > +	uint8_t dst_ip6[MAX_PKT_BURST][16];
> > +	int32_t dst_port[MAX_PKT_BURST];
> > +	struct rte_ether_hdr *eth_hdr;
> > +	struct rte_ipv6_hdr *ipv6_hdr;
> > +	int32_t hop[MAX_PKT_BURST];
> > +	struct rte_mbuf *pkt;
> > +	uint8_t lpm_pkts = 0;
> > +	int32_t i;
> > +
> > +	if (nb_rx == 0)
> > +		return;
> > +
> > +	/* Need to do an LPM lookup for non-inline packets. Inline packets will
> > +	 * have port ID in the SA
> > +	 */
> > +
> > +	for (i = 0; i < nb_rx; i++) {
> > +		pkt = pkts[i];
> > +		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
> > +
> 	RTE_ETHER_HDR_LEN);
> > +		pkt->l2_len = RTE_ETHER_HDR_LEN;
> > +		pkt->ol_flags |= RTE_MBUF_F_TX_IPV6;
> > +
> > +		if (!(pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD)) {
> > +			/* Security offload not enabled. So an LPM lookup is
> > +			 * required to get the hop
> > +			 */
> > +			ipv6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1);
> > +			memcpy(&dst_ip6[lpm_pkts][0],
> > +					ipv6_hdr->dst_addr, 16);
> > +			lpm_pkts++;
> > +		}
> > +	}
> > +
> > +	rte_lpm6_lookup_bulk_func((struct rte_lpm6 *)rt_ctx, dst_ip6,
> > +				  hop, lpm_pkts);
> > +
> > +	lpm_pkts = 0;
> > +
> > +	for (i = 0; i < nb_rx; i++) {
> > +		pkt = pkts[i];
> > +		if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
> > +			/* Read hop from the SA */
> > +			dst_port[i] = get_hop_for_offload_pkt(pkt, 1);
> > +		} else {
> > +			/* Need to use hop returned by lookup */
> > +			dst_port[i] = hop[lpm_pkts++];
> > +		}
> > +		if (dst_port[i] == -1)
> > +			dst_port[i] = BAD_PORT;
> > +	}
> > +
> > +	/* Send packets */
> > +	send_multi_pkts(pkts, (uint16_t *)dst_port, nb_rx, 0, 0, false);
> > +}
> > +
> > +/*
> > + * Buffer optimized handling of IPv4 packets.
> > + */
> > +static inline void
> > +route4_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx,
> > +		 uint64_t tx_offloads, bool ip_cksum)
> > +{
> > +	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
> > +	const int32_t m = nb_rx % FWDSTEP;
> > +	uint16_t dst_port[MAX_PKT_BURST];
> > +	uint64_t inline_flag = 0;
> > +	int32x4_t dip;
> > +	int32_t i;
> > +
> > +	if (nb_rx == 0)
> > +		return;
> > +
> > +	for (i = 0; i != k; i += FWDSTEP) {
> > +		processx4_step1(&pkts[i], &dip, &inline_flag);
> > +		processx4_step2(rt_ctx, dip, inline_flag, &pkts[i],
> > +				&dst_port[i]);
> > +	}
> > +
> > +	/* Classify last up to 3 packets one by one */
> > +	switch (m) {
> > +	case 3:
> > +		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
> > +		i++;
> > +		/* fallthrough */
> > +	case 2:
> > +		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
> > +		i++;
> > +		/* fallthrough */
> > +	case 1:
> > +		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
> > +	}
> > +
> > +	send_multi_pkts(pkts, dst_port, nb_rx, tx_offloads, ip_cksum, true);
> > +}
> > +
> > +#endif /* __IPSEC_LPM_NEON_H__ */
> > diff --git a/examples/ipsec-secgw/ipsec_neon.h b/examples/ipsec-
> secgw/ipsec_neon.h
> > new file mode 100644
> > index 0000000000..39dddcd1e3
> > --- /dev/null
> > +++ b/examples/ipsec-secgw/ipsec_neon.h
> > @@ -0,0 +1,487 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(C) 2022 Marvell.
> > + */
> > +
> > +#ifndef _IPSEC_NEON_H_
> > +#define _IPSEC_NEON_H_
> > +
> > +#include "ipsec.h"
> > +
> > +#define FWDSTEP		4
> > +#define MAX_TX_BURST	(MAX_PKT_BURST / 2)
> > +#define BAD_PORT	((uint16_t)-1)
> > +
> > +extern xmm_t val_eth[RTE_MAX_ETHPORTS];
> > +
> > +/*
> > + * Group consecutive packets with the same destination port into one burst.
> > + * To avoid extra latency this is done together with some other packet
> > + * processing, but after we made a final decision about packet's destination.
> > + * To do this we maintain:
> > + * pnum - array of number of consecutive packets with the same dest port for
> > + * each packet in the input burst.
> > + * lp - pointer to the last updated element in the pnum.
> > + * dlp - dest port value lp corresponds to.
> > + */
> > +
> > +#define	GRPSZ	(1 << FWDSTEP)
> > +#define	GRPMSK	(GRPSZ - 1)
> > +
> > +#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
> > +	if (likely((dlp) == (dcp)[(idx)])) {         \
> > +		(lp)[0]++;                           \
> > +	} else {                                     \
> > +		(dlp) = (dcp)[idx];                  \
> > +		(lp) = (pn) + (idx);                 \
> > +		(lp)[0] = 1;                         \
> > +	}                                            \
> > +} while (0)
> > +
> > +static const struct {
> > +	uint64_t pnum; /* prebuild 4 values for pnum[]. */
> > +	int32_t  idx;  /* index for new last updated elemnet. */
> > +	uint16_t lpv;  /* add value to the last updated element. */
> > +} gptbl[GRPSZ] = {
> > +	{
> > +		/* 0: a != b, b != c, c != d, d != e */
> > +		.pnum = UINT64_C(0x0001000100010001),
> > +		.idx = 4,
> > +		.lpv = 0,
> > +	},
> > +	{
> > +		/* 1: a == b, b != c, c != d, d != e */
> > +		.pnum = UINT64_C(0x0001000100010002),
> > +		.idx = 4,
> > +		.lpv = 1,
> > +	},
> > +	{
> > +		/* 2: a != b, b == c, c != d, d != e */
> > +		.pnum = UINT64_C(0x0001000100020001),
> > +		.idx = 4,
> > +		.lpv = 0,
> > +	},
> > +	{
> > +		/* 3: a == b, b == c, c != d, d != e */
> > +		.pnum = UINT64_C(0x0001000100020003),
> > +		.idx = 4,
> > +		.lpv = 2,
> > +	},
> > +	{
> > +		/* 4: a != b, b != c, c == d, d != e */
> > +		.pnum = UINT64_C(0x0001000200010001),
> > +		.idx = 4,
> > +		.lpv = 0,
> > +	},
> > +	{
> > +		/* 5: a == b, b != c, c == d, d != e */
> > +		.pnum = UINT64_C(0x0001000200010002),
> > +		.idx = 4,
> > +		.lpv = 1,
> > +	},
> > +	{
> > +		/* 6: a != b, b == c, c == d, d != e */
> > +		.pnum = UINT64_C(0x0001000200030001),
> > +		.idx = 4,
> > +		.lpv = 0,
> > +	},
> > +	{
> > +		/* 7: a == b, b == c, c == d, d != e */
> > +		.pnum = UINT64_C(0x0001000200030004),
> > +		.idx = 4,
> > +		.lpv = 3,
> > +	},
> > +	{
> > +		/* 8: a != b, b != c, c != d, d == e */
> > +		.pnum = UINT64_C(0x0002000100010001),
> > +		.idx = 3,
> > +		.lpv = 0,
> > +	},
> > +	{
> > +		/* 9: a == b, b != c, c != d, d == e */
> > +		.pnum = UINT64_C(0x0002000100010002),
> > +		.idx = 3,
> > +		.lpv = 1,
> > +	},
> > +	{
> > +		/* 0xa: a != b, b == c, c != d, d == e */
> > +		.pnum = UINT64_C(0x0002000100020001),
> > +		.idx = 3,
> > +		.lpv = 0,
> > +	},
> > +	{
> > +		/* 0xb: a == b, b == c, c != d, d == e */
> > +		.pnum = UINT64_C(0x0002000100020003),
> > +		.idx = 3,
> > +		.lpv = 2,
> > +	},
> > +	{
> > +		/* 0xc: a != b, b != c, c == d, d == e */
> > +		.pnum = UINT64_C(0x0002000300010001),
> > +		.idx = 2,
> > +		.lpv = 0,
> > +	},
> > +	{
> > +		/* 0xd: a == b, b != c, c == d, d == e */
> > +		.pnum = UINT64_C(0x0002000300010002),
> > +		.idx = 2,
> > +		.lpv = 1,
> > +	},
> > +	{
> > +		/* 0xe: a != b, b == c, c == d, d == e */
> > +		.pnum = UINT64_C(0x0002000300040001),
> > +		.idx = 1,
> > +		.lpv = 0,
> > +	},
> > +	{
> > +		/* 0xf: a == b, b == c, c == d, d == e */
> > +		.pnum = UINT64_C(0x0002000300040005),
> > +		.idx = 0,
> > +		.lpv = 4,
> > +	},
> > +};
> > +
> > +
> > +/*
> > + * Update source and destination MAC addresses in the ethernet header.
> > + */
> > +static inline void
> > +processx4_step3(struct rte_mbuf *pkts[FWDSTEP], uint16_t
> dst_port[FWDSTEP],
> > +		uint64_t tx_offloads, bool ip_cksum, uint8_t *l_pkt)
> > +{
> > +	uint32x4_t te[FWDSTEP];
> > +	uint32x4_t ve[FWDSTEP];
> > +	uint32_t *p[FWDSTEP];
> > +	struct rte_mbuf *pkt;
> > +	uint8_t i;
> > +
> > +	for (i = 0; i < FWDSTEP; i++) {
> > +		pkt = pkts[i];
> > +
> > +		/* Check if it is a large packet */
> > +		if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
> > +			*l_pkt |= 1;
> > +
> > +		p[i] = rte_pktmbuf_mtod(pkt, uint32_t *);
> > +		ve[i] = vreinterpretq_u32_s32(val_eth[dst_port[i]]);
> > +		te[i] = vld1q_u32(p[i]);
> > +
> > +		/* Update last 4 bytes */
> > +		ve[i] = vsetq_lane_u32(vgetq_lane_u32(te[i], 3), ve[i], 3);
> > +		vst1q_u32(p[i], ve[i]);
> > +
> > +		if (ip_cksum) {
> > +			struct rte_ipv4_hdr *ip;
> > +
> > +			pkt->ol_flags |= tx_offloads;
> > +
> > +			ip = (struct rte_ipv4_hdr *)
> > +				(p[i] + RTE_ETHER_HDR_LEN + 1);
> > +			ip->hdr_checksum = 0;
> > +
> > +			/* calculate IPv4 cksum in SW */
> > +			if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
> > +				ip->hdr_checksum = rte_ipv4_cksum(ip);
> > +		}
> > +
> > +	}
> > +}
> > +
> > +/*
> > + * Group consecutive packets with the same destination port in bursts of 4.
> > + * Suppose we have array of destination ports:
> > + * dst_port[] = {a, b, c, d,, e, ... }
> > + * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
> > + * We doing 4 comparisons at once and the result is 4 bit mask.
> > + * This mask is used as an index into prebuild array of pnum values.
> > + */
> > +static inline uint16_t *
> > +port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
> > +	     uint16x8_t dp2)
> > +{
> > +	union {
> > +		uint16_t u16[FWDSTEP + 1];
> > +		uint64_t u64;
> > +	} *pnum = (void *)pn;
> > +
> > +	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
> > +	int32_t v;
> > +
> > +	dp1 = vceqq_u16(dp1, dp2);
> > +	dp1 = vandq_u16(dp1, mask);
> > +	v = vaddvq_u16(dp1);
> > +
> > +	/* update last port counter. */
> > +	lp[0] += gptbl[v].lpv;
> > +	rte_compiler_barrier();
> > +
> > +	/* if dest port value has changed. */
> > +	if (v != GRPMSK) {
> > +		pnum->u64 = gptbl[v].pnum;
> > +		pnum->u16[FWDSTEP] = 1;
> > +		lp = pnum->u16 + gptbl[v].idx;
> > +	}
> > +
> > +	return lp;
> > +}
> > +
> > +/**
> > + * Process single packet:
> > + * Update source and destination MAC addresses in the ethernet header.
> > + */
> > +static inline void
> > +process_packet(struct rte_mbuf *pkt, uint16_t *dst_port, uint64_t
> tx_offloads,
> > +	       bool ip_cksum, uint8_t *l_pkt)
> > +{
> > +	struct rte_ether_hdr *eth_hdr;
> > +	uint32x4_t te, ve;
> > +
> > +	/* Check if it is a large packet */
> > +	if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
> > +		*l_pkt |= 1;
> > +
> > +	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
> > +
> > +	te = vld1q_u32((uint32_t *)eth_hdr);
> > +	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
> > +
> > +	ve = vcopyq_laneq_u32(ve, 3, te, 3);
> > +	vst1q_u32((uint32_t *)eth_hdr, ve);
> > +
> > +	if (ip_cksum) {
> > +		struct rte_ipv4_hdr *ip;
> > +
> > +		pkt->ol_flags |= tx_offloads;
> > +
> > +		ip = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> > +		ip->hdr_checksum = 0;
> > +
> > +		/* calculate IPv4 cksum in SW */
> > +		if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
> > +			ip->hdr_checksum = rte_ipv4_cksum(ip);
> > +	}
> > +}
> > +
> > +static inline void
> > +send_packets(struct rte_mbuf *m[], uint16_t port, uint32_t num, bool
> is_ipv4)
> > +{
> > +	uint8_t proto;
> > +	uint32_t i;
> > +
> > +	proto = is_ipv4 ? IPPROTO_IP : IPPROTO_IPV6;
> > +	for (i = 0; i < num; i++)
> > +		send_single_packet(m[i], port, proto);
> > +}
> > +
> > +static inline void
> > +send_packetsx4(struct rte_mbuf *m[], uint16_t port, uint32_t num)
> > +{
> > +	unsigned int lcoreid = rte_lcore_id();
> > +	struct lcore_conf *qconf;
> > +	uint32_t len, j, n;
> > +
> > +	qconf = &lcore_conf[lcoreid];
> > +
> > +	len = qconf->tx_mbufs[port].len;
> > +
> > +	/*
> > +	 * If TX buffer for that queue is empty, and we have enough packets,
> > +	 * then send them straightway.
> > +	 */
> > +	if (num >= MAX_TX_BURST && len == 0) {
> > +		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
> > +		core_stats_update_tx(n);
> > +		if (unlikely(n < num)) {
> > +			do {
> > +				rte_pktmbuf_free(m[n]);
> > +			} while (++n < num);
> > +		}
> > +		return;
> > +	}
> > +
> > +	/*
> > +	 * Put packets into TX buffer for that queue.
> > +	 */
> > +
> > +	n = len + num;
> > +	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
> > +
> > +	j = 0;
> > +	switch (n % FWDSTEP) {
> > +	while (j < n) {
> > +		case 0:
> > +			qconf->tx_mbufs[port].m_table[len + j] = m[j];
> > +			j++;
> > +			/* fallthrough */
> > +		case 3:
> > +			qconf->tx_mbufs[port].m_table[len + j] = m[j];
> > +			j++;
> > +			/* fallthrough */
> > +		case 2:
> > +			qconf->tx_mbufs[port].m_table[len + j] = m[j];
> > +			j++;
> > +			/* fallthrough */
> > +		case 1:
> > +			qconf->tx_mbufs[port].m_table[len + j] = m[j];
> > +			j++;
> > +		}
> > +	}
> > +
> > +	len += n;
> > +
> > +	/* enough pkts to be sent */
> > +	if (unlikely(len == MAX_PKT_BURST)) {
> > +
> > +		send_burst(qconf, MAX_PKT_BURST, port);
> > +
> > +		/* copy rest of the packets into the TX buffer. */
> > +		len = num - n;
> > +		if (len == 0)
> > +			goto exit;
> > +
> > +		j = 0;
> > +		switch (len % FWDSTEP) {
> > +		while (j < len) {
> > +			case 0:
> > +				qconf->tx_mbufs[port].m_table[j] = m[n + j];
> > +				j++;
> > +				/* fallthrough */
> > +			case 3:
> > +				qconf->tx_mbufs[port].m_table[j] = m[n + j];
> > +				j++;
> > +				/* fallthrough */
> > +			case 2:
> > +				qconf->tx_mbufs[port].m_table[j] = m[n + j];
> > +				j++;
> > +				/* fallthrough */
> > +			case 1:
> > +				qconf->tx_mbufs[port].m_table[j] = m[n + j];
> > +				j++;
> > +		}
> > +		}
> > +	}
> > +
> > +exit:
> > +	qconf->tx_mbufs[port].len = len;
> > +}
> > +
> > +/**
> > + * Send packets burst to the ports in dst_port array
> > + */
> > +static __rte_always_inline void
> > +send_multi_pkts(struct rte_mbuf **pkts, uint16_t
> dst_port[MAX_PKT_BURST],
> > +		int nb_rx, uint64_t tx_offloads, bool ip_cksum, bool is_ipv4)
> > +{
> > +	unsigned int lcoreid = rte_lcore_id();
> > +	uint16_t pnum[MAX_PKT_BURST + 1];
> > +	uint8_t l_pkt = 0;
> > +	uint16_t dlp, *lp;
> > +	int i = 0, k;
> > +
> > +	/*
> > +	 * Finish packet processing and group consecutive
> > +	 * packets with the same destination port.
> > +	 */
> > +	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
> > +
> > +	if (k != 0) {
> > +		uint16x8_t dp1, dp2;
> > +
> > +		lp = pnum;
> > +		lp[0] = 1;
> > +
> > +		processx4_step3(pkts, dst_port, tx_offloads, ip_cksum, &l_pkt);
> > +
> > +		/* dp1: <d[0], d[1], d[2], d[3], ... > */
> > +		dp1 = vld1q_u16(dst_port);
> > +
> > +		for (i = FWDSTEP; i != k; i += FWDSTEP) {
> > +			processx4_step3(&pkts[i], &dst_port[i], tx_offloads,
> > +					ip_cksum, &l_pkt);
> > +
> > +			/*
> > +			 * dp2:
> > +			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
> > +			 */
> > +			dp2 = vld1q_u16(&dst_port[i - FWDSTEP + 1]);
> > +			lp  = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
> > +
> > +			/*
> > +			 * dp1:
> > +			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
> > +			 */
> > +			dp1 = vextq_u16(dp2, dp1, FWDSTEP - 1);
> > +		}
> > +
> > +		/*
> > +		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
> > +		 */
> > +		dp2 = vextq_u16(dp1, dp1, 1);
> > +		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
> > +		lp  = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
> > +
> > +		/*
> > +		 * remove values added by the last repeated
> > +		 * dst port.
> > +		 */
> > +		lp[0]--;
> > +		dlp = dst_port[i - 1];
> > +	} else {
> > +		/* set dlp and lp to the never used values. */
> > +		dlp = BAD_PORT - 1;
> > +		lp = pnum + MAX_PKT_BURST;
> > +	}
> > +
> > +	/* Process up to last 3 packets one by one. */
> > +	switch (nb_rx % FWDSTEP) {
> > +	case 3:
> > +		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
> > +			       &l_pkt);
> > +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
> > +		i++;
> > +		/* fallthrough */
> > +	case 2:
> > +		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
> > +			       &l_pkt);
> > +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
> > +		i++;
> > +		/* fallthrough */
> > +	case 1:
> > +		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
> > +			       &l_pkt);
> > +		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
> > +	}
> > +
> > +	/*
> > +	 * Send packets out, through destination port.
> > +	 * Consecutive packets with the same destination port
> > +	 * are already grouped together.
> > +	 * If destination port for the packet equals BAD_PORT,
> > +	 * then free the packet without sending it out.
> > +	 */
> > +	for (i = 0; i < nb_rx; i += k) {
> > +
> > +		uint16_t pn;
> > +
> > +		pn = dst_port[i];
> > +		k = pnum[i];
> > +
> > +		if (likely(pn != BAD_PORT)) {
> > +			if (l_pkt)
> > +				/* Large packet is present, need to send
> > +				 * individual packets with fragment
> > +				 */
> > +				send_packets(pkts + i, pn, k, is_ipv4);
> > +			else
> > +				send_packetsx4(pkts + i, pn, k);
> > +
> > +		} else {
> > +			free_pkts(&pkts[i], k);
> > +			if (is_ipv4)
> > +				core_statistics[lcoreid].lpm4.miss++;
> > +			else
> > +				core_statistics[lcoreid].lpm6.miss++;
> > +		}
> > +	}
> > +}
> > +
> > +#endif /* _IPSEC_NEON_H_ */
> > diff --git a/examples/ipsec-secgw/ipsec_worker.c b/examples/ipsec-
> secgw/ipsec_worker.c
> > index e1d4e3d864..803157d8ee 100644
> > --- a/examples/ipsec-secgw/ipsec_worker.c
> > +++ b/examples/ipsec-secgw/ipsec_worker.c
> > @@ -12,6 +12,10 @@
> >   #include "ipsec-secgw.h"
> >   #include "ipsec_worker.h"
> >
> > +#if defined(__ARM_NEON)
> > +#include "ipsec_lpm_neon.h"
> > +#endif
> > +
> >   struct port_drv_mode_data {
> >   	struct rte_security_session *sess;
> >   	struct rte_security_ctx *ctx;
> > @@ -1248,8 +1252,13 @@ ipsec_poll_mode_wrkr_inl_pr(void)
> >   				v6_num = ip6.num;
> >   			}
> >
> > +#if defined __ARM_NEON
> > +			route4_pkts_neon(rt4_ctx, v4, v4_num, 0, false);
> > +			route6_pkts_neon(rt6_ctx, v6, v6_num);
> > +#else
> >   			route4_pkts(rt4_ctx, v4, v4_num, 0, false);
> >   			route6_pkts(rt6_ctx, v6, v6_num);
> > +#endif
> >   		}
> >   	}
> >   }
  
Konstantin Ananyev May 27, 2022, 11:44 a.m. UTC | #3
> 
> 
>> -----Original Message-----
>> From: Konstantin Ananyev <konstantin.v.ananyev@yandex.ru>
>> Sent: Wednesday, May 25, 2022 4:30 AM
>> To: Rahul Bhansali <rbhansali@marvell.com>; dev@dpdk.org; Radu Nicolau
>> <radu.nicolau@intel.com>; Akhil Goyal <gakhil@marvell.com>; Ruifeng Wang
>> <ruifeng.wang@arm.com>
>> Cc: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
>> Subject: [EXT] Re: [PATCH] examples/ipsec-secgw: add support of NEON with
>> poll mode
>>
>> External Email
>>
>> ----------------------------------------------------------------------
>> 24/05/2022 10:57, Rahul Bhansali wrote:
>>> This adds the support of NEON based lpm lookup along with multi packet
>>> processing for burst send in packets routing.
>>>
>>> Performance impact:
>>> On cn10k, with poll mode inline protocol, outbound performance
>>> increased by upto ~8% and inbound performance increased by upto ~6%.
>>
>>
>> Interesting, good bunch of code looks like a dup from l3fwd:
>> grouping, precessx4_step?, etc.
> 
> Yes, neon logic is taken as a reference from l3fwd and some modifications as per
> requirement of ipsec example.
> 
>> Would it be possible to move dup code into some common place,
>> so it can be used by both examples?
> processx4_step... has some additional Ethernet header, inline vs non-inline packets lpm lookup,
> IP checksum etc processes and even if we separate out to make common code with l3fwd then getting
> less performance as additional things to be done separately again under certain conditions for
> individual packets.

Ok.

> 
> For grouping specific port_groupx4() only, we can have it in a common place. If it is worth,
> I can make changes accordingly. Do let me know.


I think that would be really good.
Probably some other apps (or even libs) can benefit from it too -
it seems generic enough to me.

> 
>>
>>>
>>> Signed-off-by: Rahul Bhansali <rbhansali@marvell.com>
>>> ---
>>>    examples/ipsec-secgw/ipsec-secgw.c    |  25 ++
>>>    examples/ipsec-secgw/ipsec_lpm_neon.h | 213 +++++++++++
>>>    examples/ipsec-secgw/ipsec_neon.h     | 487 ++++++++++++++++++++++++++
>>>    examples/ipsec-secgw/ipsec_worker.c   |   9 +
>>>    4 files changed, 734 insertions(+)
>>>    create mode 100644 examples/ipsec-secgw/ipsec_lpm_neon.h
>>>    create mode 100644 examples/ipsec-secgw/ipsec_neon.h
>>>
>>> diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-
>> secgw/ipsec-secgw.c
>>> index 25255e053c..038c4669f5 100644
>>> --- a/examples/ipsec-secgw/ipsec-secgw.c
>>> +++ b/examples/ipsec-secgw/ipsec-secgw.c
>>> @@ -56,6 +56,10 @@
>>>    #include "parser.h"
>>>    #include "sad.h"
>>>
>>> +#if defined(__ARM_NEON)
>>> +#include "ipsec_lpm_neon.h"
>>> +#endif
>>> +
>>>    volatile bool force_quit;
>>>
>>>    #define MAX_JUMBO_PKT_LEN  9600
>>> @@ -96,6 +100,12 @@ struct ethaddr_info
>> ethaddr_tbl[RTE_MAX_ETHPORTS] = {
>>>    	{ 0, ETHADDR(0x00, 0x16, 0x3e, 0x49, 0x9e, 0xdd) }
>>>    };
>>>
>>> +/*
>>> + * To hold ethernet header per port, which will be applied
>>> + * to outgoing packets.
>>> + */
>>> +xmm_t val_eth[RTE_MAX_ETHPORTS];
>>> +
>>>    struct flow_info flow_info_tbl[RTE_MAX_ETHPORTS];
>>>
>>>    #define CMD_LINE_OPT_CONFIG		"config"
  

Patch

diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
index 25255e053c..038c4669f5 100644
--- a/examples/ipsec-secgw/ipsec-secgw.c
+++ b/examples/ipsec-secgw/ipsec-secgw.c
@@ -56,6 +56,10 @@ 
 #include "parser.h"
 #include "sad.h"
 
+#if defined(__ARM_NEON)
+#include "ipsec_lpm_neon.h"
+#endif
+
 volatile bool force_quit;
 
 #define MAX_JUMBO_PKT_LEN  9600
@@ -96,6 +100,12 @@  struct ethaddr_info ethaddr_tbl[RTE_MAX_ETHPORTS] = {
 	{ 0, ETHADDR(0x00, 0x16, 0x3e, 0x49, 0x9e, 0xdd) }
 };
 
+/*
+ * Per-port Ethernet header (destination and source MAC addresses)
+ * to be applied to outgoing packets.
+ */
+xmm_t val_eth[RTE_MAX_ETHPORTS];
+
 struct flow_info flow_info_tbl[RTE_MAX_ETHPORTS];
 
 #define CMD_LINE_OPT_CONFIG		"config"
@@ -561,9 +571,16 @@  process_pkts(struct lcore_conf *qconf, struct rte_mbuf **pkts,
 			process_pkts_outbound(&qconf->outbound, &traffic);
 	}
 
+#if defined(__ARM_NEON)
+	/* Neon optimized packet routing */
+	route4_pkts_neon(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
+			 qconf->outbound.ipv4_offloads, true);
+	route6_pkts_neon(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
+#else
 	route4_pkts(qconf->rt4_ctx, traffic.ip4.pkts, traffic.ip4.num,
 		    qconf->outbound.ipv4_offloads, true);
 	route6_pkts(qconf->rt6_ctx, traffic.ip6.pkts, traffic.ip6.num);
+#endif
 }
 
 static inline void
@@ -1390,6 +1407,8 @@  add_dst_ethaddr(uint16_t port, const struct rte_ether_addr *addr)
 		return -EINVAL;
 
 	ethaddr_tbl[port].dst = ETHADDR_TO_UINT64(addr);
+	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[port].dst,
+			    (struct rte_ether_addr *)(val_eth + port));
 	return 0;
 }
 
@@ -1852,6 +1871,12 @@  port_init(uint16_t portid, uint64_t req_rx_offloads, uint64_t req_tx_offloads)
 			portid, rte_strerror(-ret));
 
 	ethaddr_tbl[portid].src = ETHADDR_TO_UINT64(&ethaddr);
+
+	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].dst,
+			    (struct rte_ether_addr *)(val_eth + portid));
+	rte_ether_addr_copy((struct rte_ether_addr *)&ethaddr_tbl[portid].src,
+			    (struct rte_ether_addr *)(val_eth + portid) + 1);
+
 	print_ethaddr("Address: ", &ethaddr);
 	printf("\n");
 
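For readers skimming the diff: each val_eth[] entry introduced above packs the destination MAC into bytes 0-5 and the source MAC into bytes 6-11 of a 16-byte vector, so the NEON routing path can rewrite a packet's Ethernet header with one 128-bit store and merge back the packet's own last 32-bit lane (EtherType plus the first two bytes after the header). Below is a minimal, DPDK-free sketch of that layout; the types and helper are stand-ins, the destination MAC is the 00:16:3e:49:9e:dd value already present in ethaddr_tbl[], and the source MAC is made up.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Stand-ins for the DPDK types; this shows the layout only, not the real API. */
struct mac_addr { uint8_t b[6]; };
union port_hdr { uint8_t u8[16]; uint32_t u32[4]; };

static union port_hdr val_eth_sketch;

/* dst MAC at bytes 0..5, src MAC at bytes 6..11; bytes 12..15 are never
 * written to the packet - the original lane 3 is merged back over them. */
static void set_port_hdr(const struct mac_addr *dst, const struct mac_addr *src)
{
	memcpy(&val_eth_sketch.u8[0], dst->b, 6);
	memcpy(&val_eth_sketch.u8[6], src->b, 6);
}

int main(void)
{
	struct mac_addr dst = { { 0x00, 0x16, 0x3e, 0x49, 0x9e, 0xdd } };
	struct mac_addr src = { { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 } }; /* made up */
	unsigned int i;

	set_port_hdr(&dst, &src);
	for (i = 0; i < 16; i++)
		printf("%02x%c", val_eth_sketch.u8[i], i == 15 ? '\n' : ' ');
	return 0;
}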
diff --git a/examples/ipsec-secgw/ipsec_lpm_neon.h b/examples/ipsec-secgw/ipsec_lpm_neon.h
new file mode 100644
index 0000000000..959a5a8666
--- /dev/null
+++ b/examples/ipsec-secgw/ipsec_lpm_neon.h
@@ -0,0 +1,213 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef __IPSEC_LPM_NEON_H__
+#define __IPSEC_LPM_NEON_H__
+
+#include <arm_neon.h>
+#include "ipsec_neon.h"
+
+/*
+ * Prepend the Ethernet header and read destination IPv4 addresses from 4 mbufs.
+ */
+static inline void
+processx4_step1(struct rte_mbuf *pkt[FWDSTEP], int32x4_t *dip,
+		uint64_t *inline_flag)
+{
+	struct rte_ipv4_hdr *ipv4_hdr;
+	struct rte_ether_hdr *eth_hdr;
+	int32_t dst[FWDSTEP];
+	int i;
+
+	for (i = 0; i < FWDSTEP; i++) {
+		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt[i],
+							RTE_ETHER_HDR_LEN);
+		pkt[i]->ol_flags |= RTE_MBUF_F_TX_IPV4;
+		pkt[i]->l2_len = RTE_ETHER_HDR_LEN;
+
+		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+
+		/* Fetch destination IPv4 address */
+		dst[i] = ipv4_hdr->dst_addr;
+		*inline_flag |= pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD;
+	}
+
+	dip[0] = vld1q_s32(dst);
+}
+
+/*
+ * Look up the destination port in the LPM table.
+ */
+static inline void
+processx4_step2(struct rt_ctx *rt_ctx, int32x4_t dip, uint64_t inline_flag,
+		struct rte_mbuf *pkt[FWDSTEP], uint16_t dprt[FWDSTEP])
+{
+	uint32_t next_hop;
+	rte_xmm_t dst;
+	uint8_t i;
+
+	dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
+
+	/* If all 4 packets are non-inline */
+	if (!inline_flag) {
+		rte_lpm_lookupx4((struct rte_lpm *)rt_ctx, dip, dst.u32,
+				 BAD_PORT);
+		/* get rid of unused upper 16 bit for each dport. */
+		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
+		return;
+	}
+
+	/* Inline and non-inline packets */
+	dst.x = dip;
+	for (i = 0; i < FWDSTEP; i++) {
+		if (pkt[i]->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+			next_hop = get_hop_for_offload_pkt(pkt[i], 0);
+			dprt[i] = (uint16_t) (((next_hop &
+						RTE_LPM_LOOKUP_SUCCESS) != 0)
+						? next_hop : BAD_PORT);
+
+		} else {
+			dprt[i] = (uint16_t) ((rte_lpm_lookup(
+						(struct rte_lpm *)rt_ctx,
+						 dst.u32[i], &next_hop) == 0)
+						? next_hop : BAD_PORT);
+		}
+	}
+}
+
+/*
+ * Process a single packet to resolve its destination port.
+ */
+static inline void
+process_single_pkt(struct rt_ctx *rt_ctx, struct rte_mbuf *pkt,
+		   uint16_t *dst_port)
+{
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv4_hdr *ipv4_hdr;
+	uint32_t next_hop;
+	uint32_t dst_ip;
+
+	eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
+							RTE_ETHER_HDR_LEN);
+	pkt->ol_flags |= RTE_MBUF_F_TX_IPV4;
+	pkt->l2_len = RTE_ETHER_HDR_LEN;
+
+	if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+		next_hop = get_hop_for_offload_pkt(pkt, 0);
+		*dst_port = (uint16_t) (((next_hop &
+					  RTE_LPM_LOOKUP_SUCCESS) != 0)
+					  ? next_hop : BAD_PORT);
+	} else {
+		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+		dst_ip = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
+		*dst_port = (uint16_t) ((rte_lpm_lookup(
+						(struct rte_lpm *)rt_ctx,
+						dst_ip, &next_hop) == 0)
+						? next_hop : BAD_PORT);
+	}
+}
+
+/*
+ * Buffer optimized handling of IPv6 packets.
+ */
+static inline void
+route6_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx)
+{
+	uint8_t dst_ip6[MAX_PKT_BURST][16];
+	uint16_t dst_port[MAX_PKT_BURST];
+	struct rte_ether_hdr *eth_hdr;
+	struct rte_ipv6_hdr *ipv6_hdr;
+	int32_t hop[MAX_PKT_BURST];
+	struct rte_mbuf *pkt;
+	uint8_t lpm_pkts = 0;
+	int32_t i;
+
+	if (nb_rx == 0)
+		return;
+
+	/* Need to do an LPM lookup for non-inline packets. Inline packets
+	 * will have the port ID in the SA.
+	 */
+
+	for (i = 0; i < nb_rx; i++) {
+		pkt = pkts[i];
+		eth_hdr = (struct rte_ether_hdr *)rte_pktmbuf_prepend(pkt,
+							RTE_ETHER_HDR_LEN);
+		pkt->l2_len = RTE_ETHER_HDR_LEN;
+		pkt->ol_flags |= RTE_MBUF_F_TX_IPV6;
+
+		if (!(pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD)) {
+			/* Security offload not enabled. So an LPM lookup is
+			 * required to get the hop
+			 */
+			ipv6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1);
+			memcpy(&dst_ip6[lpm_pkts][0],
+					ipv6_hdr->dst_addr, 16);
+			lpm_pkts++;
+		}
+	}
+
+	rte_lpm6_lookup_bulk_func((struct rte_lpm6 *)rt_ctx, dst_ip6,
+				  hop, lpm_pkts);
+
+	lpm_pkts = 0;
+
+	for (i = 0; i < nb_rx; i++) {
+		int32_t nh;
+		pkt = pkts[i];
+		if (pkt->ol_flags & RTE_MBUF_F_TX_SEC_OFFLOAD) {
+			/* Read hop from the SA */
+			nh = get_hop_for_offload_pkt(pkt, 1);
+		} else {
+			/* Need to use hop returned by lookup */
+			nh = hop[lpm_pkts++];
+		}
+		dst_port[i] = (nh == -1) ? BAD_PORT : (uint16_t)nh;
+	}
+
+	/* Send packets */
+	send_multi_pkts(pkts, dst_port, nb_rx, 0, 0, false);
+}
+
+/*
+ * Buffer optimized handling of IPv4 packets.
+ */
+static inline void
+route4_pkts_neon(struct rt_ctx *rt_ctx, struct rte_mbuf **pkts, int nb_rx,
+		 uint64_t tx_offloads, bool ip_cksum)
+{
+	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	const int32_t m = nb_rx % FWDSTEP;
+	uint16_t dst_port[MAX_PKT_BURST];
+	uint64_t inline_flag = 0;
+	int32x4_t dip;
+	int32_t i;
+
+	if (nb_rx == 0)
+		return;
+
+	for (i = 0; i != k; i += FWDSTEP) {
+		processx4_step1(&pkts[i], &dip, &inline_flag);
+		processx4_step2(rt_ctx, dip, inline_flag, &pkts[i],
+				&dst_port[i]);
+	}
+
+	/* Classify last up to 3 packets one by one */
+	switch (m) {
+	case 3:
+		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+		i++;
+		/* fallthrough */
+	case 2:
+		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+		i++;
+		/* fallthrough */
+	case 1:
+		process_single_pkt(rt_ctx, pkts[i], &dst_port[i]);
+	}
+
+	send_multi_pkts(pkts, dst_port, nb_rx, tx_offloads, ip_cksum, true);
+}
+
+#endif /* __IPSEC_LPM_NEON_H__ */
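A small, DPDK-free sketch of what the vrev32q_u8() step in processx4_step2() above achieves on a little-endian Arm core: every 32-bit lane holds a destination address in network byte order, and reversing the bytes inside each lane converts all four addresses to host order at once, which is the form rte_lpm_lookupx4() expects. The helper and sample addresses below are made up for illustration.

#include <stdint.h>
#include <stdio.h>

/* Byte-reverse one 32-bit value, mirroring what vrev32q_u8() does per lane. */
static uint32_t rev32(uint32_t v)
{
	return ((v & 0x000000ffu) << 24) | ((v & 0x0000ff00u) << 8) |
	       ((v & 0x00ff0000u) >> 8)  | ((v & 0xff000000u) >> 24);
}

int main(void)
{
	/* 192.168.0.1, 192.168.0.2, 192.168.1.1, 192.168.1.2 (made up), as a
	 * little-endian load of the network-order bytes would present them. */
	uint32_t dip_be[4] = { 0x0100a8c0u, 0x0200a8c0u, 0x0101a8c0u, 0x0201a8c0u };
	int i;

	for (i = 0; i < 4; i++) {
		uint32_t host = rev32(dip_be[i]);

		printf("lane %d: %u.%u.%u.%u\n", i, host >> 24,
		       (host >> 16) & 0xff, (host >> 8) & 0xff, host & 0xff);
	}
	return 0;
}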
diff --git a/examples/ipsec-secgw/ipsec_neon.h b/examples/ipsec-secgw/ipsec_neon.h
new file mode 100644
index 0000000000..39dddcd1e3
--- /dev/null
+++ b/examples/ipsec-secgw/ipsec_neon.h
@@ -0,0 +1,487 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2022 Marvell.
+ */
+
+#ifndef _IPSEC_NEON_H_
+#define _IPSEC_NEON_H_
+
+#include "ipsec.h"
+
+#define FWDSTEP		4
+#define MAX_TX_BURST	(MAX_PKT_BURST / 2)
+#define BAD_PORT	((uint16_t)-1)
+
+extern xmm_t val_eth[RTE_MAX_ETHPORTS];
+
+/*
+ * Group consecutive packets with the same destination port into one burst.
+ * To avoid extra latency this is done together with some other packet
+ * processing, but only after a final decision about the packet's destination.
+ * To do this we maintain:
+ * pnum - array of number of consecutive packets with the same dest port for
+ * each packet in the input burst.
+ * lp - pointer to the last updated element in the pnum.
+ * dlp - dest port value lp corresponds to.
+ */
+
+#define	GRPSZ	(1 << FWDSTEP)
+#define	GRPMSK	(GRPSZ - 1)
+
+#define GROUP_PORT_STEP(dlp, dcp, lp, pn, idx)	do { \
+	if (likely((dlp) == (dcp)[(idx)])) {         \
+		(lp)[0]++;                           \
+	} else {                                     \
+		(dlp) = (dcp)[idx];                  \
+		(lp) = (pn) + (idx);                 \
+		(lp)[0] = 1;                         \
+	}                                            \
+} while (0)
+
+static const struct {
+	uint64_t pnum; /* prebuilt 4 values for pnum[]. */
+	int32_t  idx;  /* index for new last updated element. */
+	uint16_t lpv;  /* add value to the last updated element. */
+} gptbl[GRPSZ] = {
+	{
+		/* 0: a != b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 1: a == b, b != c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 2: a != b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 3: a == b, b == c, c != d, d != e */
+		.pnum = UINT64_C(0x0001000100020003),
+		.idx = 4,
+		.lpv = 2,
+	},
+	{
+		/* 4: a != b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 5: a == b, b != c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200010002),
+		.idx = 4,
+		.lpv = 1,
+	},
+	{
+		/* 6: a != b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030001),
+		.idx = 4,
+		.lpv = 0,
+	},
+	{
+		/* 7: a == b, b == c, c == d, d != e */
+		.pnum = UINT64_C(0x0001000200030004),
+		.idx = 4,
+		.lpv = 3,
+	},
+	{
+		/* 8: a != b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 9: a == b, b != c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100010002),
+		.idx = 3,
+		.lpv = 1,
+	},
+	{
+		/* 0xa: a != b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020001),
+		.idx = 3,
+		.lpv = 0,
+	},
+	{
+		/* 0xb: a == b, b == c, c != d, d == e */
+		.pnum = UINT64_C(0x0002000100020003),
+		.idx = 3,
+		.lpv = 2,
+	},
+	{
+		/* 0xc: a != b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010001),
+		.idx = 2,
+		.lpv = 0,
+	},
+	{
+		/* 0xd: a == b, b != c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300010002),
+		.idx = 2,
+		.lpv = 1,
+	},
+	{
+		/* 0xe: a != b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040001),
+		.idx = 1,
+		.lpv = 0,
+	},
+	{
+		/* 0xf: a == b, b == c, c == d, d == e */
+		.pnum = UINT64_C(0x0002000300040005),
+		.idx = 0,
+		.lpv = 4,
+	},
+};
+
+
+/*
+ * Update source and destination MAC addresses in the ethernet header.
+ */
+static inline void
+processx4_step3(struct rte_mbuf *pkts[FWDSTEP], uint16_t dst_port[FWDSTEP],
+		uint64_t tx_offloads, bool ip_cksum, uint8_t *l_pkt)
+{
+	uint32x4_t te[FWDSTEP];
+	uint32x4_t ve[FWDSTEP];
+	uint32_t *p[FWDSTEP];
+	struct rte_mbuf *pkt;
+	uint8_t i;
+
+	for (i = 0; i < FWDSTEP; i++) {
+		pkt = pkts[i];
+
+		/* Check if it is a large packet */
+		if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
+			*l_pkt |= 1;
+
+		p[i] = rte_pktmbuf_mtod(pkt, uint32_t *);
+		ve[i] = vreinterpretq_u32_s32(val_eth[dst_port[i]]);
+		te[i] = vld1q_u32(p[i]);
+
+		/* Update last 4 bytes */
+		ve[i] = vsetq_lane_u32(vgetq_lane_u32(te[i], 3), ve[i], 3);
+		vst1q_u32(p[i], ve[i]);
+
+		if (ip_cksum) {
+			struct rte_ipv4_hdr *ip;
+
+			pkt->ol_flags |= tx_offloads;
+
+			ip = (struct rte_ipv4_hdr *)
+				((uint8_t *)p[i] + RTE_ETHER_HDR_LEN);
+			ip->hdr_checksum = 0;
+
+			/* calculate IPv4 cksum in SW */
+			if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
+				ip->hdr_checksum = rte_ipv4_cksum(ip);
+		}
+
+	}
+}
+
+/*
+ * Group consecutive packets with the same destination port in bursts of 4.
+ * Suppose we have an array of destination ports:
+ * dst_port[] = {a, b, c, d, e, ... }
+ * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
+ * We do 4 comparisons at once and the result is a 4-bit mask.
+ * This mask is used as an index into a prebuilt array of pnum values.
+ */
+static inline uint16_t *
+port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
+	     uint16x8_t dp2)
+{
+	union {
+		uint16_t u16[FWDSTEP + 1];
+		uint64_t u64;
+	} *pnum = (void *)pn;
+
+	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
+	int32_t v;
+
+	dp1 = vceqq_u16(dp1, dp2);
+	dp1 = vandq_u16(dp1, mask);
+	v = vaddvq_u16(dp1);
+
+	/* update last port counter. */
+	lp[0] += gptbl[v].lpv;
+	rte_compiler_barrier();
+
+	/* if dest port value has changed. */
+	if (v != GRPMSK) {
+		pnum->u64 = gptbl[v].pnum;
+		pnum->u16[FWDSTEP] = 1;
+		lp = pnum->u16 + gptbl[v].idx;
+	}
+
+	return lp;
+}
+
+/**
+ * Process single packet:
+ * Update source and destination MAC addresses in the ethernet header.
+ */
+static inline void
+process_packet(struct rte_mbuf *pkt, uint16_t *dst_port, uint64_t tx_offloads,
+	       bool ip_cksum, uint8_t *l_pkt)
+{
+	struct rte_ether_hdr *eth_hdr;
+	uint32x4_t te, ve;
+
+	/* Check if it is a large packet */
+	if (pkt->pkt_len - RTE_ETHER_HDR_LEN > mtu_size)
+		*l_pkt |= 1;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+
+	te = vld1q_u32((uint32_t *)eth_hdr);
+	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+
+	ve = vcopyq_laneq_u32(ve, 3, te, 3);
+	vst1q_u32((uint32_t *)eth_hdr, ve);
+
+	if (ip_cksum) {
+		struct rte_ipv4_hdr *ip;
+
+		pkt->ol_flags |= tx_offloads;
+
+		ip = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+		ip->hdr_checksum = 0;
+
+		/* calculate IPv4 cksum in SW */
+		if ((pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM) == 0)
+			ip->hdr_checksum = rte_ipv4_cksum(ip);
+	}
+}
+
+static inline void
+send_packets(struct rte_mbuf *m[], uint16_t port, uint32_t num, bool is_ipv4)
+{
+	uint8_t proto;
+	uint32_t i;
+
+	proto = is_ipv4 ? IPPROTO_IP : IPPROTO_IPV6;
+	for (i = 0; i < num; i++)
+		send_single_packet(m[i], port, proto);
+}
+
+static inline void
+send_packetsx4(struct rte_mbuf *m[], uint16_t port, uint32_t num)
+{
+	unsigned int lcoreid = rte_lcore_id();
+	struct lcore_conf *qconf;
+	uint32_t len, j, n;
+
+	qconf = &lcore_conf[lcoreid];
+
+	len = qconf->tx_mbufs[port].len;
+
+	/*
+	 * If TX buffer for that queue is empty, and we have enough packets,
+	 * then send them straightway.
+	 */
+	if (num >= MAX_TX_BURST && len == 0) {
+		n = rte_eth_tx_burst(port, qconf->tx_queue_id[port], m, num);
+		core_stats_update_tx(n);
+		if (unlikely(n < num)) {
+			do {
+				rte_pktmbuf_free(m[n]);
+			} while (++n < num);
+		}
+		return;
+	}
+
+	/*
+	 * Put packets into TX buffer for that queue.
+	 */
+
+	n = len + num;
+	n = (n > MAX_PKT_BURST) ? MAX_PKT_BURST - len : num;
+
+	j = 0;
+	switch (n % FWDSTEP) {
+	while (j < n) {
+		case 0:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+			/* fallthrough */
+		case 3:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+			/* fallthrough */
+		case 2:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+			/* fallthrough */
+		case 1:
+			qconf->tx_mbufs[port].m_table[len + j] = m[j];
+			j++;
+		}
+	}
+
+	len += n;
+
+	/* enough pkts to be sent */
+	if (unlikely(len == MAX_PKT_BURST)) {
+
+		send_burst(qconf, MAX_PKT_BURST, port);
+
+		/* copy rest of the packets into the TX buffer. */
+		len = num - n;
+		if (len == 0)
+			goto exit;
+
+		j = 0;
+		switch (len % FWDSTEP) {
+		while (j < len) {
+			case 0:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+				/* fallthrough */
+			case 3:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+				/* fallthrough */
+			case 2:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+				/* fallthrough */
+			case 1:
+				qconf->tx_mbufs[port].m_table[j] = m[n + j];
+				j++;
+		}
+		}
+	}
+
+exit:
+	qconf->tx_mbufs[port].len = len;
+}
+
+/**
+ * Send a burst of packets out to the ports in the dst_port array.
+ */
+static __rte_always_inline void
+send_multi_pkts(struct rte_mbuf **pkts, uint16_t dst_port[MAX_PKT_BURST],
+		int nb_rx, uint64_t tx_offloads, bool ip_cksum, bool is_ipv4)
+{
+	unsigned int lcoreid = rte_lcore_id();
+	uint16_t pnum[MAX_PKT_BURST + 1];
+	uint8_t l_pkt = 0;
+	uint16_t dlp, *lp;
+	int i = 0, k;
+
+	/*
+	 * Finish packet processing and group consecutive
+	 * packets with the same destination port.
+	 */
+	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+
+	if (k != 0) {
+		uint16x8_t dp1, dp2;
+
+		lp = pnum;
+		lp[0] = 1;
+
+		processx4_step3(pkts, dst_port, tx_offloads, ip_cksum, &l_pkt);
+
+		/* dp1: <d[0], d[1], d[2], d[3], ... > */
+		dp1 = vld1q_u16(dst_port);
+
+		for (i = FWDSTEP; i != k; i += FWDSTEP) {
+			processx4_step3(&pkts[i], &dst_port[i], tx_offloads,
+					ip_cksum, &l_pkt);
+
+			/*
+			 * dp2:
+			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
+			 */
+			dp2 = vld1q_u16(&dst_port[i - FWDSTEP + 1]);
+			lp  = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
+
+			/*
+			 * dp1:
+			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
+			 */
+			dp1 = vextq_u16(dp2, dp1, FWDSTEP - 1);
+		}
+
+		/*
+		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
+		 */
+		dp2 = vextq_u16(dp1, dp1, 1);
+		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
+		lp  = port_groupx4(&pnum[i - FWDSTEP], lp, dp1, dp2);
+
+		/*
+		 * remove values added by the last repeated
+		 * dst port.
+		 */
+		lp[0]--;
+		dlp = dst_port[i - 1];
+	} else {
+		/* set dlp and lp to the never used values. */
+		dlp = BAD_PORT - 1;
+		lp = pnum + MAX_PKT_BURST;
+	}
+
+	/* Process up to last 3 packets one by one. */
+	switch (nb_rx % FWDSTEP) {
+	case 3:
+		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+			       &l_pkt);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+		i++;
+		/* fallthrough */
+	case 2:
+		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+			       &l_pkt);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+		i++;
+		/* fallthrough */
+	case 1:
+		process_packet(pkts[i], dst_port + i, tx_offloads, ip_cksum,
+			       &l_pkt);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, i);
+	}
+
+	/*
+	 * Send packets out, through destination port.
+	 * Consecutive packets with the same destination port
+	 * are already grouped together.
+	 * If destination port for the packet equals BAD_PORT,
+	 * then free the packet without sending it out.
+	 */
+	for (i = 0; i < nb_rx; i += k) {
+
+		uint16_t pn;
+
+		pn = dst_port[i];
+		k = pnum[i];
+
+		if (likely(pn != BAD_PORT)) {
+			if (l_pkt)
+				/* A large packet is present; send packets
+				 * individually so that they can be fragmented
+				 */
+				send_packets(pkts + i, pn, k, is_ipv4);
+			else
+				send_packetsx4(pkts + i, pn, k);
+
+		} else {
+			free_pkts(&pkts[i], k);
+			if (is_ipv4)
+				core_statistics[lcoreid].lpm4.miss++;
+			else
+				core_statistics[lcoreid].lpm6.miss++;
+		}
+	}
+}
+
+#endif /* _IPSEC_NEON_H_ */
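A plain-C illustration of the grouping machinery defined above (sample port values are made up): comparing dst_port[j] with dst_port[j+1] for four consecutive j yields a 4-bit mask, and gptbl[] just stores, for each possible mask, the partial run lengths that port_groupx4() would otherwise compute one element at a time. The second loop shows what the pnum[] array ends up describing: the length of each run of equal ports, recorded at the index where the run starts.

#include <stdint.h>
#include <stdio.h>

#define FWDSTEP 4

int main(void)
{
	/* Made-up destination ports for an 8-packet burst. */
	uint16_t dst_port[8] = { 1, 1, 1, 3, 3, 0, 0, 0 };
	uint16_t pnum[8];
	int i, j;

	/* Scalar equivalent of the vceqq/vaddvq mask: bit k is set when
	 * dst_port[i + k] == dst_port[i + k + 1]. */
	for (i = 0; i + FWDSTEP < 8; i += FWDSTEP) {
		uint32_t mask = 0;

		for (j = 0; j < FWDSTEP; j++)
			mask |= (dst_port[i + j] == dst_port[i + j + 1]) << j;
		printf("packets %d..%d: mask 0x%x\n", i, i + FWDSTEP - 1, mask);
	}

	/* What the grouping ultimately produces: run length of equal ports,
	 * stored at the position where each run starts. */
	for (i = 0; i < 8; i = j) {
		for (j = i + 1; j < 8 && dst_port[j] == dst_port[i]; j++)
			;
		pnum[i] = (uint16_t)(j - i);
		printf("run at %d: port %u, %u packet(s)\n", i, dst_port[i], pnum[i]);
	}
	return 0;
}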
diff --git a/examples/ipsec-secgw/ipsec_worker.c b/examples/ipsec-secgw/ipsec_worker.c
index e1d4e3d864..803157d8ee 100644
--- a/examples/ipsec-secgw/ipsec_worker.c
+++ b/examples/ipsec-secgw/ipsec_worker.c
@@ -12,6 +12,10 @@ 
 #include "ipsec-secgw.h"
 #include "ipsec_worker.h"
 
+#if defined(__ARM_NEON)
+#include "ipsec_lpm_neon.h"
+#endif
+
 struct port_drv_mode_data {
 	struct rte_security_session *sess;
 	struct rte_security_ctx *ctx;
@@ -1248,8 +1252,13 @@  ipsec_poll_mode_wrkr_inl_pr(void)
 				v6_num = ip6.num;
 			}
 
+#if defined(__ARM_NEON)
+			route4_pkts_neon(rt4_ctx, v4, v4_num, 0, false);
+			route6_pkts_neon(rt6_ctx, v6, v6_num);
+#else
 			route4_pkts(rt4_ctx, v4, v4_num, 0, false);
 			route6_pkts(rt6_ctx, v6, v6_num);
+#endif
 		}
 	}
 }
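
Finally, note that the choice between the NEON and scalar routing paths above is made entirely at compile time; on targets without __ARM_NEON the existing route4_pkts()/route6_pkts() calls build exactly as before. A trivial sketch of the same compile-time dispatch pattern, with placeholder names rather than the functions from the patch:

#include <stdio.h>

#if defined(__ARM_NEON)
static void route_burst(void) { printf("vector (NEON) routing path\n"); }
#else
static void route_burst(void) { printf("scalar routing path\n"); }
#endif

int main(void)
{
	/* Same call site either way; the preprocessor selects the body. */
	route_burst();
	return 0;
}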