[v4,1/9] examples/ipsec-secgw: avoid to request unused TX offloads

Message ID 1544805623-18150-2-git-send-email-konstantin.ananyev@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series [v4,1/9] examples/ipsec-secgw: avoid to request unused TX offloads

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK
ci/mellanox-Performance-Testing success Performance Testing PASS
ci/intel-Performance-Testing success Performance Testing PASS

Commit Message

Ananyev, Konstantin Dec. 14, 2018, 4:40 p.m. UTC
  ipsec-secgw always enables TX offloads
(DEV_TX_OFFLOAD_MULTI_SEGS, DEV_TX_OFFLOAD_SECURITY),
even when they are not requested by the config.
That causes many PMDs to choose the full-featured TX function,
which in many cases is much slower than one without offloads.
This patch adds checks to enable extra HW offloads only when
they were requested.
Plus it enables DEV_TX_OFFLOAD_IPV4_CKSUM
only when other HW TX offloads are going to be enabled.
Otherwise the SW version of IP cksum calculation is used.
That allows the vector TX function to be used when inline-ipsec is not
requested.

Signed-off-by: Remy Horton <remy.horton@intel.com>
Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
Acked-by: Radu Nicolau <radu.nicolau@intel.com>
---
 examples/ipsec-secgw/ipsec-secgw.c | 44 +++++++++++++++--------
 examples/ipsec-secgw/ipsec.h       |  6 ++++
 examples/ipsec-secgw/sa.c          | 56 ++++++++++++++++++++++++++++++
 3 files changed, 91 insertions(+), 15 deletions(-)
  

Comments

Akhil Goyal Dec. 21, 2018, 1:57 p.m. UTC | #1
Hi Konstantin,

On 12/14/2018 10:10 PM, Konstantin Ananyev wrote:
> ipsec-secgw always enables TX offloads
> (DEV_TX_OFFLOAD_MULTI_SEGS, DEV_TX_OFFLOAD_SECURITY),
> even when they are not requested by the config.
> That causes many PMD to choose full-featured TX function,
> which in many cases is much slower then one without offloads.
> That patch adds checks to enabled extra HW offloads, only when
> they were requested.
> Plus it enables DEV_TX_OFFLOAD_IPV4_CKSUM,
> only when other HW TX ofloads are going to be enabled.
> Otherwise SW version of ip cksum calculation is used.
> That allows to use vector TX function, when inline-ipsec is not
> requested.
>
> Signed-off-by: Remy Horton <remy.horton@intel.com>
> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> Acked-by: Radu Nicolau <radu.nicolau@intel.com>
> ---
>   examples/ipsec-secgw/ipsec-secgw.c | 44 +++++++++++++++--------
>   examples/ipsec-secgw/ipsec.h       |  6 ++++
>   examples/ipsec-secgw/sa.c          | 56 ++++++++++++++++++++++++++++++
>   3 files changed, 91 insertions(+), 15 deletions(-)
>
> diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
> index 1bc0b5b50..cfc2b05e5 100644
> --- a/examples/ipsec-secgw/ipsec-secgw.c
> +++ b/examples/ipsec-secgw/ipsec-secgw.c
> @@ -208,8 +208,6 @@ static struct rte_eth_conf port_conf = {
>   	},
>   	.txmode = {
>   		.mq_mode = ETH_MQ_TX_NONE,
> -		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
> -			     DEV_TX_OFFLOAD_MULTI_SEGS),
I believe this is disabling checksum offload for all cases and then 
enabling only for inline crypto and inline proto.
This is breaking lookaside proto and lookaside none cases. Please 
correct me if I am wrong.
So a NACK for this if my understanding is correct.
>   	},
>   };
>   
> @@ -315,7 +313,8 @@ prepare_traffic(struct rte_mbuf **pkts, struct ipsec_traffic *t,
>   }
>   
>   static inline void
> -prepare_tx_pkt(struct rte_mbuf *pkt, uint16_t port)
> +prepare_tx_pkt(struct rte_mbuf *pkt, uint16_t port,
> +		const struct lcore_conf *qconf)
>   {
>   	struct ip *ip;
>   	struct ether_hdr *ethhdr;
> @@ -325,14 +324,19 @@ prepare_tx_pkt(struct rte_mbuf *pkt, uint16_t port)
>   	ethhdr = (struct ether_hdr *)rte_pktmbuf_prepend(pkt, ETHER_HDR_LEN);
>   
>   	if (ip->ip_v == IPVERSION) {
> -		pkt->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
> +		pkt->ol_flags |= qconf->outbound.ipv4_offloads;
>   		pkt->l3_len = sizeof(struct ip);
>   		pkt->l2_len = ETHER_HDR_LEN;
>   
>   		ip->ip_sum = 0;
> +
> +		/* calculate IPv4 cksum in SW */
> +		if ((pkt->ol_flags & PKT_TX_IP_CKSUM) == 0)
> +			ip->ip_sum = rte_ipv4_cksum((struct ipv4_hdr *)ip);
> +
>   		ethhdr->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
>   	} else {
> -		pkt->ol_flags |= PKT_TX_IPV6;
> +		pkt->ol_flags |= qconf->outbound.ipv6_offloads;
>   		pkt->l3_len = sizeof(struct ip6_hdr);
>   		pkt->l2_len = ETHER_HDR_LEN;
>   
> @@ -346,18 +350,19 @@ prepare_tx_pkt(struct rte_mbuf *pkt, uint16_t port)
>   }
>   
>   static inline void
> -prepare_tx_burst(struct rte_mbuf *pkts[], uint16_t nb_pkts, uint16_t port)
> +prepare_tx_burst(struct rte_mbuf *pkts[], uint16_t nb_pkts, uint16_t port,
> +		const struct lcore_conf *qconf)
>   {
>   	int32_t i;
>   	const int32_t prefetch_offset = 2;
>   
>   	for (i = 0; i < (nb_pkts - prefetch_offset); i++) {
>   		rte_mbuf_prefetch_part2(pkts[i + prefetch_offset]);
> -		prepare_tx_pkt(pkts[i], port);
> +		prepare_tx_pkt(pkts[i], port, qconf);
>   	}
>   	/* Process left packets */
>   	for (; i < nb_pkts; i++)
> -		prepare_tx_pkt(pkts[i], port);
> +		prepare_tx_pkt(pkts[i], port, qconf);
>   }
>   
>   /* Send burst of packets on an output interface */
> @@ -371,7 +376,7 @@ send_burst(struct lcore_conf *qconf, uint16_t n, uint16_t port)
>   	queueid = qconf->tx_queue_id[port];
>   	m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;
>   
> -	prepare_tx_burst(m_table, n, port);
> +	prepare_tx_burst(m_table, n, port, qconf);
>   
>   	ret = rte_eth_tx_burst(port, queueid, m_table, n);
>   	if (unlikely(ret < n)) {
> @@ -1543,7 +1548,7 @@ cryptodevs_init(void)
>   }
>   
>   static void
> -port_init(uint16_t portid)
> +port_init(uint16_t portid, uint64_t req_rx_offloads, uint64_t req_tx_offloads)
>   {
>   	struct rte_eth_dev_info dev_info;
>   	struct rte_eth_txconf *txconf;
> @@ -1584,10 +1589,10 @@ port_init(uint16_t portid)
>   		local_port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME;
>   	}
>   
> -	if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_SECURITY)
> -		local_port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_SECURITY;
> -	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SECURITY)
> -		local_port_conf.txmode.offloads |= DEV_TX_OFFLOAD_SECURITY;
> +	/* Capabilities will already have been checked.. */
> +	local_port_conf.rxmode.offloads |= req_rx_offloads;
> +	local_port_conf.txmode.offloads |= req_tx_offloads;
> +
>   	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
>   		local_port_conf.txmode.offloads |=
>   			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
> @@ -1639,6 +1644,13 @@ port_init(uint16_t portid)
>   
>   		qconf = &lcore_conf[lcore_id];
>   		qconf->tx_queue_id[portid] = tx_queueid;
> +
> +		/* Pre-populate pkt offloads based on capabilities */
> +		qconf->outbound.ipv4_offloads = PKT_TX_IPV4;
> +		qconf->outbound.ipv6_offloads = PKT_TX_IPV6;
> +		if (req_tx_offloads & DEV_TX_OFFLOAD_IPV4_CKSUM)
> +			qconf->outbound.ipv4_offloads |= PKT_TX_IP_CKSUM;
> +
>   		tx_queueid++;
>   
>   		/* init RX queues */
> @@ -1749,6 +1761,7 @@ main(int32_t argc, char **argv)
>   	uint32_t lcore_id;
>   	uint8_t socket_id;
>   	uint16_t portid;
> +	uint64_t req_rx_offloads, req_tx_offloads;
>   
>   	/* init EAL */
>   	ret = rte_eal_init(argc, argv);
> @@ -1804,7 +1817,8 @@ main(int32_t argc, char **argv)
>   		if ((enabled_port_mask & (1 << portid)) == 0)
>   			continue;
>   
> -		port_init(portid);
> +		sa_check_offloads(portid, &req_rx_offloads, &req_tx_offloads);
> +		port_init(portid, req_rx_offloads, req_tx_offloads);
>   	}
>   
>   	cryptodevs_init();
> diff --git a/examples/ipsec-secgw/ipsec.h b/examples/ipsec-secgw/ipsec.h
> index c998c8076..9b1586f52 100644
> --- a/examples/ipsec-secgw/ipsec.h
> +++ b/examples/ipsec-secgw/ipsec.h
> @@ -146,6 +146,8 @@ struct ipsec_ctx {
>   	struct rte_mempool *session_pool;
>   	struct rte_mbuf *ol_pkts[MAX_PKT_BURST] __rte_aligned(sizeof(void *));
>   	uint16_t ol_pkts_cnt;
> +	uint64_t ipv4_offloads;
> +	uint64_t ipv6_offloads;
>   };
>   
>   struct cdev_key {
> @@ -239,4 +241,8 @@ sa_init(struct socket_ctx *ctx, int32_t socket_id);
>   void
>   rt_init(struct socket_ctx *ctx, int32_t socket_id);
>   
> +int
> +sa_check_offloads(uint16_t port_id, uint64_t *rx_offloads,
> +		uint64_t *tx_offloads);
> +
>   #endif /* __IPSEC_H__ */
> diff --git a/examples/ipsec-secgw/sa.c b/examples/ipsec-secgw/sa.c
> index d2d3550a4..ff8c4b829 100644
> --- a/examples/ipsec-secgw/sa.c
> +++ b/examples/ipsec-secgw/sa.c
> @@ -1017,3 +1017,59 @@ outbound_sa_lookup(struct sa_ctx *sa_ctx, uint32_t sa_idx[],
>   	for (i = 0; i < nb_pkts; i++)
>   		sa[i] = &sa_ctx->sa[sa_idx[i]];
>   }
> +
> +/*
> + * Select HW offloads to be used.
> + */
> +int
> +sa_check_offloads(uint16_t port_id, uint64_t *rx_offloads,
> +		uint64_t *tx_offloads)
> +{
> +	struct ipsec_sa *rule;
> +	uint32_t idx_sa;
> +	struct rte_eth_dev_info dev_info;
> +
> +	rte_eth_dev_info_get(port_id, &dev_info);
> +
> +	*rx_offloads = 0;
> +	*tx_offloads = 0;
> +
> +	/* Check for inbound rules that use offloads and use this port */
> +	for (idx_sa = 0; idx_sa < nb_sa_in; idx_sa++) {
> +		rule = &sa_in[idx_sa];
> +		if ((rule->type == RTE_SECURITY_ACTION_TYPE_INLINE_CRYPTO ||
> +				rule->type ==
> +				RTE_SECURITY_ACTION_TYPE_INLINE_PROTOCOL)
> +				&& rule->portid == port_id) {
> +			if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_SECURITY)
> +					== 0) {
> +				RTE_LOG(WARNING, PORT,
> +					"HW RX IPSec is not supported\n");
> +				return -EINVAL;
> +			}
> +			*rx_offloads |= DEV_RX_OFFLOAD_SECURITY;
> +		}
> +	}
> +
> +	/* Check for outbound rules that use offloads and use this port */
> +	for (idx_sa = 0; idx_sa < nb_sa_out; idx_sa++) {
> +		rule = &sa_out[idx_sa];
> +		if ((rule->type == RTE_SECURITY_ACTION_TYPE_INLINE_CRYPTO ||
> +				rule->type ==
> +				RTE_SECURITY_ACTION_TYPE_INLINE_PROTOCOL)
> +				&& rule->portid == port_id) {
> +			if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SECURITY)
> +					== 0) {
> +				RTE_LOG(WARNING, PORT,
> +					"HW TX IPSec is not supported\n");
> +				return -EINVAL;
> +			}
> +			*tx_offloads |= DEV_TX_OFFLOAD_SECURITY;
> +			/* Enable HW IPv4 cksum as well, if it is available */
> +			if (dev_info.tx_offload_capa &
> +					DEV_TX_OFFLOAD_IPV4_CKSUM)
> +				*tx_offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
> +		}
> +	}
> +	return 0;
> +}
  
Ananyev, Konstantin Dec. 21, 2018, 3:58 p.m. UTC | #2
> -----Original Message-----
> From: Akhil Goyal [mailto:akhil.goyal@nxp.com]
> Sent: Friday, December 21, 2018 1:57 PM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org
> Cc: Nicolau, Radu <radu.nicolau@intel.com>; Horton, Remy <remy.horton@intel.com>
> Subject: Re: [dpdk-dev] [PATCH v4 1/9] examples/ipsec-secgw: avoid to request unused TX offloads
> 
> Hi Konstantin,
> 
> On 12/14/2018 10:10 PM, Konstantin Ananyev wrote:
> > ipsec-secgw always enables TX offloads
> > (DEV_TX_OFFLOAD_MULTI_SEGS, DEV_TX_OFFLOAD_SECURITY),
> > even when they are not requested by the config.
> > That causes many PMD to choose full-featured TX function,
> > which in many cases is much slower then one without offloads.
> > That patch adds checks to enabled extra HW offloads, only when
> > they were requested.
> > Plus it enables DEV_TX_OFFLOAD_IPV4_CKSUM,
> > only when other HW TX ofloads are going to be enabled.
> > Otherwise SW version of ip cksum calculation is used.
> > That allows to use vector TX function, when inline-ipsec is not
> > requested.
> >
> > Signed-off-by: Remy Horton <remy.horton@intel.com>
> > Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> > Acked-by: Radu Nicolau <radu.nicolau@intel.com>
> > ---
> >   examples/ipsec-secgw/ipsec-secgw.c | 44 +++++++++++++++--------
> >   examples/ipsec-secgw/ipsec.h       |  6 ++++
> >   examples/ipsec-secgw/sa.c          | 56 ++++++++++++++++++++++++++++++
> >   3 files changed, 91 insertions(+), 15 deletions(-)
> >
> > diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
> > index 1bc0b5b50..cfc2b05e5 100644
> > --- a/examples/ipsec-secgw/ipsec-secgw.c
> > +++ b/examples/ipsec-secgw/ipsec-secgw.c
> > @@ -208,8 +208,6 @@ static struct rte_eth_conf port_conf = {
> >   	},
> >   	.txmode = {
> >   		.mq_mode = ETH_MQ_TX_NONE,
> > -		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
> > -			     DEV_TX_OFFLOAD_MULTI_SEGS),
> I believe this is disabling checksum offload for all cases and then
> enabling only for inline crypto and inline proto.

Yes.

> This is breaking lookaside proto and lookaside none cases. Please
> correct me if I am wrong.

Why breaking?
For cases when HW cksum offload is disabled, IPv4 cksum calculation
will be done in SW, see below:
prepare_tx_pkt(...)
{
   ...
    +
    +		/* calculate IPv4 cksum in SW */
    +		if ((pkt->ol_flags & PKT_TX_IP_CKSUM) == 0)
    +			ip->ip_sum = rte_ipv4_cksum((struct ipv4_hdr *)ip);


We tested the lookaside-none case quite extensively - all works well;
in fact, on Intel NICs it became even a bit faster because of this change
(though not much).
Disabling HW offloads when they are not really required has two benefits:
 1) it allows the app to run on NICs without HW offload support.
 2) it allows dev_configure() for the TX path to select simple/vector TX
    functions, which for many NICs are significantly faster.

Konstantin
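
A minimal schematic sketch of benefit 2) - not code from any particular PMD;
the burst-function names and the simplified condition are illustrative
assumptions only. The point is that drivers commonly pick their TX burst
routine at configure time from the offloads requested in txmode.offloads,
so unconditionally requesting SECURITY/MULTI_SEGS forces the slower
full-featured path:

#include <stdint.h>
#include <rte_common.h>
#include <rte_mbuf.h>
#include <rte_ethdev_driver.h>

static uint16_t
full_featured_tx_burst(void *txq, struct rte_mbuf **pkts, uint16_t n)
{
        /* stub: would handle every offload flag, multi-segment chains, etc. */
        RTE_SET_USED(txq);
        RTE_SET_USED(pkts);
        return n;
}

static uint16_t
simple_tx_burst(void *txq, struct rte_mbuf **pkts, uint16_t n)
{
        /* stub: vector/simple fast path, no offload handling */
        RTE_SET_USED(txq);
        RTE_SET_USED(pkts);
        return n;
}

static eth_tx_burst_t
pick_tx_burst(const struct rte_eth_dev *dev)
{
        uint64_t ol = dev->data->dev_conf.txmode.offloads;

        /* any "heavy" offload requested -> full-featured (scalar) path */
        if (ol & (DEV_TX_OFFLOAD_SECURITY | DEV_TX_OFFLOAD_MULTI_SEGS |
                        DEV_TX_OFFLOAD_IPV4_CKSUM))
                return full_featured_tx_burst;

        return simple_tx_burst;
}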

> So a NACK for this if my understanding is correct.
> >   	},
> >   };
> >
> > @@ -315,7 +313,8 @@ prepare_traffic(struct rte_mbuf **pkts, struct ipsec_traffic *t,
> >   }
> >
> >   static inline void
> > -prepare_tx_pkt(struct rte_mbuf *pkt, uint16_t port)
> > +prepare_tx_pkt(struct rte_mbuf *pkt, uint16_t port,
> > +		const struct lcore_conf *qconf)
> >   {
> >   	struct ip *ip;
> >   	struct ether_hdr *ethhdr;
> > @@ -325,14 +324,19 @@ prepare_tx_pkt(struct rte_mbuf *pkt, uint16_t port)
> >   	ethhdr = (struct ether_hdr *)rte_pktmbuf_prepend(pkt, ETHER_HDR_LEN);
> >
> >   	if (ip->ip_v == IPVERSION) {
> > -		pkt->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
> > +		pkt->ol_flags |= qconf->outbound.ipv4_offloads;
> >   		pkt->l3_len = sizeof(struct ip);
> >   		pkt->l2_len = ETHER_HDR_LEN;
> >
> >   		ip->ip_sum = 0;
> > +
> > +		/* calculate IPv4 cksum in SW */
> > +		if ((pkt->ol_flags & PKT_TX_IP_CKSUM) == 0)
> > +			ip->ip_sum = rte_ipv4_cksum((struct ipv4_hdr *)ip);
> > +
> >   		ethhdr->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
> >   	} else {
> > -		pkt->ol_flags |= PKT_TX_IPV6;
> > +		pkt->ol_flags |= qconf->outbound.ipv6_offloads;
> >   		pkt->l3_len = sizeof(struct ip6_hdr);
> >   		pkt->l2_len = ETHER_HDR_LEN;
> >
> > @@ -346,18 +350,19 @@ prepare_tx_pkt(struct rte_mbuf *pkt, uint16_t port)
> >   }
> >
> >   static inline void
> > -prepare_tx_burst(struct rte_mbuf *pkts[], uint16_t nb_pkts, uint16_t port)
> > +prepare_tx_burst(struct rte_mbuf *pkts[], uint16_t nb_pkts, uint16_t port,
> > +		const struct lcore_conf *qconf)
> >   {
> >   	int32_t i;
> >   	const int32_t prefetch_offset = 2;
> >
> >   	for (i = 0; i < (nb_pkts - prefetch_offset); i++) {
> >   		rte_mbuf_prefetch_part2(pkts[i + prefetch_offset]);
> > -		prepare_tx_pkt(pkts[i], port);
> > +		prepare_tx_pkt(pkts[i], port, qconf);
> >   	}
> >   	/* Process left packets */
> >   	for (; i < nb_pkts; i++)
> > -		prepare_tx_pkt(pkts[i], port);
> > +		prepare_tx_pkt(pkts[i], port, qconf);
> >   }
> >
> >   /* Send burst of packets on an output interface */
> > @@ -371,7 +376,7 @@ send_burst(struct lcore_conf *qconf, uint16_t n, uint16_t port)
> >   	queueid = qconf->tx_queue_id[port];
> >   	m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;
> >
> > -	prepare_tx_burst(m_table, n, port);
> > +	prepare_tx_burst(m_table, n, port, qconf);
> >
> >   	ret = rte_eth_tx_burst(port, queueid, m_table, n);
> >   	if (unlikely(ret < n)) {
> > @@ -1543,7 +1548,7 @@ cryptodevs_init(void)
> >   }
> >
> >   static void
> > -port_init(uint16_t portid)
> > +port_init(uint16_t portid, uint64_t req_rx_offloads, uint64_t req_tx_offloads)
> >   {
> >   	struct rte_eth_dev_info dev_info;
> >   	struct rte_eth_txconf *txconf;
> > @@ -1584,10 +1589,10 @@ port_init(uint16_t portid)
> >   		local_port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME;
> >   	}
> >
> > -	if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_SECURITY)
> > -		local_port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_SECURITY;
> > -	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SECURITY)
> > -		local_port_conf.txmode.offloads |= DEV_TX_OFFLOAD_SECURITY;
> > +	/* Capabilities will already have been checked.. */
> > +	local_port_conf.rxmode.offloads |= req_rx_offloads;
> > +	local_port_conf.txmode.offloads |= req_tx_offloads;
> > +
> >   	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
> >   		local_port_conf.txmode.offloads |=
> >   			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
> > @@ -1639,6 +1644,13 @@ port_init(uint16_t portid)
> >
> >   		qconf = &lcore_conf[lcore_id];
> >   		qconf->tx_queue_id[portid] = tx_queueid;
> > +
> > +		/* Pre-populate pkt offloads based on capabilities */
> > +		qconf->outbound.ipv4_offloads = PKT_TX_IPV4;
> > +		qconf->outbound.ipv6_offloads = PKT_TX_IPV6;
> > +		if (req_tx_offloads & DEV_TX_OFFLOAD_IPV4_CKSUM)
> > +			qconf->outbound.ipv4_offloads |= PKT_TX_IP_CKSUM;
> > +
> >   		tx_queueid++;
> >
> >   		/* init RX queues */
> > @@ -1749,6 +1761,7 @@ main(int32_t argc, char **argv)
> >   	uint32_t lcore_id;
> >   	uint8_t socket_id;
> >   	uint16_t portid;
> > +	uint64_t req_rx_offloads, req_tx_offloads;
> >
> >   	/* init EAL */
> >   	ret = rte_eal_init(argc, argv);
> > @@ -1804,7 +1817,8 @@ main(int32_t argc, char **argv)
> >   		if ((enabled_port_mask & (1 << portid)) == 0)
> >   			continue;
> >
> > -		port_init(portid);
> > +		sa_check_offloads(portid, &req_rx_offloads, &req_tx_offloads);
> > +		port_init(portid, req_rx_offloads, req_tx_offloads);
> >   	}
> >
> >   	cryptodevs_init();
> > diff --git a/examples/ipsec-secgw/ipsec.h b/examples/ipsec-secgw/ipsec.h
> > index c998c8076..9b1586f52 100644
> > --- a/examples/ipsec-secgw/ipsec.h
> > +++ b/examples/ipsec-secgw/ipsec.h
> > @@ -146,6 +146,8 @@ struct ipsec_ctx {
> >   	struct rte_mempool *session_pool;
> >   	struct rte_mbuf *ol_pkts[MAX_PKT_BURST] __rte_aligned(sizeof(void *));
> >   	uint16_t ol_pkts_cnt;
> > +	uint64_t ipv4_offloads;
> > +	uint64_t ipv6_offloads;
> >   };
> >
> >   struct cdev_key {
> > @@ -239,4 +241,8 @@ sa_init(struct socket_ctx *ctx, int32_t socket_id);
> >   void
> >   rt_init(struct socket_ctx *ctx, int32_t socket_id);
> >
> > +int
> > +sa_check_offloads(uint16_t port_id, uint64_t *rx_offloads,
> > +		uint64_t *tx_offloads);
> > +
> >   #endif /* __IPSEC_H__ */
> > diff --git a/examples/ipsec-secgw/sa.c b/examples/ipsec-secgw/sa.c
> > index d2d3550a4..ff8c4b829 100644
> > --- a/examples/ipsec-secgw/sa.c
> > +++ b/examples/ipsec-secgw/sa.c
> > @@ -1017,3 +1017,59 @@ outbound_sa_lookup(struct sa_ctx *sa_ctx, uint32_t sa_idx[],
> >   	for (i = 0; i < nb_pkts; i++)
> >   		sa[i] = &sa_ctx->sa[sa_idx[i]];
> >   }
> > +
> > +/*
> > + * Select HW offloads to be used.
> > + */
> > +int
> > +sa_check_offloads(uint16_t port_id, uint64_t *rx_offloads,
> > +		uint64_t *tx_offloads)
> > +{
> > +	struct ipsec_sa *rule;
> > +	uint32_t idx_sa;
> > +	struct rte_eth_dev_info dev_info;
> > +
> > +	rte_eth_dev_info_get(port_id, &dev_info);
> > +
> > +	*rx_offloads = 0;
> > +	*tx_offloads = 0;
> > +
> > +	/* Check for inbound rules that use offloads and use this port */
> > +	for (idx_sa = 0; idx_sa < nb_sa_in; idx_sa++) {
> > +		rule = &sa_in[idx_sa];
> > +		if ((rule->type == RTE_SECURITY_ACTION_TYPE_INLINE_CRYPTO ||
> > +				rule->type ==
> > +				RTE_SECURITY_ACTION_TYPE_INLINE_PROTOCOL)
> > +				&& rule->portid == port_id) {
> > +			if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_SECURITY)
> > +					== 0) {
> > +				RTE_LOG(WARNING, PORT,
> > +					"HW RX IPSec is not supported\n");
> > +				return -EINVAL;
> > +			}
> > +			*rx_offloads |= DEV_RX_OFFLOAD_SECURITY;
> > +		}
> > +	}
> > +
> > +	/* Check for outbound rules that use offloads and use this port */
> > +	for (idx_sa = 0; idx_sa < nb_sa_out; idx_sa++) {
> > +		rule = &sa_out[idx_sa];
> > +		if ((rule->type == RTE_SECURITY_ACTION_TYPE_INLINE_CRYPTO ||
> > +				rule->type ==
> > +				RTE_SECURITY_ACTION_TYPE_INLINE_PROTOCOL)
> > +				&& rule->portid == port_id) {
> > +			if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SECURITY)
> > +					== 0) {
> > +				RTE_LOG(WARNING, PORT,
> > +					"HW TX IPSec is not supported\n");
> > +				return -EINVAL;
> > +			}
> > +			*tx_offloads |= DEV_TX_OFFLOAD_SECURITY;
> > +			/* Enable HW IPv4 cksum as well, if it is available */
> > +			if (dev_info.tx_offload_capa &
> > +					DEV_TX_OFFLOAD_IPV4_CKSUM)
> > +				*tx_offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
> > +		}
> > +	}
> > +	return 0;
> > +}
  
Akhil Goyal Dec. 24, 2018, 9:45 a.m. UTC | #3
On 12/21/2018 9:28 PM, Ananyev, Konstantin wrote:
>
>> -----Original Message-----
>> From: Akhil Goyal [mailto:akhil.goyal@nxp.com]
>> Sent: Friday, December 21, 2018 1:57 PM
>> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org
>> Cc: Nicolau, Radu <radu.nicolau@intel.com>; Horton, Remy <remy.horton@intel.com>
>> Subject: Re: [dpdk-dev] [PATCH v4 1/9] examples/ipsec-secgw: avoid to request unused TX offloads
>>
>> Hi Konstantin,
>>
>> On 12/14/2018 10:10 PM, Konstantin Ananyev wrote:
>>> ipsec-secgw always enables TX offloads
>>> (DEV_TX_OFFLOAD_MULTI_SEGS, DEV_TX_OFFLOAD_SECURITY),
>>> even when they are not requested by the config.
>>> That causes many PMD to choose full-featured TX function,
>>> which in many cases is much slower then one without offloads.
>>> That patch adds checks to enabled extra HW offloads, only when
>>> they were requested.
>>> Plus it enables DEV_TX_OFFLOAD_IPV4_CKSUM,
>>> only when other HW TX ofloads are going to be enabled.
>>> Otherwise SW version of ip cksum calculation is used.
>>> That allows to use vector TX function, when inline-ipsec is not
>>> requested.
>>>
>>> Signed-off-by: Remy Horton <remy.horton@intel.com>
>>> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
>>> Acked-by: Radu Nicolau <radu.nicolau@intel.com>
>>> ---
>>>    examples/ipsec-secgw/ipsec-secgw.c | 44 +++++++++++++++--------
>>>    examples/ipsec-secgw/ipsec.h       |  6 ++++
>>>    examples/ipsec-secgw/sa.c          | 56 ++++++++++++++++++++++++++++++
>>>    3 files changed, 91 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
>>> index 1bc0b5b50..cfc2b05e5 100644
>>> --- a/examples/ipsec-secgw/ipsec-secgw.c
>>> +++ b/examples/ipsec-secgw/ipsec-secgw.c
>>> @@ -208,8 +208,6 @@ static struct rte_eth_conf port_conf = {
>>>    	},
>>>    	.txmode = {
>>>    		.mq_mode = ETH_MQ_TX_NONE,
>>> -		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
>>> -			     DEV_TX_OFFLOAD_MULTI_SEGS),
>> I believe this is disabling checksum offload for all cases and then
>> enabling only for inline crypto and inline proto.
> Yes.
>
>> This is breaking lookaside proto and lookaside none cases. Please
>> correct me if I am wrong.
> Why breaking?
A reduction in performance is kind of breaking the code.
> For cases when HW cksum offload is disabled, IPv4 cksum calculation
> will be done in SW, see below:
> prepare_tx_pkt(...)
> {
>     ...
>      +
>      +		/* calculate IPv4 cksum in SW */
>      +		if ((pkt->ol_flags & PKT_TX_IP_CKSUM) == 0)
>      +			ip->ip_sum = rte_ipv4_cksum((struct ipv4_hdr *)ip);
>
>
> We tested lookaside-none case quite extensively - all works well,
> in fact on Intel NICs it became even a bit faster because of that change
> (though not much).
Yes, it may work well on one hardware platform, but may not perform well on
other hardware where cores are limited.
> Disabling HW offloads when they are not really required has 2 benefits:
>   1) allows app to be run on NICs without HW offloads support.
>   2) allows dev_configure() for TX path to select simple/vector TX functions
>       which for many NICs are significantly faster.
>
> Konstantin
>
>> So a NACK for this if my understanding is correct.
>>
  
Ananyev, Konstantin Dec. 24, 2018, 10:19 a.m. UTC | #4
> >>
> >> On 12/14/2018 10:10 PM, Konstantin Ananyev wrote:
> >>> ipsec-secgw always enables TX offloads
> >>> (DEV_TX_OFFLOAD_MULTI_SEGS, DEV_TX_OFFLOAD_SECURITY),
> >>> even when they are not requested by the config.
> >>> That causes many PMD to choose full-featured TX function,
> >>> which in many cases is much slower then one without offloads.
> >>> That patch adds checks to enabled extra HW offloads, only when
> >>> they were requested.
> >>> Plus it enables DEV_TX_OFFLOAD_IPV4_CKSUM,
> >>> only when other HW TX ofloads are going to be enabled.
> >>> Otherwise SW version of ip cksum calculation is used.
> >>> That allows to use vector TX function, when inline-ipsec is not
> >>> requested.
> >>>
> >>> Signed-off-by: Remy Horton <remy.horton@intel.com>
> >>> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> >>> Acked-by: Radu Nicolau <radu.nicolau@intel.com>
> >>> ---
> >>>    examples/ipsec-secgw/ipsec-secgw.c | 44 +++++++++++++++--------
> >>>    examples/ipsec-secgw/ipsec.h       |  6 ++++
> >>>    examples/ipsec-secgw/sa.c          | 56 ++++++++++++++++++++++++++++++
> >>>    3 files changed, 91 insertions(+), 15 deletions(-)
> >>>
> >>> diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
> >>> index 1bc0b5b50..cfc2b05e5 100644
> >>> --- a/examples/ipsec-secgw/ipsec-secgw.c
> >>> +++ b/examples/ipsec-secgw/ipsec-secgw.c
> >>> @@ -208,8 +208,6 @@ static struct rte_eth_conf port_conf = {
> >>>    	},
> >>>    	.txmode = {
> >>>    		.mq_mode = ETH_MQ_TX_NONE,
> >>> -		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
> >>> -			     DEV_TX_OFFLOAD_MULTI_SEGS),
> >> I believe this is disabling checksum offload for all cases and then
> >> enabling only for inline crypto and inline proto.
> > Yes.
> >
> >> This is breaking lookaside proto and lookaside none cases. Please
> >> correct me if I am wrong.
> > Why breaking?
> reduction in performance is kind of breaking the code.

I didn’t observe any performance drop with that patch.
In fact there was a tiny improvement (see below).
Did you see any regression with this patch on your HW?

> > For cases when HW cksum offload is disabled, IPv4 cksum calculation
> > will be done in SW, see below:
> > prepare_tx_pkt(...)
> > {
> >     ...
> >      +
> >      +		/* calculate IPv4 cksum in SW */
> >      +		if ((pkt->ol_flags & PKT_TX_IP_CKSUM) == 0)
> >      +			ip->ip_sum = rte_ipv4_cksum((struct ipv4_hdr *)ip);
> >
> >
> > We tested lookaside-none case quite extensively - all works well,
> > in fact on Intel NICs it became even a bit faster because of that change
> > (though not much).
> yes, it may work well on one hardware, but may not perform good in other
> hardware where cores are limited.

Could you elaborate a bit more on what you mean by 'cores are limited' here?
Do you mean that for some low-end CPUs calculating the IPv4 cksum in SW is too expensive?
Note that prepare_tx_pkts() and friends read/write L2/L3 packet headers anyway -
so the IPv4 header will be in the L1 cache already.

> > Disabling HW offloads when they are not really required has 2 benefits:
> >   1) allows app to be run on NICs without HW offloads support.
> >   2) allows dev_configure() for TX path to select simple/vector TX functions
> >       which for many NICs are significantly faster.
> >
> > Konstantin
> >
> >> So a NACK for this if my understanding is correct.
> >>
  
Akhil Goyal Dec. 24, 2018, 10:54 a.m. UTC | #5
On 12/24/2018 3:49 PM, Ananyev, Konstantin wrote:
>
>>>> On 12/14/2018 10:10 PM, Konstantin Ananyev wrote:
>>>>> ipsec-secgw always enables TX offloads
>>>>> (DEV_TX_OFFLOAD_MULTI_SEGS, DEV_TX_OFFLOAD_SECURITY),
>>>>> even when they are not requested by the config.
>>>>> That causes many PMD to choose full-featured TX function,
>>>>> which in many cases is much slower then one without offloads.
>>>>> That patch adds checks to enabled extra HW offloads, only when
>>>>> they were requested.
>>>>> Plus it enables DEV_TX_OFFLOAD_IPV4_CKSUM,
>>>>> only when other HW TX ofloads are going to be enabled.
>>>>> Otherwise SW version of ip cksum calculation is used.
>>>>> That allows to use vector TX function, when inline-ipsec is not
>>>>> requested.
>>>>>
>>>>> Signed-off-by: Remy Horton <remy.horton@intel.com>
>>>>> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
>>>>> Acked-by: Radu Nicolau <radu.nicolau@intel.com>
>>>>> ---
>>>>>     examples/ipsec-secgw/ipsec-secgw.c | 44 +++++++++++++++--------
>>>>>     examples/ipsec-secgw/ipsec.h       |  6 ++++
>>>>>     examples/ipsec-secgw/sa.c          | 56 ++++++++++++++++++++++++++++++
>>>>>     3 files changed, 91 insertions(+), 15 deletions(-)
>>>>>
>>>>> diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
>>>>> index 1bc0b5b50..cfc2b05e5 100644
>>>>> --- a/examples/ipsec-secgw/ipsec-secgw.c
>>>>> +++ b/examples/ipsec-secgw/ipsec-secgw.c
>>>>> @@ -208,8 +208,6 @@ static struct rte_eth_conf port_conf = {
>>>>>     	},
>>>>>     	.txmode = {
>>>>>     		.mq_mode = ETH_MQ_TX_NONE,
>>>>> -		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
>>>>> -			     DEV_TX_OFFLOAD_MULTI_SEGS),
>>>> I believe this is disabling checksum offload for all cases and then
>>>> enabling only for inline crypto and inline proto.
>>> Yes.
>>>
>>>> This is breaking lookaside proto and lookaside none cases. Please
>>>> correct me if I am wrong.
>>> Why breaking?
>> reduction in performance is kind of breaking the code.
> I didn’t observe any performance drop with that patch.
> In fact there was a tiny improvement (see below).
> Did you see any regression with this patch on your HW?
NXP hardware ranges from low-end to mid-end devices and we are always
bottlenecked by core cycles.
So we would like to offload as much to HW as possible.
>
>>> For cases when HW cksum offload is disabled, IPv4 cksum calculation
>>> will be done in SW, see below:
>>> prepare_tx_pkt(...)
>>> {
>>>      ...
>>>       +
>>>       +		/* calculate IPv4 cksum in SW */
>>>       +		if ((pkt->ol_flags & PKT_TX_IP_CKSUM) == 0)
>>>       +			ip->ip_sum = rte_ipv4_cksum((struct ipv4_hdr *)ip);
>>>
>>>
>>> We tested lookaside-none case quite extensively - all works well,
>>> in fact on Intel NICs it became even a bit faster because of that change
>>> (though not much).
>> yes, it may work well on one hardware, but may not perform good in other
>> hardware where cores are limited.
> Could you elaborate a bit more what do you mean by 'cores are limited' here?
We have single-core devices as well, on which we run ipsec-secgw.
> Do you mean that for some low end cpus calculating IPv4 cksum in SW is too expensive?
Yes, limited by core cycles and not by HW.
> Note that prepare_tx_pkts() and friends read/write L2/L3 packet headers anyway -
> so IPv4 header will be in L1 cache already.
Agreed, but it will still consume some cycles, which is more than the
HW would.
>
>>> Disabling HW offloads when they are not really required has 2 benefits:
>>>    1) allows app to be run on NICs without HW offloads support.
>>>    2) allows dev_configure() for TX path to select simple/vector TX functions
>>>        which for many NICs are significantly faster.
>>>
>>> Konstantin
>>>
>>>> So a NACK for this if my understanding is correct.
>>>>
  
Akhil Goyal Dec. 24, 2018, 10:55 a.m. UTC | #6
On 12/24/2018 4:24 PM, Akhil Goyal wrote:
>
> On 12/24/2018 3:49 PM, Ananyev, Konstantin wrote:
>>>>> On 12/14/2018 10:10 PM, Konstantin Ananyev wrote:
>>>>>> ipsec-secgw always enables TX offloads
>>>>>> (DEV_TX_OFFLOAD_MULTI_SEGS, DEV_TX_OFFLOAD_SECURITY),
>>>>>> even when they are not requested by the config.
>>>>>> That causes many PMD to choose full-featured TX function,
>>>>>> which in many cases is much slower then one without offloads.
>>>>>> That patch adds checks to enabled extra HW offloads, only when
>>>>>> they were requested.
>>>>>> Plus it enables DEV_TX_OFFLOAD_IPV4_CKSUM,
>>>>>> only when other HW TX ofloads are going to be enabled.
>>>>>> Otherwise SW version of ip cksum calculation is used.
>>>>>> That allows to use vector TX function, when inline-ipsec is not
>>>>>> requested.
>>>>>>
>>>>>> Signed-off-by: Remy Horton <remy.horton@intel.com>
>>>>>> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
>>>>>> Acked-by: Radu Nicolau <radu.nicolau@intel.com>
>>>>>> ---
>>>>>>      examples/ipsec-secgw/ipsec-secgw.c | 44 +++++++++++++++--------
>>>>>>      examples/ipsec-secgw/ipsec.h       |  6 ++++
>>>>>>      examples/ipsec-secgw/sa.c          | 56 ++++++++++++++++++++++++++++++
>>>>>>      3 files changed, 91 insertions(+), 15 deletions(-)
>>>>>>
>>>>>> diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
>>>>>> index 1bc0b5b50..cfc2b05e5 100644
>>>>>> --- a/examples/ipsec-secgw/ipsec-secgw.c
>>>>>> +++ b/examples/ipsec-secgw/ipsec-secgw.c
>>>>>> @@ -208,8 +208,6 @@ static struct rte_eth_conf port_conf = {
>>>>>>      	},
>>>>>>      	.txmode = {
>>>>>>      		.mq_mode = ETH_MQ_TX_NONE,
>>>>>> -		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
>>>>>> -			     DEV_TX_OFFLOAD_MULTI_SEGS),
>>>>> I believe this is disabling checksum offload for all cases and then
>>>>> enabling only for inline crypto and inline proto.
>>>> Yes.
>>>>
>>>>> This is breaking lookaside proto and lookaside none cases. Please
>>>>> correct me if I am wrong.
>>>> Why breaking?
>>> reduction in performance is kind of breaking the code.
>> I didn’t observe any performance drop with that patch.
>> In fact there was a tiny improvement (see below).
>> Did you see any regression with this patch on your HW?
> NXP hardware are low -end to mid end devices and we are always
> bottleneck by core cycles.
> So we would like to have as much offloads to HW as possible.
>>>> For cases when HW cksum offload is disabled, IPv4 cksum calculation
>>>> will be done in SW, see below:
>>>> prepare_tx_pkt(...)
>>>> {
>>>>       ...
>>>>        +
>>>>        +		/* calculate IPv4 cksum in SW */
>>>>        +		if ((pkt->ol_flags & PKT_TX_IP_CKSUM) == 0)
>>>>        +			ip->ip_sum = rte_ipv4_cksum((struct ipv4_hdr *)ip);
>>>>
>>>>
>>>> We tested lookaside-none case quite extensively - all works well,
>>>> in fact on Intel NICs it became even a bit faster because of that change
>>>> (though not much).
>>> yes, it may work well on one hardware, but may not perform good in other
>>> hardware where cores are limited.
>> Could you elaborate a bit more what do you mean by 'cores are limited' here?
> we have single core devices as well on which we run ipsec-secgw.
>> Do you mean that for some low end cpus calculating IPv4 cksum in SW is too expensive?
> yes, limited by core cycles and not by HW
>> Note that prepare_tx_pkts() and friends read/write L2/L3 packet headers anyway -
>> so IPv4 header will be in L1 cache already.
> Agreed, but still it will consume some cycles which are more than that
> of HW.
The impact will be smaller in lookaside none, but larger in lookaside
offload.
>>>> Disabling HW offloads when they are not really required has 2 benefits:
>>>>     1) allows app to be run on NICs without HW offloads support.
>>>>     2) allows dev_configure() for TX path to select simple/vector TX functions
>>>>         which for many NICs are significantly faster.
>>>>
>>>> Konstantin
>>>>
>>>>> So a NACK for this if my understanding is correct.
>>>>>
  
Ananyev, Konstantin Dec. 24, 2018, 11:22 a.m. UTC | #7
> -----Original Message-----
> From: Akhil Goyal [mailto:akhil.goyal@nxp.com]
> Sent: Monday, December 24, 2018 10:54 AM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org
> Cc: Nicolau, Radu <radu.nicolau@intel.com>; Horton, Remy <remy.horton@intel.com>
> Subject: Re: [dpdk-dev] [PATCH v4 1/9] examples/ipsec-secgw: avoid to request unused TX offloads
> 
> 
> 
> On 12/24/2018 3:49 PM, Ananyev, Konstantin wrote:
> >
> >>>> On 12/14/2018 10:10 PM, Konstantin Ananyev wrote:
> >>>>> ipsec-secgw always enables TX offloads
> >>>>> (DEV_TX_OFFLOAD_MULTI_SEGS, DEV_TX_OFFLOAD_SECURITY),
> >>>>> even when they are not requested by the config.
> >>>>> That causes many PMD to choose full-featured TX function,
> >>>>> which in many cases is much slower then one without offloads.
> >>>>> That patch adds checks to enabled extra HW offloads, only when
> >>>>> they were requested.
> >>>>> Plus it enables DEV_TX_OFFLOAD_IPV4_CKSUM,
> >>>>> only when other HW TX ofloads are going to be enabled.
> >>>>> Otherwise SW version of ip cksum calculation is used.
> >>>>> That allows to use vector TX function, when inline-ipsec is not
> >>>>> requested.
> >>>>>
> >>>>> Signed-off-by: Remy Horton <remy.horton@intel.com>
> >>>>> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> >>>>> Acked-by: Radu Nicolau <radu.nicolau@intel.com>
> >>>>> ---
> >>>>>     examples/ipsec-secgw/ipsec-secgw.c | 44 +++++++++++++++--------
> >>>>>     examples/ipsec-secgw/ipsec.h       |  6 ++++
> >>>>>     examples/ipsec-secgw/sa.c          | 56 ++++++++++++++++++++++++++++++
> >>>>>     3 files changed, 91 insertions(+), 15 deletions(-)
> >>>>>
> >>>>> diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
> >>>>> index 1bc0b5b50..cfc2b05e5 100644
> >>>>> --- a/examples/ipsec-secgw/ipsec-secgw.c
> >>>>> +++ b/examples/ipsec-secgw/ipsec-secgw.c
> >>>>> @@ -208,8 +208,6 @@ static struct rte_eth_conf port_conf = {
> >>>>>     	},
> >>>>>     	.txmode = {
> >>>>>     		.mq_mode = ETH_MQ_TX_NONE,
> >>>>> -		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
> >>>>> -			     DEV_TX_OFFLOAD_MULTI_SEGS),
> >>>> I believe this is disabling checksum offload for all cases and then
> >>>> enabling only for inline crypto and inline proto.
> >>> Yes.
> >>>
> >>>> This is breaking lookaside proto and lookaside none cases. Please
> >>>> correct me if I am wrong.
> >>> Why breaking?
> >> reduction in performance is kind of breaking the code.
> > I didn’t observe any performance drop with that patch.
> > In fact there was a tiny improvement (see below).
> > Did you see any regression with this patch on your HW?
> NXP hardware are low -end to mid end devices and we are always
> bottleneck by core cycles.
> So we would like to have as much offloads to HW as possible.

Ok, then I suppose we need to introduce new cmd-line options,
something like --txoffloads=<tx_offload_mask> --rx_offloads=<rx_offload_mask>,
to keep everyone happy.
Are you ok with that?
Konstantin 
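
A rough self-contained sketch of how such mask options might be parsed -
the consistent rx_offloads/tx_offloads spellings, helper names and getopt
wiring below are assumptions for illustration, not part of any posted patch:

#include <errno.h>
#include <getopt.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint64_t req_rx_offloads_mask;   /* hypothetical globals */
static uint64_t req_tx_offloads_mask;

/* accept a decimal or 0x-prefixed hex offload mask */
static int
parse_offload_mask(const char *arg, uint64_t *mask)
{
        char *end = NULL;

        errno = 0;
        *mask = strtoull(arg, &end, 0);
        if (errno != 0 || end == arg || *end != '\0')
                return -1;
        return 0;
}

static int
parse_offload_args(int argc, char **argv)
{
        static const struct option lgopts[] = {
                {"rx_offloads", required_argument, NULL, 'r'},
                {"tx_offloads", required_argument, NULL, 't'},
                {NULL, 0, NULL, 0}
        };
        int opt;

        while ((opt = getopt_long(argc, argv, "", lgopts, NULL)) != -1) {
                if (opt == 'r' &&
                    parse_offload_mask(optarg, &req_rx_offloads_mask) == 0)
                        continue;
                if (opt == 't' &&
                    parse_offload_mask(optarg, &req_tx_offloads_mask) == 0)
                        continue;
                fprintf(stderr, "invalid offload option or mask\n");
                return -1;
        }
        return 0;
}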

> >
> >>> For cases when HW cksum offload is disabled, IPv4 cksum calculation
> >>> will be done in SW, see below:
> >>> prepare_tx_pkt(...)
> >>> {
> >>>      ...
> >>>       +
> >>>       +		/* calculate IPv4 cksum in SW */
> >>>       +		if ((pkt->ol_flags & PKT_TX_IP_CKSUM) == 0)
> >>>       +			ip->ip_sum = rte_ipv4_cksum((struct ipv4_hdr *)ip);
> >>>
> >>>
> >>> We tested lookaside-none case quite extensively - all works well,
> >>> in fact on Intel NICs it became even a bit faster because of that change
> >>> (though not much).
> >> yes, it may work well on one hardware, but may not perform good in other
> >> hardware where cores are limited.
> > Could you elaborate a bit more what do you mean by 'cores are limited' here?
> we have single core devices as well on which we run ipsec-secgw.
> > Do you mean that for some low end cpus calculating IPv4 cksum in SW is too expensive?
> yes, limited by core cycles and not by HW
> > Note that prepare_tx_pkts() and friends read/write L2/L3 packet headers anyway -
> > so IPv4 header will be in L1 cache already.
> Agreed, but still it will consume some cycles which are more than that
> of HW.
> >
> >>> Disabling HW offloads when they are not really required has 2 benefits:
> >>>    1) allows app to be run on NICs without HW offloads support.
> >>>    2) allows dev_configure() for TX path to select simple/vector TX functions
> >>>        which for many NICs are significantly faster.
> >>>
> >>> Konstantin
> >>>
> >>>> So a NACK for this if my understanding is correct.
> >>>>
  
Akhil Goyal Dec. 24, 2018, 11:24 a.m. UTC | #8
On 12/24/2018 4:52 PM, Ananyev, Konstantin wrote:
>
>> -----Original Message-----
>> From: Akhil Goyal [mailto:akhil.goyal@nxp.com]
>> Sent: Monday, December 24, 2018 10:54 AM
>> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org
>> Cc: Nicolau, Radu <radu.nicolau@intel.com>; Horton, Remy <remy.horton@intel.com>
>> Subject: Re: [dpdk-dev] [PATCH v4 1/9] examples/ipsec-secgw: avoid to request unused TX offloads
>>
>>
>>
>> On 12/24/2018 3:49 PM, Ananyev, Konstantin wrote:
>>>>>> On 12/14/2018 10:10 PM, Konstantin Ananyev wrote:
>>>>>>> ipsec-secgw always enables TX offloads
>>>>>>> (DEV_TX_OFFLOAD_MULTI_SEGS, DEV_TX_OFFLOAD_SECURITY),
>>>>>>> even when they are not requested by the config.
>>>>>>> That causes many PMD to choose full-featured TX function,
>>>>>>> which in many cases is much slower then one without offloads.
>>>>>>> That patch adds checks to enabled extra HW offloads, only when
>>>>>>> they were requested.
>>>>>>> Plus it enables DEV_TX_OFFLOAD_IPV4_CKSUM,
>>>>>>> only when other HW TX ofloads are going to be enabled.
>>>>>>> Otherwise SW version of ip cksum calculation is used.
>>>>>>> That allows to use vector TX function, when inline-ipsec is not
>>>>>>> requested.
>>>>>>>
>>>>>>> Signed-off-by: Remy Horton <remy.horton@intel.com>
>>>>>>> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
>>>>>>> Acked-by: Radu Nicolau <radu.nicolau@intel.com>
>>>>>>> ---
>>>>>>>      examples/ipsec-secgw/ipsec-secgw.c | 44 +++++++++++++++--------
>>>>>>>      examples/ipsec-secgw/ipsec.h       |  6 ++++
>>>>>>>      examples/ipsec-secgw/sa.c          | 56 ++++++++++++++++++++++++++++++
>>>>>>>      3 files changed, 91 insertions(+), 15 deletions(-)
>>>>>>>
>>>>>>> diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
>>>>>>> index 1bc0b5b50..cfc2b05e5 100644
>>>>>>> --- a/examples/ipsec-secgw/ipsec-secgw.c
>>>>>>> +++ b/examples/ipsec-secgw/ipsec-secgw.c
>>>>>>> @@ -208,8 +208,6 @@ static struct rte_eth_conf port_conf = {
>>>>>>>      	},
>>>>>>>      	.txmode = {
>>>>>>>      		.mq_mode = ETH_MQ_TX_NONE,
>>>>>>> -		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
>>>>>>> -			     DEV_TX_OFFLOAD_MULTI_SEGS),
>>>>>> I believe this is disabling checksum offload for all cases and then
>>>>>> enabling only for inline crypto and inline proto.
>>>>> Yes.
>>>>>
>>>>>> This is breaking lookaside proto and lookaside none cases. Please
>>>>>> correct me if I am wrong.
>>>>> Why breaking?
>>>> reduction in performance is kind of breaking the code.
>>> I didn’t observe any performance drop with that patch.
>>> In fact there was a tiny improvement (see below).
>>> Did you see any regression with this patch on your HW?
>> NXP hardware are low -end to mid end devices and we are always
>> bottleneck by core cycles.
>> So we would like to have as much offloads to HW as possible.
> Ok, then I suppose we need to introduce new cmd-line options,
> Something like: --txoffloads=<tx_offload_mask> --rx_offloads=<rx_offload_mask>
> to keep everyone happy.
> Are you ok with that?
I think it should be taken from the PMD capabilities. A cmd-line option for
every parameter will make it very complex.
> Konstantin
>
>>>>> For cases when HW cksum offload is disabled, IPv4 cksum calculation
>>>>> will be done in SW, see below:
>>>>> prepare_tx_pkt(...)
>>>>> {
>>>>>       ...
>>>>>        +
>>>>>        +		/* calculate IPv4 cksum in SW */
>>>>>        +		if ((pkt->ol_flags & PKT_TX_IP_CKSUM) == 0)
>>>>>        +			ip->ip_sum = rte_ipv4_cksum((struct ipv4_hdr *)ip);
>>>>>
>>>>>
>>>>> We tested lookaside-none case quite extensively - all works well,
>>>>> in fact on Intel NICs it became even a bit faster because of that change
>>>>> (though not much).
>>>> yes, it may work well on one hardware, but may not perform good in other
>>>> hardware where cores are limited.
>>> Could you elaborate a bit more what do you mean by 'cores are limited' here?
>> we have single core devices as well on which we run ipsec-secgw.
>>> Do you mean that for some low end cpus calculating IPv4 cksum in SW is too expensive?
>> yes, limited by core cycles and not by HW
>>> Note that prepare_tx_pkts() and friends read/write L2/L3 packet headers anyway -
>>> so IPv4 header will be in L1 cache already.
>> Agreed, but still it will consume some cycles which are more than that
>> of HW.
>>>>> Disabling HW offloads when they are not really required has 2 benefits:
>>>>>     1) allows app to be run on NICs without HW offloads support.
>>>>>     2) allows dev_configure() for TX path to select simple/vector TX functions
>>>>>         which for many NICs are significantly faster.
>>>>>
>>>>> Konstantin
>>>>>
>>>>>> So a NACK for this if my understanding is correct.
>>>>>>
  
Ananyev, Konstantin Dec. 24, 2018, 11:37 a.m. UTC | #9
> -----Original Message-----
> From: Akhil Goyal [mailto:akhil.goyal@nxp.com]
> Sent: Monday, December 24, 2018 11:25 AM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org
> Cc: Nicolau, Radu <radu.nicolau@intel.com>; Horton, Remy <remy.horton@intel.com>
> Subject: Re: [dpdk-dev] [PATCH v4 1/9] examples/ipsec-secgw: avoid to request unused TX offloads
> 
> 
> 
> On 12/24/2018 4:52 PM, Ananyev, Konstantin wrote:
> >
> >> -----Original Message-----
> >> From: Akhil Goyal [mailto:akhil.goyal@nxp.com]
> >> Sent: Monday, December 24, 2018 10:54 AM
> >> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org
> >> Cc: Nicolau, Radu <radu.nicolau@intel.com>; Horton, Remy <remy.horton@intel.com>
> >> Subject: Re: [dpdk-dev] [PATCH v4 1/9] examples/ipsec-secgw: avoid to request unused TX offloads
> >>
> >>
> >>
> >> On 12/24/2018 3:49 PM, Ananyev, Konstantin wrote:
> >>>>>> On 12/14/2018 10:10 PM, Konstantin Ananyev wrote:
> >>>>>>> ipsec-secgw always enables TX offloads
> >>>>>>> (DEV_TX_OFFLOAD_MULTI_SEGS, DEV_TX_OFFLOAD_SECURITY),
> >>>>>>> even when they are not requested by the config.
> >>>>>>> That causes many PMD to choose full-featured TX function,
> >>>>>>> which in many cases is much slower then one without offloads.
> >>>>>>> That patch adds checks to enabled extra HW offloads, only when
> >>>>>>> they were requested.
> >>>>>>> Plus it enables DEV_TX_OFFLOAD_IPV4_CKSUM,
> >>>>>>> only when other HW TX ofloads are going to be enabled.
> >>>>>>> Otherwise SW version of ip cksum calculation is used.
> >>>>>>> That allows to use vector TX function, when inline-ipsec is not
> >>>>>>> requested.
> >>>>>>>
> >>>>>>> Signed-off-by: Remy Horton <remy.horton@intel.com>
> >>>>>>> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
> >>>>>>> Acked-by: Radu Nicolau <radu.nicolau@intel.com>
> >>>>>>> ---
> >>>>>>>      examples/ipsec-secgw/ipsec-secgw.c | 44 +++++++++++++++--------
> >>>>>>>      examples/ipsec-secgw/ipsec.h       |  6 ++++
> >>>>>>>      examples/ipsec-secgw/sa.c          | 56 ++++++++++++++++++++++++++++++
> >>>>>>>      3 files changed, 91 insertions(+), 15 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
> >>>>>>> index 1bc0b5b50..cfc2b05e5 100644
> >>>>>>> --- a/examples/ipsec-secgw/ipsec-secgw.c
> >>>>>>> +++ b/examples/ipsec-secgw/ipsec-secgw.c
> >>>>>>> @@ -208,8 +208,6 @@ static struct rte_eth_conf port_conf = {
> >>>>>>>      	},
> >>>>>>>      	.txmode = {
> >>>>>>>      		.mq_mode = ETH_MQ_TX_NONE,
> >>>>>>> -		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
> >>>>>>> -			     DEV_TX_OFFLOAD_MULTI_SEGS),
> >>>>>> I believe this is disabling checksum offload for all cases and then
> >>>>>> enabling only for inline crypto and inline proto.
> >>>>> Yes.
> >>>>>
> >>>>>> This is breaking lookaside proto and lookaside none cases. Please
> >>>>>> correct me if I am wrong.
> >>>>> Why breaking?
> >>>> reduction in performance is kind of breaking the code.
> >>> I didn’t observe any performance drop with that patch.
> >>> In fact there was a tiny improvement (see below).
> >>> Did you see any regression with this patch on your HW?
> >> NXP hardware are low -end to mid end devices and we are always
> >> bottleneck by core cycles.
> >> So we would like to have as much offloads to HW as possible.
> > Ok, then I suppose we need to introduce new cmd-line options,
> > Something like: --txoffloads=<tx_offload_mask> --rx_offloads=<rx_offload_mask>
> > to keep everyone happy.
> > Are you ok with that?
> I think it should be taken from the PMD capabilities. 

I don't see how.
Let's say Intel NICs do support HW IPv4 cksum offload, but we don't want
to enable it on its own - only if IPsec offload is also enabled.
On the other side, you want HW IPv4 cksum offload to be always enabled
if present.
As I see it, to fulfill everyone's needs we need to give the user the ability
to specify which HW offloads to use.
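
As a minimal sketch of that idea (an assumed helper, not code from the
patch): a user-supplied TX mask would be validated against what the port
reports and only then merged into the port configuration:

#include <errno.h>
#include <stdint.h>
#include <rte_ethdev.h>

/* hypothetical helper: honour a user-requested TX offload mask only if
 * the PMD supports every requested bit */
static int
apply_user_tx_offloads(uint16_t port_id, uint64_t user_mask,
                struct rte_eth_conf *conf)
{
        struct rte_eth_dev_info dev_info;

        rte_eth_dev_info_get(port_id, &dev_info);

        /* refuse instead of silently dropping unsupported offloads */
        if ((user_mask & dev_info.tx_offload_capa) != user_mask)
                return -EINVAL;

        conf->txmode.offloads |= user_mask;
        return 0;
}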

> cmd line for every
> parameter will make it very complex.

> > Konstantin
> >
> >>>>> For cases when HW cksum offload is disabled, IPv4 cksum calculation
> >>>>> will be done in SW, see below:
> >>>>> prepare_tx_pkt(...)
> >>>>> {
> >>>>>       ...
> >>>>>        +
> >>>>>        +		/* calculate IPv4 cksum in SW */
> >>>>>        +		if ((pkt->ol_flags & PKT_TX_IP_CKSUM) == 0)
> >>>>>        +			ip->ip_sum = rte_ipv4_cksum((struct ipv4_hdr *)ip);
> >>>>>
> >>>>>
> >>>>> We tested lookaside-none case quite extensively - all works well,
> >>>>> in fact on Intel NICs it became even a bit faster because of that change
> >>>>> (though not much).
> >>>> yes, it may work well on one hardware, but may not perform good in other
> >>>> hardware where cores are limited.
> >>> Could you elaborate a bit more what do you mean by 'cores are limited' here?
> >> we have single core devices as well on which we run ipsec-secgw.
> >>> Do you mean that for some low end cpus calculating IPv4 cksum in SW is too expensive?
> >> yes, limited by core cycles and not by HW
> >>> Note that prepare_tx_pkts() and friends read/write L2/L3 packet headers anyway -
> >>> so IPv4 header will be in L1 cache already.
> >> Agreed, but still it will consume some cycles which are more than that
> >> of HW.
> >>>>> Disabling HW offloads when they are not really required has 2 benefits:
> >>>>>     1) allows app to be run on NICs without HW offloads support.
> >>>>>     2) allows dev_configure() for TX path to select simple/vector TX functions
> >>>>>         which for many NICs are significantly faster.
> >>>>>
> >>>>> Konstantin
> >>>>>
> >>>>>> So a NACK for this if my understanding is correct.
> >>>>>>
  
Akhil Goyal Dec. 24, 2018, 12:31 p.m. UTC | #10
On 12/24/2018 5:07 PM, Ananyev, Konstantin wrote:
>
>> -----Original Message-----
>> From: Akhil Goyal [mailto:akhil.goyal@nxp.com]
>> Sent: Monday, December 24, 2018 11:25 AM
>> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org
>> Cc: Nicolau, Radu <radu.nicolau@intel.com>; Horton, Remy <remy.horton@intel.com>
>> Subject: Re: [dpdk-dev] [PATCH v4 1/9] examples/ipsec-secgw: avoid to request unused TX offloads
>>
>>
>>
>> On 12/24/2018 4:52 PM, Ananyev, Konstantin wrote:
>>>> -----Original Message-----
>>>> From: Akhil Goyal [mailto:akhil.goyal@nxp.com]
>>>> Sent: Monday, December 24, 2018 10:54 AM
>>>> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; dev@dpdk.org
>>>> Cc: Nicolau, Radu <radu.nicolau@intel.com>; Horton, Remy <remy.horton@intel.com>
>>>> Subject: Re: [dpdk-dev] [PATCH v4 1/9] examples/ipsec-secgw: avoid to request unused TX offloads
>>>>
>>>>
>>>>
>>>> On 12/24/2018 3:49 PM, Ananyev, Konstantin wrote:
>>>>>>>> On 12/14/2018 10:10 PM, Konstantin Ananyev wrote:
>>>>>>>>> ipsec-secgw always enables TX offloads
>>>>>>>>> (DEV_TX_OFFLOAD_MULTI_SEGS, DEV_TX_OFFLOAD_SECURITY),
>>>>>>>>> even when they are not requested by the config.
>>>>>>>>> That causes many PMD to choose full-featured TX function,
>>>>>>>>> which in many cases is much slower then one without offloads.
>>>>>>>>> That patch adds checks to enabled extra HW offloads, only when
>>>>>>>>> they were requested.
>>>>>>>>> Plus it enables DEV_TX_OFFLOAD_IPV4_CKSUM,
>>>>>>>>> only when other HW TX ofloads are going to be enabled.
>>>>>>>>> Otherwise SW version of ip cksum calculation is used.
>>>>>>>>> That allows to use vector TX function, when inline-ipsec is not
>>>>>>>>> requested.
>>>>>>>>>
>>>>>>>>> Signed-off-by: Remy Horton <remy.horton@intel.com>
>>>>>>>>> Signed-off-by: Konstantin Ananyev <konstantin.ananyev@intel.com>
>>>>>>>>> Acked-by: Radu Nicolau <radu.nicolau@intel.com>
>>>>>>>>> ---
>>>>>>>>>       examples/ipsec-secgw/ipsec-secgw.c | 44 +++++++++++++++--------
>>>>>>>>>       examples/ipsec-secgw/ipsec.h       |  6 ++++
>>>>>>>>>       examples/ipsec-secgw/sa.c          | 56 ++++++++++++++++++++++++++++++
>>>>>>>>>       3 files changed, 91 insertions(+), 15 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
>>>>>>>>> index 1bc0b5b50..cfc2b05e5 100644
>>>>>>>>> --- a/examples/ipsec-secgw/ipsec-secgw.c
>>>>>>>>> +++ b/examples/ipsec-secgw/ipsec-secgw.c
>>>>>>>>> @@ -208,8 +208,6 @@ static struct rte_eth_conf port_conf = {
>>>>>>>>>       	},
>>>>>>>>>       	.txmode = {
>>>>>>>>>       		.mq_mode = ETH_MQ_TX_NONE,
>>>>>>>>> -		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
>>>>>>>>> -			     DEV_TX_OFFLOAD_MULTI_SEGS),
>>>>>>>> I believe this is disabling checksum offload for all cases and then
>>>>>>>> enabling only for inline crypto and inline proto.
>>>>>>> Yes.
>>>>>>>
>>>>>>>> This is breaking lookaside proto and lookaside none cases. Please
>>>>>>>> correct me if I am wrong.
>>>>>>> Why breaking?
>>>>>> reduction in performance is kind of breaking the code.
>>>>> I didn’t observe any performance drop with that patch.
>>>>> In fact there was a tiny improvement (see below).
>>>>> Did you see any regression with this patch on your HW?
>>>> NXP hardware are low -end to mid end devices and we are always
>>>> bottleneck by core cycles.
>>>> So we would like to have as much offloads to HW as possible.
>>> Ok, then I suppose we need to introduce new cmd-line options,
>>> Something like: --txoffloads=<tx_offload_mask> --rx_offloads=<rx_offload_mask>
>>> to keep everyone happy.
>>> Are you ok with that?
>> I think it should be taken from the PMD capabilities.
> Don't see how?
> Let say, Intel NICs do support HW IPv4 cksum offload, but we don't want
> to enable it on its own - only if IPsec offload is also enabled.
>  From other side you want HW IPv4 cksum offload to be always enabled
> if present.
> As I can see, to fulfill everyone needs we need to provide user ability
> to specify which HW offloads to use.
If there is no other way to resolve it, then you could probably add an
optional parameter to disable offloads when required.
I believe the default behavior should not be changed.
>
>> cmd line for every
>> parameter will make it very complex.
>>> Konstantin
>>>
>>>>>>> For cases when HW cksum offload is disabled, IPv4 cksum calculation
>>>>>>> will be done in SW, see below:
>>>>>>> prepare_tx_pkt(...)
>>>>>>> {
>>>>>>>        ...
>>>>>>>         +
>>>>>>>         +		/* calculate IPv4 cksum in SW */
>>>>>>>         +		if ((pkt->ol_flags & PKT_TX_IP_CKSUM) == 0)
>>>>>>>         +			ip->ip_sum = rte_ipv4_cksum((struct ipv4_hdr *)ip);
>>>>>>>
>>>>>>>
>>>>>>> We tested lookaside-none case quite extensively - all works well,
>>>>>>> in fact on Intel NICs it became even a bit faster because of that change
>>>>>>> (though not much).
>>>>>> yes, it may work well on one hardware, but may not perform good in other
>>>>>> hardware where cores are limited.
>>>>> Could you elaborate a bit more what do you mean by 'cores are limited' here?
>>>> we have single core devices as well on which we run ipsec-secgw.
>>>>> Do you mean that for some low end cpus calculating IPv4 cksum in SW is too expensive?
>>>> yes, limited by core cycles and not by HW
>>>>> Note that prepare_tx_pkts() and friends read/write L2/L3 packet headers anyway -
>>>>> so IPv4 header will be in L1 cache already.
>>>> Agreed, but still it will consume some cycles which are more than that
>>>> of HW.
>>>>>>> Disabling HW offloads when they are not really required has 2 benefits:
>>>>>>>      1) allows app to be run on NICs without HW offloads support.
>>>>>>>      2) allows dev_configure() for TX path to select simple/vector TX functions
>>>>>>>          which for many NICs are significantly faster.
>>>>>>>
>>>>>>> Konstantin
>>>>>>>
>>>>>>>> So a NACK for this if my understanding is correct.
>>>>>>>>
  

Patch

diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
index 1bc0b5b50..cfc2b05e5 100644
--- a/examples/ipsec-secgw/ipsec-secgw.c
+++ b/examples/ipsec-secgw/ipsec-secgw.c
@@ -208,8 +208,6 @@  static struct rte_eth_conf port_conf = {
 	},
 	.txmode = {
 		.mq_mode = ETH_MQ_TX_NONE,
-		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
-			     DEV_TX_OFFLOAD_MULTI_SEGS),
 	},
 };
 
@@ -315,7 +313,8 @@  prepare_traffic(struct rte_mbuf **pkts, struct ipsec_traffic *t,
 }
 
 static inline void
-prepare_tx_pkt(struct rte_mbuf *pkt, uint16_t port)
+prepare_tx_pkt(struct rte_mbuf *pkt, uint16_t port,
+		const struct lcore_conf *qconf)
 {
 	struct ip *ip;
 	struct ether_hdr *ethhdr;
@@ -325,14 +324,19 @@  prepare_tx_pkt(struct rte_mbuf *pkt, uint16_t port)
 	ethhdr = (struct ether_hdr *)rte_pktmbuf_prepend(pkt, ETHER_HDR_LEN);
 
 	if (ip->ip_v == IPVERSION) {
-		pkt->ol_flags |= PKT_TX_IP_CKSUM | PKT_TX_IPV4;
+		pkt->ol_flags |= qconf->outbound.ipv4_offloads;
 		pkt->l3_len = sizeof(struct ip);
 		pkt->l2_len = ETHER_HDR_LEN;
 
 		ip->ip_sum = 0;
+
+		/* calculate IPv4 cksum in SW */
+		if ((pkt->ol_flags & PKT_TX_IP_CKSUM) == 0)
+			ip->ip_sum = rte_ipv4_cksum((struct ipv4_hdr *)ip);
+
 		ethhdr->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
 	} else {
-		pkt->ol_flags |= PKT_TX_IPV6;
+		pkt->ol_flags |= qconf->outbound.ipv6_offloads;
 		pkt->l3_len = sizeof(struct ip6_hdr);
 		pkt->l2_len = ETHER_HDR_LEN;
 
@@ -346,18 +350,19 @@  prepare_tx_pkt(struct rte_mbuf *pkt, uint16_t port)
 }
 
 static inline void
-prepare_tx_burst(struct rte_mbuf *pkts[], uint16_t nb_pkts, uint16_t port)
+prepare_tx_burst(struct rte_mbuf *pkts[], uint16_t nb_pkts, uint16_t port,
+		const struct lcore_conf *qconf)
 {
 	int32_t i;
 	const int32_t prefetch_offset = 2;
 
 	for (i = 0; i < (nb_pkts - prefetch_offset); i++) {
 		rte_mbuf_prefetch_part2(pkts[i + prefetch_offset]);
-		prepare_tx_pkt(pkts[i], port);
+		prepare_tx_pkt(pkts[i], port, qconf);
 	}
 	/* Process left packets */
 	for (; i < nb_pkts; i++)
-		prepare_tx_pkt(pkts[i], port);
+		prepare_tx_pkt(pkts[i], port, qconf);
 }
 
 /* Send burst of packets on an output interface */
@@ -371,7 +376,7 @@  send_burst(struct lcore_conf *qconf, uint16_t n, uint16_t port)
 	queueid = qconf->tx_queue_id[port];
 	m_table = (struct rte_mbuf **)qconf->tx_mbufs[port].m_table;
 
-	prepare_tx_burst(m_table, n, port);
+	prepare_tx_burst(m_table, n, port, qconf);
 
 	ret = rte_eth_tx_burst(port, queueid, m_table, n);
 	if (unlikely(ret < n)) {
@@ -1543,7 +1548,7 @@  cryptodevs_init(void)
 }
 
 static void
-port_init(uint16_t portid)
+port_init(uint16_t portid, uint64_t req_rx_offloads, uint64_t req_tx_offloads)
 {
 	struct rte_eth_dev_info dev_info;
 	struct rte_eth_txconf *txconf;
@@ -1584,10 +1589,10 @@  port_init(uint16_t portid)
 		local_port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_JUMBO_FRAME;
 	}
 
-	if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_SECURITY)
-		local_port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_SECURITY;
-	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SECURITY)
-		local_port_conf.txmode.offloads |= DEV_TX_OFFLOAD_SECURITY;
+	/* Capabilities will already have been checked.. */
+	local_port_conf.rxmode.offloads |= req_rx_offloads;
+	local_port_conf.txmode.offloads |= req_tx_offloads;
+
 	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
 		local_port_conf.txmode.offloads |=
 			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
@@ -1639,6 +1644,13 @@  port_init(uint16_t portid)
 
 		qconf = &lcore_conf[lcore_id];
 		qconf->tx_queue_id[portid] = tx_queueid;
+
+		/* Pre-populate pkt offloads based on capabilities */
+		qconf->outbound.ipv4_offloads = PKT_TX_IPV4;
+		qconf->outbound.ipv6_offloads = PKT_TX_IPV6;
+		if (req_tx_offloads & DEV_TX_OFFLOAD_IPV4_CKSUM)
+			qconf->outbound.ipv4_offloads |= PKT_TX_IP_CKSUM;
+
 		tx_queueid++;
 
 		/* init RX queues */
@@ -1749,6 +1761,7 @@  main(int32_t argc, char **argv)
 	uint32_t lcore_id;
 	uint8_t socket_id;
 	uint16_t portid;
+	uint64_t req_rx_offloads, req_tx_offloads;
 
 	/* init EAL */
 	ret = rte_eal_init(argc, argv);
@@ -1804,7 +1817,8 @@  main(int32_t argc, char **argv)
 		if ((enabled_port_mask & (1 << portid)) == 0)
 			continue;
 
-		port_init(portid);
+		sa_check_offloads(portid, &req_rx_offloads, &req_tx_offloads);
+		port_init(portid, req_rx_offloads, req_tx_offloads);
 	}
 
 	cryptodevs_init();
diff --git a/examples/ipsec-secgw/ipsec.h b/examples/ipsec-secgw/ipsec.h
index c998c8076..9b1586f52 100644
--- a/examples/ipsec-secgw/ipsec.h
+++ b/examples/ipsec-secgw/ipsec.h
@@ -146,6 +146,8 @@  struct ipsec_ctx {
 	struct rte_mempool *session_pool;
 	struct rte_mbuf *ol_pkts[MAX_PKT_BURST] __rte_aligned(sizeof(void *));
 	uint16_t ol_pkts_cnt;
+	uint64_t ipv4_offloads;
+	uint64_t ipv6_offloads;
 };
 
 struct cdev_key {
@@ -239,4 +241,8 @@  sa_init(struct socket_ctx *ctx, int32_t socket_id);
 void
 rt_init(struct socket_ctx *ctx, int32_t socket_id);
 
+int
+sa_check_offloads(uint16_t port_id, uint64_t *rx_offloads,
+		uint64_t *tx_offloads);
+
 #endif /* __IPSEC_H__ */
diff --git a/examples/ipsec-secgw/sa.c b/examples/ipsec-secgw/sa.c
index d2d3550a4..ff8c4b829 100644
--- a/examples/ipsec-secgw/sa.c
+++ b/examples/ipsec-secgw/sa.c
@@ -1017,3 +1017,59 @@  outbound_sa_lookup(struct sa_ctx *sa_ctx, uint32_t sa_idx[],
 	for (i = 0; i < nb_pkts; i++)
 		sa[i] = &sa_ctx->sa[sa_idx[i]];
 }
+
+/*
+ * Select HW offloads to be used.
+ */
+int
+sa_check_offloads(uint16_t port_id, uint64_t *rx_offloads,
+		uint64_t *tx_offloads)
+{
+	struct ipsec_sa *rule;
+	uint32_t idx_sa;
+	struct rte_eth_dev_info dev_info;
+
+	rte_eth_dev_info_get(port_id, &dev_info);
+
+	*rx_offloads = 0;
+	*tx_offloads = 0;
+
+	/* Check for inbound rules that use offloads and use this port */
+	for (idx_sa = 0; idx_sa < nb_sa_in; idx_sa++) {
+		rule = &sa_in[idx_sa];
+		if ((rule->type == RTE_SECURITY_ACTION_TYPE_INLINE_CRYPTO ||
+				rule->type ==
+				RTE_SECURITY_ACTION_TYPE_INLINE_PROTOCOL)
+				&& rule->portid == port_id) {
+			if ((dev_info.rx_offload_capa & DEV_RX_OFFLOAD_SECURITY)
+					== 0) {
+				RTE_LOG(WARNING, PORT,
+					"HW RX IPSec is not supported\n");
+				return -EINVAL;
+			}
+			*rx_offloads |= DEV_RX_OFFLOAD_SECURITY;
+		}
+	}
+
+	/* Check for outbound rules that use offloads and use this port */
+	for (idx_sa = 0; idx_sa < nb_sa_out; idx_sa++) {
+		rule = &sa_out[idx_sa];
+		if ((rule->type == RTE_SECURITY_ACTION_TYPE_INLINE_CRYPTO ||
+				rule->type ==
+				RTE_SECURITY_ACTION_TYPE_INLINE_PROTOCOL)
+				&& rule->portid == port_id) {
+			if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SECURITY)
+					== 0) {
+				RTE_LOG(WARNING, PORT,
+					"HW TX IPSec is not supported\n");
+				return -EINVAL;
+			}
+			*tx_offloads |= DEV_TX_OFFLOAD_SECURITY;
+			/* Enable HW IPv4 cksum as well, if it is available */
+			if (dev_info.tx_offload_capa &
+					DEV_TX_OFFLOAD_IPV4_CKSUM)
+				*tx_offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM;
+		}
+	}
+	return 0;
+}