app/testpmd: fix l4 sw csum over multi segments

Message ID 20211015051306.320328-1-xiaoyun.li@intel.com (mailing list archive)
State Superseded, archived
Headers
Series app/testpmd: fix l4 sw csum over multi segments |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/iol-x86_64-unit-testing success Testing PASS
ci/iol-x86_64-compile-testing success Testing PASS
ci/github-robot: build success github build: passed
ci/iol-aarch64-unit-testing success Testing PASS
ci/iol-mellanox-Performance success Performance Testing PASS
ci/iol-aarch64-compile-testing success Testing PASS

Commit Message

Li, Xiaoyun Oct. 15, 2021, 5:13 a.m. UTC
  In csum forwarding mode, software UDP/TCP csum calculation only takes
the first segment into account while using the whole packet length so
the calculation will read invalid memory region with multi-segments
packets and will get wrong value.
This patch fixes this issue.

Fixes: af75078fece3 ("first public release")
Cc: stable@dpdk.org

Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
---
 app/test-pmd/csumonly.c | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)
  

Comments

David Marchand Oct. 15, 2021, 8:09 a.m. UTC | #1
Hello,

On Fri, Oct 15, 2021 at 7:27 AM Xiaoyun Li <xiaoyun.li@intel.com> wrote:
>
> In csum forwarding mode, software UDP/TCP csum calculation only takes
> the first segment into account while using the whole packet length so
> the calculation will read invalid memory region with multi-segments
> packets and will get wrong value.
> This patch fixes this issue.
>
> Fixes: af75078fece3 ("first public release")
> Cc: stable@dpdk.org
>
> Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
> ---
>  app/test-pmd/csumonly.c | 31 +++++++++++++++++++++++--------
>  1 file changed, 23 insertions(+), 8 deletions(-)
>
> diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
> index 090797318a..5df3be0a6f 100644
> --- a/app/test-pmd/csumonly.c
> +++ b/app/test-pmd/csumonly.c
> @@ -18,7 +18,7 @@
>  #include <rte_log.h>
>  #include <rte_debug.h>
>  #include <rte_cycles.h>
> -#include <rte_memory.h>
> +#include <rte_malloc.h>

This include caught my eye.


>  #include <rte_memcpy.h>
>  #include <rte_launch.h>
>  #include <rte_eal.h>
> @@ -56,6 +56,11 @@
>  #define GRE_SUPPORTED_FIELDS   (GRE_CHECKSUM_PRESENT | GRE_KEY_PRESENT |\
>                                  GRE_SEQUENCE_PRESENT)
>
> +/* When UDP or TCP or outer UDP csum offload is off, sw l4 csum is needed */
> +#define UDP_TCP_CSUM            (DEV_TX_OFFLOAD_UDP_CKSUM |\
> +                                DEV_TX_OFFLOAD_TCP_CKSUM |\
> +                                DEV_TX_OFFLOAD_OUTER_UDP_CKSUM)
> +
>  /* We cannot use rte_cpu_to_be_16() on a constant in a switch/case */
>  #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
>  #define _htons(x) ((uint16_t)((((x) & 0x00ffU) << 8) | (((x) & 0xff00U) >> 8)))
> @@ -602,12 +607,8 @@ process_outer_cksums(void *outer_l3_hdr, struct testpmd_offload_info *info,
>         /* do not recalculate udp cksum if it was 0 */
>         if (udp_hdr->dgram_cksum != 0) {
>                 udp_hdr->dgram_cksum = 0;
> -               if (info->outer_ethertype == _htons(RTE_ETHER_TYPE_IPV4))
> -                       udp_hdr->dgram_cksum =
> -                               rte_ipv4_udptcp_cksum(ipv4_hdr, udp_hdr);
> -               else
> -                       udp_hdr->dgram_cksum =
> -                               rte_ipv6_udptcp_cksum(ipv6_hdr, udp_hdr);
> +               udp_hdr->dgram_cksum = get_udptcp_checksum(outer_l3_hdr,
> +                                       udp_hdr, info->outer_ethertype);
>         }
>
>         return ol_flags;
> @@ -802,6 +803,7 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
>         struct rte_mbuf *m, *p;
>         struct rte_ether_hdr *eth_hdr;
>         void *l3_hdr = NULL, *outer_l3_hdr = NULL; /* can be IPv4 or IPv6 */
> +       uint8_t *l3_buf = NULL;
>         void **gro_ctx;
>         uint16_t gro_pkts_num;
>         uint8_t gro_enable;
> @@ -877,7 +879,19 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
>                 rte_ether_addr_copy(&ports[fs->tx_port].eth_addr,
>                                 &eth_hdr->src_addr);
>                 parse_ethernet(eth_hdr, &info);
> -               l3_hdr = (char *)eth_hdr + info.l2_len;
> +               /* When sw csum is needed, multi-segs needs a buf to contain
> +                * the whole packet for later UDP/TCP csum calculation.
> +                */
> +               if (m->nb_segs > 1 && !(tx_ol_flags & PKT_TX_TCP_SEG) &&
> +                   !(tx_offloads & UDP_TCP_CSUM)) {
> +                       l3_buf = rte_zmalloc("csum l3_buf",
> +                                            info.pkt_len - info.l2_len,
> +                                            RTE_CACHE_LINE_SIZE);

Rather than call a dyn allocation in datapath, can't we have a static
buffer on the stack?


> +                       rte_pktmbuf_read(m, info.l2_len,
> +                                        info.pkt_len - info.l2_len, l3_buf);
> +                       l3_hdr = l3_buf;
> +               } else
> +                       l3_hdr = (char *)eth_hdr + info.l2_len;
>
>                 /* check if it's a supported tunnel */
>                 if (txp->parse_tunnel) {
> @@ -1051,6 +1065,7 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
>                         printf("tx: flags=%s", buf);
>                         printf("\n");
>                 }
> +               rte_free(l3_buf);
>         }
>
>         if (unlikely(gro_enable)) {
> --
> 2.25.1
>
  
Li, Xiaoyun Oct. 18, 2021, 2:02 a.m. UTC | #2
Hi

> -----Original Message-----
> From: David Marchand <david.marchand@redhat.com>
> Sent: Friday, October 15, 2021 16:10
> To: Li, Xiaoyun <xiaoyun.li@intel.com>
> Cc: Yigit, Ferruh <ferruh.yigit@intel.com>; dev <dev@dpdk.org>; dpdk stable
> <stable@dpdk.org>
> Subject: Re: [dpdk-dev] [PATCH] app/testpmd: fix l4 sw csum over multi
> segments
> 
> Hello,
> 
> On Fri, Oct 15, 2021 at 7:27 AM Xiaoyun Li <xiaoyun.li@intel.com> wrote:
> >
> > In csum forwarding mode, software UDP/TCP csum calculation only takes
> > the first segment into account while using the whole packet length so
> > the calculation will read invalid memory region with multi-segments
> > packets and will get wrong value.
> > This patch fixes this issue.
> >
> > Fixes: af75078fece3 ("first public release")
> > Cc: stable@dpdk.org
> >
> > Signed-off-by: Xiaoyun Li <xiaoyun.li@intel.com>
> > ---
> >  app/test-pmd/csumonly.c | 31 +++++++++++++++++++++++--------
> >  1 file changed, 23 insertions(+), 8 deletions(-)
> >
> > diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c index
> > 090797318a..5df3be0a6f 100644
> > --- a/app/test-pmd/csumonly.c
> > +++ b/app/test-pmd/csumonly.c
> > @@ -18,7 +18,7 @@
> >  #include <rte_log.h>
> >  #include <rte_debug.h>
> >  #include <rte_cycles.h>
> > -#include <rte_memory.h>
> > +#include <rte_malloc.h>
> 
> This include caught my eye.
> 
> 
> >  #include <rte_memcpy.h>
> >  #include <rte_launch.h>
> >  #include <rte_eal.h>
> > @@ -56,6 +56,11 @@
> >  #define GRE_SUPPORTED_FIELDS   (GRE_CHECKSUM_PRESENT |
> GRE_KEY_PRESENT |\
> >                                  GRE_SEQUENCE_PRESENT)
> >
> > +/* When UDP or TCP or outer UDP csum offload is off, sw l4 csum is needed
> */
> > +#define UDP_TCP_CSUM            (DEV_TX_OFFLOAD_UDP_CKSUM |\
> > +                                DEV_TX_OFFLOAD_TCP_CKSUM |\
> > +                                DEV_TX_OFFLOAD_OUTER_UDP_CKSUM)
> > +
> >  /* We cannot use rte_cpu_to_be_16() on a constant in a switch/case */
> > #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN  #define _htons(x)
> > ((uint16_t)((((x) & 0x00ffU) << 8) | (((x) & 0xff00U) >> 8))) @@
> > -602,12 +607,8 @@ process_outer_cksums(void *outer_l3_hdr, struct
> testpmd_offload_info *info,
> >         /* do not recalculate udp cksum if it was 0 */
> >         if (udp_hdr->dgram_cksum != 0) {
> >                 udp_hdr->dgram_cksum = 0;
> > -               if (info->outer_ethertype == _htons(RTE_ETHER_TYPE_IPV4))
> > -                       udp_hdr->dgram_cksum =
> > -                               rte_ipv4_udptcp_cksum(ipv4_hdr, udp_hdr);
> > -               else
> > -                       udp_hdr->dgram_cksum =
> > -                               rte_ipv6_udptcp_cksum(ipv6_hdr, udp_hdr);
> > +               udp_hdr->dgram_cksum = get_udptcp_checksum(outer_l3_hdr,
> > +                                       udp_hdr,
> > + info->outer_ethertype);
> >         }
> >
> >         return ol_flags;
> > @@ -802,6 +803,7 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
> >         struct rte_mbuf *m, *p;
> >         struct rte_ether_hdr *eth_hdr;
> >         void *l3_hdr = NULL, *outer_l3_hdr = NULL; /* can be IPv4 or
> > IPv6 */
> > +       uint8_t *l3_buf = NULL;
> >         void **gro_ctx;
> >         uint16_t gro_pkts_num;
> >         uint8_t gro_enable;
> > @@ -877,7 +879,19 @@ pkt_burst_checksum_forward(struct fwd_stream *fs)
> >                 rte_ether_addr_copy(&ports[fs->tx_port].eth_addr,
> >                                 &eth_hdr->src_addr);
> >                 parse_ethernet(eth_hdr, &info);
> > -               l3_hdr = (char *)eth_hdr + info.l2_len;
> > +               /* When sw csum is needed, multi-segs needs a buf to contain
> > +                * the whole packet for later UDP/TCP csum calculation.
> > +                */
> > +               if (m->nb_segs > 1 && !(tx_ol_flags & PKT_TX_TCP_SEG) &&
> > +                   !(tx_offloads & UDP_TCP_CSUM)) {
> > +                       l3_buf = rte_zmalloc("csum l3_buf",
> > +                                            info.pkt_len - info.l2_len,
> > +                                            RTE_CACHE_LINE_SIZE);
> 
> Rather than call a dyn allocation in datapath, can't we have a static buffer on
> the stack?

I wanted to do that. But the issue only happens when it's a large packet. Each hw has its own limitation on max packet size but it grows fast.
I'm not sure how large array should I use. 64K? Since total length in IP hdr is 16 bit.

BRs
Xiaoyun

> 
> 
> > +                       rte_pktmbuf_read(m, info.l2_len,
> > +                                        info.pkt_len - info.l2_len, l3_buf);
> > +                       l3_hdr = l3_buf;
> > +               } else
> > +                       l3_hdr = (char *)eth_hdr + info.l2_len;
> >
> >                 /* check if it's a supported tunnel */
> >                 if (txp->parse_tunnel) { @@ -1051,6 +1065,7 @@
> > pkt_burst_checksum_forward(struct fwd_stream *fs)
> >                         printf("tx: flags=%s", buf);
> >                         printf("\n");
> >                 }
> > +               rte_free(l3_buf);
> >         }
> >
> >         if (unlikely(gro_enable)) {
> > --
> > 2.25.1
> >
> 
> 
> --
> David Marchand
  
Stephen Hemminger Oct. 18, 2021, 3 a.m. UTC | #3
On Fri, 15 Oct 2021 13:13:06 +0800
Xiaoyun Li <xiaoyun.li@intel.com> wrote:

> +		/* When sw csum is needed, multi-segs needs a buf to contain
> +		 * the whole packet for later UDP/TCP csum calculation.
> +		 */
> +		if (m->nb_segs > 1 && !(tx_ol_flags & PKT_TX_TCP_SEG) &&
> +		    !(tx_offloads & UDP_TCP_CSUM)) {
> +			l3_buf = rte_zmalloc("csum l3_buf",
> +					     info.pkt_len - info.l2_len,
> +					     RTE_CACHE_LINE_SIZE);
> +			rte_pktmbuf_read(m, info.l2_len,
> +					 info.pkt_len - info.l2_len, l3_buf);
> +			l3_hdr = l3_buf;
> +		} else
> +			l3_hdr = (char *)eth_hdr + info.l2_len;
>  

Rather than copying whole packet, make the code handle checksum streaming.
  
Li, Xiaoyun Oct. 18, 2021, 3:16 a.m. UTC | #4
Hi

> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: Monday, October 18, 2021 11:00
> To: Li, Xiaoyun <xiaoyun.li@intel.com>
> Cc: Yigit, Ferruh <ferruh.yigit@intel.com>; dev@dpdk.org; stable@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] app/testpmd: fix l4 sw csum over multi
> segments
> 
> On Fri, 15 Oct 2021 13:13:06 +0800
> Xiaoyun Li <xiaoyun.li@intel.com> wrote:
> 
> > +		/* When sw csum is needed, multi-segs needs a buf to contain
> > +		 * the whole packet for later UDP/TCP csum calculation.
> > +		 */
> > +		if (m->nb_segs > 1 && !(tx_ol_flags & PKT_TX_TCP_SEG) &&
> > +		    !(tx_offloads & UDP_TCP_CSUM)) {
> > +			l3_buf = rte_zmalloc("csum l3_buf",
> > +					     info.pkt_len - info.l2_len,
> > +					     RTE_CACHE_LINE_SIZE);
> > +			rte_pktmbuf_read(m, info.l2_len,
> > +					 info.pkt_len - info.l2_len, l3_buf);
> > +			l3_hdr = l3_buf;
> > +		} else
> > +			l3_hdr = (char *)eth_hdr + info.l2_len;
> >
> 
> Rather than copying whole packet, make the code handle checksum streaming.

Copying is the easiest way to do this.

The problem of handling checksum streaming is that in the first segment, l2 and l3 hdr len is 14 bytes when checksum takes 4 bytes each time.
If the datalen of the first segment is 4 bytes aligned (usual case), for the second segment and the following segments, they may need to add a special 2 bytes 0x0 at the start.
Also, mbuf is not passed down to process_inner/outer_chksum so the change will be a lot.
  
Li, Xiaoyun Oct. 18, 2021, 4:40 a.m. UTC | #5
> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Li, Xiaoyun
> Sent: Monday, October 18, 2021 11:17
> To: Stephen Hemminger <stephen@networkplumber.org>
> Cc: Yigit, Ferruh <ferruh.yigit@intel.com>; dev@dpdk.org; stable@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH] app/testpmd: fix l4 sw csum over multi
> segments
> 
> Hi
> 
> > -----Original Message-----
> > From: Stephen Hemminger <stephen@networkplumber.org>
> > Sent: Monday, October 18, 2021 11:00
> > To: Li, Xiaoyun <xiaoyun.li@intel.com>
> > Cc: Yigit, Ferruh <ferruh.yigit@intel.com>; dev@dpdk.org;
> > stable@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH] app/testpmd: fix l4 sw csum over multi
> > segments
> >
> > On Fri, 15 Oct 2021 13:13:06 +0800
> > Xiaoyun Li <xiaoyun.li@intel.com> wrote:
> >
> > > +		/* When sw csum is needed, multi-segs needs a buf to contain
> > > +		 * the whole packet for later UDP/TCP csum calculation.
> > > +		 */
> > > +		if (m->nb_segs > 1 && !(tx_ol_flags & PKT_TX_TCP_SEG) &&
> > > +		    !(tx_offloads & UDP_TCP_CSUM)) {
> > > +			l3_buf = rte_zmalloc("csum l3_buf",
> > > +					     info.pkt_len - info.l2_len,
> > > +					     RTE_CACHE_LINE_SIZE);
> > > +			rte_pktmbuf_read(m, info.l2_len,
> > > +					 info.pkt_len - info.l2_len, l3_buf);
> > > +			l3_hdr = l3_buf;
> > > +		} else
> > > +			l3_hdr = (char *)eth_hdr + info.l2_len;
> > >
> >
> > Rather than copying whole packet, make the code handle checksum streaming.
> 
> Copying is the easiest way to do this.
> 
> The problem of handling checksum streaming is that in the first segment, l2 and
> l3 hdr len is 14 bytes when checksum takes 4 bytes each time.
> If the datalen of the first segment is 4 bytes aligned (usual case), for the second
> segment and the following segments, they may need to add a special 2 bytes
> 0x0 at the start.
> Also, mbuf is not passed down to process_inner/outer_chksum so the change
> will be a lot.

Also, rte_ipv4/6_udptcp_cksum can't be directly called. Because it only takes ip_hdr and whole packet buffer as input.
  
Ananyev, Konstantin Oct. 18, 2021, 10:15 a.m. UTC | #6
> > > +		/* When sw csum is needed, multi-segs needs a buf to contain
> > > +		 * the whole packet for later UDP/TCP csum calculation.
> > > +		 */
> > > +		if (m->nb_segs > 1 && !(tx_ol_flags & PKT_TX_TCP_SEG) &&
> > > +		    !(tx_offloads & UDP_TCP_CSUM)) {
> > > +			l3_buf = rte_zmalloc("csum l3_buf",
> > > +					     info.pkt_len - info.l2_len,
> > > +					     RTE_CACHE_LINE_SIZE);
> > > +			rte_pktmbuf_read(m, info.l2_len,
> > > +					 info.pkt_len - info.l2_len, l3_buf);
> > > +			l3_hdr = l3_buf;
> > > +		} else
> > > +			l3_hdr = (char *)eth_hdr + info.l2_len;
> > >
> >
> > Rather than copying whole packet, make the code handle checksum streaming.
> 
> Copying is the easiest way to do this.
> 
> The problem of handling checksum streaming is that in the first segment, l2 and l3 hdr len is 14 bytes when checksum takes 4 bytes each
> time.
> If the datalen of the first segment is 4 bytes aligned (usual case), for the second segment and the following segments, they may need to add
> a special 2 bytes 0x0 at the start.

Didn't understand that one...
Why you suddenly need to pad non-first segments with zeroes?
Why simply rte_raw_cksum() can't be used for multi-seg case?

> Also, mbuf is not passed down to process_inner/outer_chksum so the change will be a lot.

I also think that copying whole packet just to calculate a checksum - way too much overhead.
  
Li, Xiaoyun Oct. 19, 2021, 1:54 a.m. UTC | #7
> -----Original Message-----
> From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Sent: Monday, October 18, 2021 18:16
> To: Li, Xiaoyun <xiaoyun.li@intel.com>; Stephen Hemminger
> <stephen@networkplumber.org>
> Cc: Yigit, Ferruh <ferruh.yigit@intel.com>; dev@dpdk.org; stable@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH] app/testpmd: fix l4 sw csum over multi
> segments
> 
> 
> > > > +		/* When sw csum is needed, multi-segs needs a buf to contain
> > > > +		 * the whole packet for later UDP/TCP csum calculation.
> > > > +		 */
> > > > +		if (m->nb_segs > 1 && !(tx_ol_flags & PKT_TX_TCP_SEG) &&
> > > > +		    !(tx_offloads & UDP_TCP_CSUM)) {
> > > > +			l3_buf = rte_zmalloc("csum l3_buf",
> > > > +					     info.pkt_len - info.l2_len,
> > > > +					     RTE_CACHE_LINE_SIZE);
> > > > +			rte_pktmbuf_read(m, info.l2_len,
> > > > +					 info.pkt_len - info.l2_len, l3_buf);
> > > > +			l3_hdr = l3_buf;
> > > > +		} else
> > > > +			l3_hdr = (char *)eth_hdr + info.l2_len;
> > > >
> > >
> > > Rather than copying whole packet, make the code handle checksum
> streaming.
> >
> > Copying is the easiest way to do this.
> >
> > The problem of handling checksum streaming is that in the first
> > segment, l2 and l3 hdr len is 14 bytes when checksum takes 4 bytes each time.
> > If the datalen of the first segment is 4 bytes aligned (usual case),
> > for the second segment and the following segments, they may need to add a
> special 2 bytes 0x0 at the start.
> 
> Didn't understand that one...
> Why you suddenly need to pad non-first segments with zeroes?
> Why simply rte_raw_cksum() can't be used for multi-seg case?

Normal udp/tcp packets:
The first segment: eth hdr + ip hdr + udp/tcp packet (The total length of this is mbuf data len so like 2048, 4 bytes aligned)
The second segment: continue udp/tcp packet

Now, udp/tcp checksum is calculated. It will take the whole udp/tcp packet. 4 bytes + 4 bytes + 4 bytes...
Then
1st segment: udp/tcp packet (size = 2048 - 14 = 2034, not 4 bytes aligned, 2 bytes left, if use rte_raw_cksum(), the last 2 bytes will be combined with 2 bytes zeros)
2nd segment: continue udp/tcp packet (size = data_len)

For 2nd segment, if don't add 2 bytes zeros first, the checksum value will be wrong.
Because it should be for example 0x1234 (0x12 is left in 1st, 0x34 is in 2nd), 0x1200+0x0034 is correct but 0x1200+0x3400 is not correct.

That's why I think all of the following segments needs zero padding first.

And above is only the usual case of normal tcp/udp packets. The issue also exists for tunnel packets which will calculate outer udp and inner udp/tcp checksum.

> 
> > Also, mbuf is not passed down to process_inner/outer_chksum so the change
> will be a lot.
> 
> I also think that copying whole packet just to calculate a checksum - way too
> much overhead.

Yes. I agree. But it only happens when users don't enable checksum offload, don't enable TSO and the packet crosses multi-segments.
  

Patch

diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
index 090797318a..5df3be0a6f 100644
--- a/app/test-pmd/csumonly.c
+++ b/app/test-pmd/csumonly.c
@@ -18,7 +18,7 @@ 
 #include <rte_log.h>
 #include <rte_debug.h>
 #include <rte_cycles.h>
-#include <rte_memory.h>
+#include <rte_malloc.h>
 #include <rte_memcpy.h>
 #include <rte_launch.h>
 #include <rte_eal.h>
@@ -56,6 +56,11 @@ 
 #define GRE_SUPPORTED_FIELDS	(GRE_CHECKSUM_PRESENT | GRE_KEY_PRESENT |\
 				 GRE_SEQUENCE_PRESENT)
 
+/* When UDP or TCP or outer UDP csum offload is off, sw l4 csum is needed */
+#define UDP_TCP_CSUM            (DEV_TX_OFFLOAD_UDP_CKSUM |\
+				 DEV_TX_OFFLOAD_TCP_CKSUM |\
+				 DEV_TX_OFFLOAD_OUTER_UDP_CKSUM)
+
 /* We cannot use rte_cpu_to_be_16() on a constant in a switch/case */
 #if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
 #define _htons(x) ((uint16_t)((((x) & 0x00ffU) << 8) | (((x) & 0xff00U) >> 8)))
@@ -602,12 +607,8 @@  process_outer_cksums(void *outer_l3_hdr, struct testpmd_offload_info *info,
 	/* do not recalculate udp cksum if it was 0 */
 	if (udp_hdr->dgram_cksum != 0) {
 		udp_hdr->dgram_cksum = 0;
-		if (info->outer_ethertype == _htons(RTE_ETHER_TYPE_IPV4))
-			udp_hdr->dgram_cksum =
-				rte_ipv4_udptcp_cksum(ipv4_hdr, udp_hdr);
-		else
-			udp_hdr->dgram_cksum =
-				rte_ipv6_udptcp_cksum(ipv6_hdr, udp_hdr);
+		udp_hdr->dgram_cksum = get_udptcp_checksum(outer_l3_hdr,
+					udp_hdr, info->outer_ethertype);
 	}
 
 	return ol_flags;
@@ -802,6 +803,7 @@  pkt_burst_checksum_forward(struct fwd_stream *fs)
 	struct rte_mbuf *m, *p;
 	struct rte_ether_hdr *eth_hdr;
 	void *l3_hdr = NULL, *outer_l3_hdr = NULL; /* can be IPv4 or IPv6 */
+	uint8_t *l3_buf = NULL;
 	void **gro_ctx;
 	uint16_t gro_pkts_num;
 	uint8_t gro_enable;
@@ -877,7 +879,19 @@  pkt_burst_checksum_forward(struct fwd_stream *fs)
 		rte_ether_addr_copy(&ports[fs->tx_port].eth_addr,
 				&eth_hdr->src_addr);
 		parse_ethernet(eth_hdr, &info);
-		l3_hdr = (char *)eth_hdr + info.l2_len;
+		/* When sw csum is needed, multi-segs needs a buf to contain
+		 * the whole packet for later UDP/TCP csum calculation.
+		 */
+		if (m->nb_segs > 1 && !(tx_ol_flags & PKT_TX_TCP_SEG) &&
+		    !(tx_offloads & UDP_TCP_CSUM)) {
+			l3_buf = rte_zmalloc("csum l3_buf",
+					     info.pkt_len - info.l2_len,
+					     RTE_CACHE_LINE_SIZE);
+			rte_pktmbuf_read(m, info.l2_len,
+					 info.pkt_len - info.l2_len, l3_buf);
+			l3_hdr = l3_buf;
+		} else
+			l3_hdr = (char *)eth_hdr + info.l2_len;
 
 		/* check if it's a supported tunnel */
 		if (txp->parse_tunnel) {
@@ -1051,6 +1065,7 @@  pkt_burst_checksum_forward(struct fwd_stream *fs)
 			printf("tx: flags=%s", buf);
 			printf("\n");
 		}
+		rte_free(l3_buf);
 	}
 
 	if (unlikely(gro_enable)) {