app/testpmd: improve MAC swap performance

Message ID 20181120044537.9495-1-qi.z.zhang@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Headers
Series app/testpmd: improve MAC swap performance |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK
ci/mellanox-Performance-Testing success Performance Testing PASS
ci/intel-Performance-Testing success Performance Testing PASS

Commit Message

Qi Zhang Nov. 20, 2018, 4:45 a.m. UTC
  The patch optimizes the mac swap operation by taking advantage
of SSE instructions, it only impacts x86 platform.

Cc: stable@dpdk.org

Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
---
 app/test-pmd/macswap.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)
  

Comments

Ananyev, Konstantin Nov. 20, 2018, 9:16 a.m. UTC | #1
Hi Qi,

> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Qi Zhang
> Sent: Tuesday, November 20, 2018 4:46 AM
> To: Richardson, Bruce <bruce.richardson@intel.com>; Wiles, Keith <keith.wiles@intel.com>
> Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger, Bernard <bernard.iremonger@intel.com>; Zhang, Qi Z
> <qi.z.zhang@intel.com>; stable@dpdk.org
> Subject: [dpdk-dev] [PATCH] app/testpmd: improve MAC swap performance
> 
> The patch optimizes the mac swap operation by taking advantage
> of SSE instructions, it only impacts x86 platform.
> 
> Cc: stable@dpdk.org
> 
> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> ---
>  app/test-pmd/macswap.c | 16 +++++++++++++++-
>  1 file changed, 15 insertions(+), 1 deletion(-)
> 
> diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
> index a8384d5b8..0722782b0 100644
> --- a/app/test-pmd/macswap.c
> +++ b/app/test-pmd/macswap.c
> @@ -78,7 +78,6 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
>  	struct rte_port  *txp;
>  	struct rte_mbuf  *mb;
>  	struct ether_hdr *eth_hdr;
> -	struct ether_addr addr;
>  	uint16_t nb_rx;
>  	uint16_t nb_tx;
>  	uint16_t i;
> @@ -95,6 +94,15 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
>  	start_tsc = rte_rdtsc();
>  #endif
> 
> +#ifdef RTE_ARCH_X86
> +	__m128i addr;
> +	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
> +					5, 4, 3, 2,
> +					1, 0, 11, 10,
> +					9, 8, 7, 6);
> +#else
> +	struct ether_addr addr;
> +#endif

I think it would better to place IA specific code into a separate fnction
(and probably into a separate .h file).
BTW, just curious what % of improvement it gives?
Konstantin


>  	/*
>  	 * Receive a burst of packets and forward them.
>  	 */
> @@ -123,9 +131,15 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
>  		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
> 
>  		/* Swap dest and src mac addresses. */
> +#ifdef RTE_ARCH_X86
> +		addr = _mm_loadu_si128((__m128i *)eth_hdr);
> +		addr = _mm_shuffle_epi8(addr, shfl_msk);
> +		_mm_storeu_si128((__m128i *)eth_hdr, addr);
> +#else
>  		ether_addr_copy(&eth_hdr->d_addr, &addr);
>  		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
>  		ether_addr_copy(&addr, &eth_hdr->s_addr);
> +#endif
> 
>  		mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
>  		mb->ol_flags |= ol_flags;
> --
> 2.13.6
  
Wiles, Keith Nov. 20, 2018, 2:48 p.m. UTC | #2
> On Nov 20, 2018, at 3:16 AM, Ananyev, Konstantin <konstantin.ananyev@intel.com> wrote:
> 
> Hi Qi,
> 
>> -----Original Message-----
>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Qi Zhang
>> Sent: Tuesday, November 20, 2018 4:46 AM
>> To: Richardson, Bruce <bruce.richardson@intel.com>; Wiles, Keith <keith.wiles@intel.com>
>> Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger, Bernard <bernard.iremonger@intel.com>; Zhang, Qi Z
>> <qi.z.zhang@intel.com>; stable@dpdk.org
>> Subject: [dpdk-dev] [PATCH] app/testpmd: improve MAC swap performance
>> 
>> The patch optimizes the mac swap operation by taking advantage
>> of SSE instructions, it only impacts x86 platform.
>> 
>> Cc: stable@dpdk.org
>> 
>> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
>> ---
>> app/test-pmd/macswap.c | 16 +++++++++++++++-
>> 1 file changed, 15 insertions(+), 1 deletion(-)
>> 
>> diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
>> index a8384d5b8..0722782b0 100644
>> --- a/app/test-pmd/macswap.c
>> +++ b/app/test-pmd/macswap.c
>> @@ -78,7 +78,6 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
>> 	struct rte_port  *txp;
>> 	struct rte_mbuf  *mb;
>> 	struct ether_hdr *eth_hdr;
>> -	struct ether_addr addr;
>> 	uint16_t nb_rx;
>> 	uint16_t nb_tx;
>> 	uint16_t i;
>> @@ -95,6 +94,15 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
>> 	start_tsc = rte_rdtsc();
>> #endif
>> 
>> +#ifdef RTE_ARCH_X86
>> +	__m128i addr;
>> +	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
>> +					5, 4, 3, 2,
>> +					1, 0, 11, 10,
>> +					9, 8, 7, 6);
>> +#else
>> +	struct ether_addr addr;
>> +#endif
> 
> I think it would better to place IA specific code into a separate fnction
> (and probably into a separate .h file).
> BTW, just curious what % of improvement it gives?
> Konstantin
> 

If we are going to the trouble of moving the code out of the .c into a .h then I suggest it be placed into the rte_ethdev headers for everyone to use.
> 
>> 	/*
>> 	 * Receive a burst of packets and forward them.
>> 	 */
>> @@ -123,9 +131,15 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
>> 		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
>> 
>> 		/* Swap dest and src mac addresses. */
>> +#ifdef RTE_ARCH_X86
>> +		addr = _mm_loadu_si128((__m128i *)eth_hdr);
>> +		addr = _mm_shuffle_epi8(addr, shfl_msk);
>> +		_mm_storeu_si128((__m128i *)eth_hdr, addr);
>> +#else
>> 		ether_addr_copy(&eth_hdr->d_addr, &addr);
>> 		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
>> 		ether_addr_copy(&addr, &eth_hdr->s_addr);
>> +#endif
>> 
>> 		mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
>> 		mb->ol_flags |= ol_flags;
>> --
>> 2.13.6

Regards,
Keith
  
Qi Zhang Nov. 20, 2018, 4:58 p.m. UTC | #3
> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Tuesday, November 20, 2018 1:17 AM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Wiles, Keith <keith.wiles@intel.com>
> Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger, Bernard
> <bernard.iremonger@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>;
> stable@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH] app/testpmd: improve MAC swap performance
> 
> Hi Qi,
> 
> > -----Original Message-----
> > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Qi Zhang
> > Sent: Tuesday, November 20, 2018 4:46 AM
> > To: Richardson, Bruce <bruce.richardson@intel.com>; Wiles, Keith
> > <keith.wiles@intel.com>
> > Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger,
> > Bernard <bernard.iremonger@intel.com>; Zhang, Qi Z
> > <qi.z.zhang@intel.com>; stable@dpdk.org
> > Subject: [dpdk-dev] [PATCH] app/testpmd: improve MAC swap performance
> >
> > The patch optimizes the mac swap operation by taking advantage of SSE
> > instructions, it only impacts x86 platform.
> >
> > Cc: stable@dpdk.org
> >
> > Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> > ---
> >  app/test-pmd/macswap.c | 16 +++++++++++++++-
> >  1 file changed, 15 insertions(+), 1 deletion(-)
> >
> > diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c index
> > a8384d5b8..0722782b0 100644
> > --- a/app/test-pmd/macswap.c
> > +++ b/app/test-pmd/macswap.c
> > @@ -78,7 +78,6 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> >  	struct rte_port  *txp;
> >  	struct rte_mbuf  *mb;
> >  	struct ether_hdr *eth_hdr;
> > -	struct ether_addr addr;
> >  	uint16_t nb_rx;
> >  	uint16_t nb_tx;
> >  	uint16_t i;
> > @@ -95,6 +94,15 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> >  	start_tsc = rte_rdtsc();
> >  #endif
> >
> > +#ifdef RTE_ARCH_X86
> > +	__m128i addr;
> > +	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
> > +					5, 4, 3, 2,
> > +					1, 0, 11, 10,
> > +					9, 8, 7, 6);
> > +#else
> > +	struct ether_addr addr;
> > +#endif
> 
> I think it would better to place IA specific code into a separate fnction (and
> probably into a separate .h file).

OK, I will think about how to rework this.

> BTW, just curious what % of improvement it gives?

So far , the only server I can test is a 1.6GHz Broadwell server with 2 ports on 1 i40e 25G.
The macswap performance is increase from 16.8mpps to 20mpps (about 19% improvement)
	
> Konstantin
> 
> 
> >  	/*
> >  	 * Receive a burst of packets and forward them.
> >  	 */
> > @@ -123,9 +131,15 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> >  		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
> >
> >  		/* Swap dest and src mac addresses. */
> > +#ifdef RTE_ARCH_X86
> > +		addr = _mm_loadu_si128((__m128i *)eth_hdr);
> > +		addr = _mm_shuffle_epi8(addr, shfl_msk);
> > +		_mm_storeu_si128((__m128i *)eth_hdr, addr); #else
> >  		ether_addr_copy(&eth_hdr->d_addr, &addr);
> >  		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
> >  		ether_addr_copy(&addr, &eth_hdr->s_addr);
> > +#endif
> >
> >  		mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
> >  		mb->ol_flags |= ol_flags;
> > --
> > 2.13.6
  
Ananyev, Konstantin Nov. 20, 2018, 5:26 p.m. UTC | #4
> -----Original Message-----
> From: Zhang, Qi Z
> Sent: Tuesday, November 20, 2018 4:58 PM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>; Wiles, Keith
> <keith.wiles@intel.com>
> Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger, Bernard <bernard.iremonger@intel.com>; stable@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH] app/testpmd: improve MAC swap performance
> 
> 
> 
> > -----Original Message-----
> > From: Ananyev, Konstantin
> > Sent: Tuesday, November 20, 2018 1:17 AM
> > To: Zhang, Qi Z <qi.z.zhang@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>; Wiles, Keith <keith.wiles@intel.com>
> > Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger, Bernard
> > <bernard.iremonger@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>;
> > stable@dpdk.org
> > Subject: RE: [dpdk-dev] [PATCH] app/testpmd: improve MAC swap performance
> >
> > Hi Qi,
> >
> > > -----Original Message-----
> > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Qi Zhang
> > > Sent: Tuesday, November 20, 2018 4:46 AM
> > > To: Richardson, Bruce <bruce.richardson@intel.com>; Wiles, Keith
> > > <keith.wiles@intel.com>
> > > Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger,
> > > Bernard <bernard.iremonger@intel.com>; Zhang, Qi Z
> > > <qi.z.zhang@intel.com>; stable@dpdk.org
> > > Subject: [dpdk-dev] [PATCH] app/testpmd: improve MAC swap performance
> > >
> > > The patch optimizes the mac swap operation by taking advantage of SSE
> > > instructions, it only impacts x86 platform.
> > >
> > > Cc: stable@dpdk.org
> > >
> > > Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> > > ---
> > >  app/test-pmd/macswap.c | 16 +++++++++++++++-
> > >  1 file changed, 15 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c index
> > > a8384d5b8..0722782b0 100644
> > > --- a/app/test-pmd/macswap.c
> > > +++ b/app/test-pmd/macswap.c
> > > @@ -78,7 +78,6 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> > >  	struct rte_port  *txp;
> > >  	struct rte_mbuf  *mb;
> > >  	struct ether_hdr *eth_hdr;
> > > -	struct ether_addr addr;
> > >  	uint16_t nb_rx;
> > >  	uint16_t nb_tx;
> > >  	uint16_t i;
> > > @@ -95,6 +94,15 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> > >  	start_tsc = rte_rdtsc();
> > >  #endif
> > >
> > > +#ifdef RTE_ARCH_X86
> > > +	__m128i addr;
> > > +	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
> > > +					5, 4, 3, 2,
> > > +					1, 0, 11, 10,
> > > +					9, 8, 7, 6);
> > > +#else
> > > +	struct ether_addr addr;
> > > +#endif
> >
> > I think it would better to place IA specific code into a separate fnction (and
> > probably into a separate .h file).
> 
> OK, I will think about how to rework this.

Ideally would be good to have an generic one, and IA optimized version.

> 
> > BTW, just curious what % of improvement it gives?
> 
> So far , the only server I can test is a 1.6GHz Broadwell server with 2 ports on 1 i40e 25G.
> The macswap performance is increase from 16.8mpps to 20mpps (about 19% improvement)

Quite a lot, definitely looks like worth it.

> 
> > Konstantin
> >
> >
> > >  	/*
> > >  	 * Receive a burst of packets and forward them.
> > >  	 */
> > > @@ -123,9 +131,15 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> > >  		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
> > >
> > >  		/* Swap dest and src mac addresses. */
> > > +#ifdef RTE_ARCH_X86
> > > +		addr = _mm_loadu_si128((__m128i *)eth_hdr);
> > > +		addr = _mm_shuffle_epi8(addr, shfl_msk);
> > > +		_mm_storeu_si128((__m128i *)eth_hdr, addr); #else
> > >  		ether_addr_copy(&eth_hdr->d_addr, &addr);
> > >  		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
> > >  		ether_addr_copy(&addr, &eth_hdr->s_addr);
> > > +#endif
> > >
> > >  		mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
> > >  		mb->ol_flags |= ol_flags;
> > > --
> > > 2.13.6
  
Ananyev, Konstantin Nov. 20, 2018, 10:53 p.m. UTC | #5
> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Tuesday, November 20, 2018 5:26 PM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>; Wiles, Keith <keith.wiles@intel.com>
> Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger, Bernard <bernard.iremonger@intel.com>; stable@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH] app/testpmd: improve MAC swap performance
> 
> 
> 
> > -----Original Message-----
> > From: Zhang, Qi Z
> > Sent: Tuesday, November 20, 2018 4:58 PM
> > To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson, Bruce <bruce.richardson@intel.com>; Wiles, Keith
> > <keith.wiles@intel.com>
> > Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger, Bernard <bernard.iremonger@intel.com>; stable@dpdk.org
> > Subject: RE: [dpdk-dev] [PATCH] app/testpmd: improve MAC swap performance
> >
> >
> >
> > > -----Original Message-----
> > > From: Ananyev, Konstantin
> > > Sent: Tuesday, November 20, 2018 1:17 AM
> > > To: Zhang, Qi Z <qi.z.zhang@intel.com>; Richardson, Bruce
> > > <bruce.richardson@intel.com>; Wiles, Keith <keith.wiles@intel.com>
> > > Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger, Bernard
> > > <bernard.iremonger@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>;
> > > stable@dpdk.org
> > > Subject: RE: [dpdk-dev] [PATCH] app/testpmd: improve MAC swap performance
> > >
> > > Hi Qi,
> > >
> > > > -----Original Message-----
> > > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Qi Zhang
> > > > Sent: Tuesday, November 20, 2018 4:46 AM
> > > > To: Richardson, Bruce <bruce.richardson@intel.com>; Wiles, Keith
> > > > <keith.wiles@intel.com>
> > > > Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger,
> > > > Bernard <bernard.iremonger@intel.com>; Zhang, Qi Z
> > > > <qi.z.zhang@intel.com>; stable@dpdk.org
> > > > Subject: [dpdk-dev] [PATCH] app/testpmd: improve MAC swap performance
> > > >
> > > > The patch optimizes the mac swap operation by taking advantage of SSE
> > > > instructions, it only impacts x86 platform.
> > > >
> > > > Cc: stable@dpdk.org
> > > >
> > > > Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> > > > ---
> > > >  app/test-pmd/macswap.c | 16 +++++++++++++++-
> > > >  1 file changed, 15 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c index
> > > > a8384d5b8..0722782b0 100644
> > > > --- a/app/test-pmd/macswap.c
> > > > +++ b/app/test-pmd/macswap.c
> > > > @@ -78,7 +78,6 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> > > >  	struct rte_port  *txp;
> > > >  	struct rte_mbuf  *mb;
> > > >  	struct ether_hdr *eth_hdr;
> > > > -	struct ether_addr addr;
> > > >  	uint16_t nb_rx;
> > > >  	uint16_t nb_tx;
> > > >  	uint16_t i;
> > > > @@ -95,6 +94,15 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> > > >  	start_tsc = rte_rdtsc();
> > > >  #endif
> > > >
> > > > +#ifdef RTE_ARCH_X86
> > > > +	__m128i addr;
> > > > +	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
> > > > +					5, 4, 3, 2,
> > > > +					1, 0, 11, 10,
> > > > +					9, 8, 7, 6);
> > > > +#else
> > > > +	struct ether_addr addr;
> > > > +#endif
> > >
> > > I think it would better to place IA specific code into a separate fnction (and
> > > probably into a separate .h file).
> >
> > OK, I will think about how to rework this.
> 
> Ideally would be good to have an generic one, and IA optimized version.
> 
> >
> > > BTW, just curious what % of improvement it gives?
> >
> > So far , the only server I can test is a 1.6GHz Broadwell server with 2 ports on 1 i40e 25G.
> > The macswap performance is increase from 16.8mpps to 20mpps (about 19% improvement)
> 
> Quite a lot, definitely looks like worth it.

You probably can squeeze few more cycles doing it in bulks of 4 or so.
Konstantin
  
Qi Zhang Nov. 21, 2018, 9:24 p.m. UTC | #6
> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Tuesday, November 20, 2018 2:54 PM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>; Richardson, Bruce
> <bruce.richardson@intel.com>; Wiles, Keith <keith.wiles@intel.com>
> Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger, Bernard
> <bernard.iremonger@intel.com>; stable@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH] app/testpmd: improve MAC swap performance
> 
> 
> 
> > -----Original Message-----
> > From: Ananyev, Konstantin
> > Sent: Tuesday, November 20, 2018 5:26 PM
> > To: Zhang, Qi Z <qi.z.zhang@intel.com>; Richardson, Bruce
> > <bruce.richardson@intel.com>; Wiles, Keith <keith.wiles@intel.com>
> > Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger,
> > Bernard <bernard.iremonger@intel.com>; stable@dpdk.org
> > Subject: RE: [dpdk-dev] [PATCH] app/testpmd: improve MAC swap
> > performance
> >
> >
> >
> > > -----Original Message-----
> > > From: Zhang, Qi Z
> > > Sent: Tuesday, November 20, 2018 4:58 PM
> > > To: Ananyev, Konstantin <konstantin.ananyev@intel.com>; Richardson,
> > > Bruce <bruce.richardson@intel.com>; Wiles, Keith
> > > <keith.wiles@intel.com>
> > > Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger,
> > > Bernard <bernard.iremonger@intel.com>; stable@dpdk.org
> > > Subject: RE: [dpdk-dev] [PATCH] app/testpmd: improve MAC swap
> > > performance
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: Ananyev, Konstantin
> > > > Sent: Tuesday, November 20, 2018 1:17 AM
> > > > To: Zhang, Qi Z <qi.z.zhang@intel.com>; Richardson, Bruce
> > > > <bruce.richardson@intel.com>; Wiles, Keith <keith.wiles@intel.com>
> > > > Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger,
> > > > Bernard <bernard.iremonger@intel.com>; Zhang, Qi Z
> > > > <qi.z.zhang@intel.com>; stable@dpdk.org
> > > > Subject: RE: [dpdk-dev] [PATCH] app/testpmd: improve MAC swap
> > > > performance
> > > >
> > > > Hi Qi,
> > > >
> > > > > -----Original Message-----
> > > > > From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Qi Zhang
> > > > > Sent: Tuesday, November 20, 2018 4:46 AM
> > > > > To: Richardson, Bruce <bruce.richardson@intel.com>; Wiles, Keith
> > > > > <keith.wiles@intel.com>
> > > > > Cc: dev@dpdk.org; Lu, Wenzhuo <wenzhuo.lu@intel.com>; Iremonger,
> > > > > Bernard <bernard.iremonger@intel.com>; Zhang, Qi Z
> > > > > <qi.z.zhang@intel.com>; stable@dpdk.org
> > > > > Subject: [dpdk-dev] [PATCH] app/testpmd: improve MAC swap
> > > > > performance
> > > > >
> > > > > The patch optimizes the mac swap operation by taking advantage
> > > > > of SSE instructions, it only impacts x86 platform.
> > > > >
> > > > > Cc: stable@dpdk.org
> > > > >
> > > > > Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> > > > > ---
> > > > >  app/test-pmd/macswap.c | 16 +++++++++++++++-
> > > > >  1 file changed, 15 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
> > > > > index
> > > > > a8384d5b8..0722782b0 100644
> > > > > --- a/app/test-pmd/macswap.c
> > > > > +++ b/app/test-pmd/macswap.c
> > > > > @@ -78,7 +78,6 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> > > > >  	struct rte_port  *txp;
> > > > >  	struct rte_mbuf  *mb;
> > > > >  	struct ether_hdr *eth_hdr;
> > > > > -	struct ether_addr addr;
> > > > >  	uint16_t nb_rx;
> > > > >  	uint16_t nb_tx;
> > > > >  	uint16_t i;
> > > > > @@ -95,6 +94,15 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> > > > >  	start_tsc = rte_rdtsc();
> > > > >  #endif
> > > > >
> > > > > +#ifdef RTE_ARCH_X86
> > > > > +	__m128i addr;
> > > > > +	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
> > > > > +					5, 4, 3, 2,
> > > > > +					1, 0, 11, 10,
> > > > > +					9, 8, 7, 6);
> > > > > +#else
> > > > > +	struct ether_addr addr;
> > > > > +#endif
> > > >
> > > > I think it would better to place IA specific code into a separate
> > > > fnction (and probably into a separate .h file).
> > >
> > > OK, I will think about how to rework this.
> >
> > Ideally would be good to have an generic one, and IA optimized version.
> >
> > >
> > > > BTW, just curious what % of improvement it gives?
> > >
> > > So far , the only server I can test is a 1.6GHz Broadwell server with 2 ports on
> 1 i40e 25G.
> > > The macswap performance is increase from 16.8mpps to 20mpps (about
> > > 19% improvement)

I need to add a notice here, I found previous test is running on CPU from remote socket.
For the test on CPU from local socket on the same server, actually the mac swap performance is improved from 23.34 to 26.36, its about 12.9% increase, but still considerable.

> >
> > Quite a lot, definitely looks like worth it.
> 
> You probably can squeeze few more cycles doing it in bulks of 4 or so.

it's a good idea, based on my experience I can get more than 4% increase by batch with 4, 
it can reach 27.46mpps, so now its 17.7% increase, I will send patch later, please help to polish:)

Thanks
Qi

> Konstantin
  
Wiles, Keith Nov. 23, 2018, 10:43 p.m. UTC | #7
> On Nov 19, 2018, at 10:45 PM, Zhang, Qi Z <qi.z.zhang@intel.com> wrote:
> 
> The patch optimizes the mac swap operation by taking advantage
> of SSE instructions, it only impacts x86 platform.
> 
> Cc: stable@dpdk.org
> 
> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> ---
> app/test-pmd/macswap.c | 16 +++++++++++++++-
> 1 file changed, 15 insertions(+), 1 deletion(-)
> 
> diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
> index a8384d5b8..0722782b0 100644
> --- a/app/test-pmd/macswap.c
> +++ b/app/test-pmd/macswap.c
> @@ -78,7 +78,6 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> 	struct rte_port  *txp;
> 	struct rte_mbuf  *mb;
> 	struct ether_hdr *eth_hdr;
> -	struct ether_addr addr;
> 	uint16_t nb_rx;
> 	uint16_t nb_tx;
> 	uint16_t i;
> @@ -95,6 +94,15 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> 	start_tsc = rte_rdtsc();
> #endif
> 
> +#ifdef RTE_ARCH_X86
> +	__m128i addr;
> +	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
> +					5, 4, 3, 2,
> +					1, 0, 11, 10,
> +					9, 8, 7, 6);

I was playing around with these mask values and I was not able to make it work as I expected.
I ended up with different values in the mask.

_mm_set_epi8(15, 14, 13, 12, 5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6);

After dumping the memory for a large number of tests this one seems correct, can you verify your mask is correct?

> +#else
> +	struct ether_addr addr;
> +#endif
> 	/*
> 	 * Receive a burst of packets and forward them.
> 	 */
> @@ -123,9 +131,15 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> 		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
> 
> 		/* Swap dest and src mac addresses. */
> +#ifdef RTE_ARCH_X86
> +		addr = _mm_loadu_si128((__m128i *)eth_hdr);
> +		addr = _mm_shuffle_epi8(addr, shfl_msk);
> +		_mm_storeu_si128((__m128i *)eth_hdr, addr);
> +#else
> 		ether_addr_copy(&eth_hdr->d_addr, &addr);
> 		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
> 		ether_addr_copy(&addr, &eth_hdr->s_addr);
> +#endif
> 
> 		mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
> 		mb->ol_flags |= ol_flags;
> -- 
> 2.13.6
> 

Regards,
Keith
  
Wiles, Keith Nov. 23, 2018, 10:43 p.m. UTC | #8
> On Nov 19, 2018, at 10:45 PM, Zhang, Qi Z <qi.z.zhang@intel.com> wrote:
> 
> The patch optimizes the mac swap operation by taking advantage
> of SSE instructions, it only impacts x86 platform.
> 
> Cc: stable@dpdk.org
> 
> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> ---
> app/test-pmd/macswap.c | 16 +++++++++++++++-
> 1 file changed, 15 insertions(+), 1 deletion(-)
> 
> diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
> index a8384d5b8..0722782b0 100644
> --- a/app/test-pmd/macswap.c
> +++ b/app/test-pmd/macswap.c
> @@ -78,7 +78,6 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> 	struct rte_port  *txp;
> 	struct rte_mbuf  *mb;
> 	struct ether_hdr *eth_hdr;
> -	struct ether_addr addr;
> 	uint16_t nb_rx;
> 	uint16_t nb_tx;
> 	uint16_t i;
> @@ -95,6 +94,15 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> 	start_tsc = rte_rdtsc();
> #endif
> 
> +#ifdef RTE_ARCH_X86
> +	__m128i addr;
> +	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
> +					5, 4, 3, 2,
> +					1, 0, 11, 10,
> +					9, 8, 7, 6);

I was playing around with these mask values and I was not able to make it work as I expected.
I ended up with different values in the mask.

_mm_set_epi8(15, 14, 13, 12, 5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6);

After dumping the memory for a large number of tests this one seems correct, can you verify your mask is correct?

> +#else
> +	struct ether_addr addr;
> +#endif
> 	/*
> 	 * Receive a burst of packets and forward them.
> 	 */
> @@ -123,9 +131,15 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> 		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
> 
> 		/* Swap dest and src mac addresses. */
> +#ifdef RTE_ARCH_X86
> +		addr = _mm_loadu_si128((__m128i *)eth_hdr);
> +		addr = _mm_shuffle_epi8(addr, shfl_msk);
> +		_mm_storeu_si128((__m128i *)eth_hdr, addr);
> +#else
> 		ether_addr_copy(&eth_hdr->d_addr, &addr);
> 		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
> 		ether_addr_copy(&addr, &eth_hdr->s_addr);
> +#endif
> 
> 		mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
> 		mb->ol_flags |= ol_flags;
> -- 
> 2.13.6
> 

Regards,
Keith
  
Wiles, Keith Nov. 24, 2018, 4:24 p.m. UTC | #9
> On Nov 23, 2018, at 4:43 PM, Wiles, Keith <keith.wiles@intel.com> wrote:
> 
> 
> 
>> On Nov 19, 2018, at 10:45 PM, Zhang, Qi Z <qi.z.zhang@intel.com> wrote:
>> 
>> The patch optimizes the mac swap operation by taking advantage
>> of SSE instructions, it only impacts x86 platform.
>> 
>> Cc: stable@dpdk.org
>> 
>> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
>> ---
>> app/test-pmd/macswap.c | 16 +++++++++++++++-
>> 1 file changed, 15 insertions(+), 1 deletion(-)
>> 
>> diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
>> index a8384d5b8..0722782b0 100644
>> --- a/app/test-pmd/macswap.c
>> +++ b/app/test-pmd/macswap.c
>> @@ -78,7 +78,6 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
>> 	struct rte_port  *txp;
>> 	struct rte_mbuf  *mb;
>> 	struct ether_hdr *eth_hdr;
>> -	struct ether_addr addr;
>> 	uint16_t nb_rx;
>> 	uint16_t nb_tx;
>> 	uint16_t i;
>> @@ -95,6 +94,15 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
>> 	start_tsc = rte_rdtsc();
>> #endif
>> 
>> +#ifdef RTE_ARCH_X86
>> +	__m128i addr;
>> +	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
>> +					5, 4, 3, 2,
>> +					1, 0, 11, 10,
>> +					9, 8, 7, 6);
> 
> I was playing around with these mask values and I was not able to make it work as I expected.
> I ended up with different values in the mask.
> 
> _mm_set_epi8(15, 14, 13, 12, 5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6);
> 
> After dumping the memory for a large number of tests this one seems correct, can you verify your mask is correct?

Sorry, I do not know why I thought the code was not the same, but your example is correct my mistake.
> 
>> +#else
>> +	struct ether_addr addr;
>> +#endif
>> 	/*
>> 	 * Receive a burst of packets and forward them.
>> 	 */
>> @@ -123,9 +131,15 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
>> 		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
>> 
>> 		/* Swap dest and src mac addresses. */
>> +#ifdef RTE_ARCH_X86
>> +		addr = _mm_loadu_si128((__m128i *)eth_hdr);
>> +		addr = _mm_shuffle_epi8(addr, shfl_msk);
>> +		_mm_storeu_si128((__m128i *)eth_hdr, addr);
>> +#else
>> 		ether_addr_copy(&eth_hdr->d_addr, &addr);
>> 		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
>> 		ether_addr_copy(&addr, &eth_hdr->s_addr);
>> +#endif
>> 
>> 		mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
>> 		mb->ol_flags |= ol_flags;
>> -- 
>> 2.13.6
>> 
> 
> Regards,
> Keith

Regards,
Keith
  
Qi Zhang Nov. 27, 2018, 1:06 a.m. UTC | #10
> -----Original Message-----
> From: Wiles, Keith
> Sent: Saturday, November 24, 2018 8:24 AM
> To: Zhang, Qi Z <qi.z.zhang@intel.com>
> Cc: Richardson, Bruce <bruce.richardson@intel.com>; dev <dev@dpdk.org>; Lu,
> Wenzhuo <wenzhuo.lu@intel.com>; Iremonger, Bernard
> <bernard.iremonger@intel.com>; stable@dpdk.org
> Subject: Re: [PATCH] app/testpmd: improve MAC swap performance
> 
> 
> 
> > On Nov 23, 2018, at 4:43 PM, Wiles, Keith <keith.wiles@intel.com> wrote:
> >
> >
> >
> >> On Nov 19, 2018, at 10:45 PM, Zhang, Qi Z <qi.z.zhang@intel.com> wrote:
> >>
> >> The patch optimizes the mac swap operation by taking advantage of SSE
> >> instructions, it only impacts x86 platform.
> >>
> >> Cc: stable@dpdk.org
> >>
> >> Signed-off-by: Qi Zhang <qi.z.zhang@intel.com>
> >> ---
> >> app/test-pmd/macswap.c | 16 +++++++++++++++-
> >> 1 file changed, 15 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c index
> >> a8384d5b8..0722782b0 100644
> >> --- a/app/test-pmd/macswap.c
> >> +++ b/app/test-pmd/macswap.c
> >> @@ -78,7 +78,6 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> >> 	struct rte_port  *txp;
> >> 	struct rte_mbuf  *mb;
> >> 	struct ether_hdr *eth_hdr;
> >> -	struct ether_addr addr;
> >> 	uint16_t nb_rx;
> >> 	uint16_t nb_tx;
> >> 	uint16_t i;
> >> @@ -95,6 +94,15 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> >> 	start_tsc = rte_rdtsc();
> >> #endif
> >>
> >> +#ifdef RTE_ARCH_X86
> >> +	__m128i addr;
> >> +	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
> >> +					5, 4, 3, 2,
> >> +					1, 0, 11, 10,
> >> +					9, 8, 7, 6);
> >
> > I was playing around with these mask values and I was not able to make it
> work as I expected.
> > I ended up with different values in the mask.
> >
> > _mm_set_epi8(15, 14, 13, 12, 5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6);
> >
> > After dumping the memory for a large number of tests this one seems correct,
> can you verify your mask is correct?
> 
> Sorry, I do not know why I thought the code was not the same, but your
> example is correct my mistake.

Thanks for review and verify this!

Regards
Qi

> >
> >> +#else
> >> +	struct ether_addr addr;
> >> +#endif
> >> 	/*
> >> 	 * Receive a burst of packets and forward them.
> >> 	 */
> >> @@ -123,9 +131,15 @@ pkt_burst_mac_swap(struct fwd_stream *fs)
> >> 		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
> >>
> >> 		/* Swap dest and src mac addresses. */
> >> +#ifdef RTE_ARCH_X86
> >> +		addr = _mm_loadu_si128((__m128i *)eth_hdr);
> >> +		addr = _mm_shuffle_epi8(addr, shfl_msk);
> >> +		_mm_storeu_si128((__m128i *)eth_hdr, addr); #else
> >> 		ether_addr_copy(&eth_hdr->d_addr, &addr);
> >> 		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
> >> 		ether_addr_copy(&addr, &eth_hdr->s_addr);
> >> +#endif
> >>
> >> 		mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
> >> 		mb->ol_flags |= ol_flags;
> >> --
> >> 2.13.6
> >>
> >
> > Regards,
> > Keith
> 
> Regards,
> Keith
  

Patch

diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index a8384d5b8..0722782b0 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -78,7 +78,6 @@  pkt_burst_mac_swap(struct fwd_stream *fs)
 	struct rte_port  *txp;
 	struct rte_mbuf  *mb;
 	struct ether_hdr *eth_hdr;
-	struct ether_addr addr;
 	uint16_t nb_rx;
 	uint16_t nb_tx;
 	uint16_t i;
@@ -95,6 +94,15 @@  pkt_burst_mac_swap(struct fwd_stream *fs)
 	start_tsc = rte_rdtsc();
 #endif
 
+#ifdef RTE_ARCH_X86
+	__m128i addr;
+	__m128i shfl_msk = _mm_set_epi8(15, 14, 13, 12,
+					5, 4, 3, 2,
+					1, 0, 11, 10,
+					9, 8, 7, 6);
+#else
+	struct ether_addr addr;
+#endif
 	/*
 	 * Receive a burst of packets and forward them.
 	 */
@@ -123,9 +131,15 @@  pkt_burst_mac_swap(struct fwd_stream *fs)
 		eth_hdr = rte_pktmbuf_mtod(mb, struct ether_hdr *);
 
 		/* Swap dest and src mac addresses. */
+#ifdef RTE_ARCH_X86
+		addr = _mm_loadu_si128((__m128i *)eth_hdr);
+		addr = _mm_shuffle_epi8(addr, shfl_msk);
+		_mm_storeu_si128((__m128i *)eth_hdr, addr);
+#else
 		ether_addr_copy(&eth_hdr->d_addr, &addr);
 		ether_addr_copy(&eth_hdr->s_addr, &eth_hdr->d_addr);
 		ether_addr_copy(&addr, &eth_hdr->s_addr);
+#endif
 
 		mb->ol_flags &= IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF;
 		mb->ol_flags |= ol_flags;