acl: fix build issue with some arm64 compiler

Message ID 20190606145054.39995-1-jerinj@marvell.com (mailing list archive)
State Superseded, archived
Headers
Series acl: fix build issue with some arm64 compiler |

Checks

Context Check Description
ci/intel-Performance-Testing success Performance Testing PASS
ci/mellanox-Performance-Testing success Performance Testing PASS
ci/Intel-compilation success Compilation OK
ci/checkpatch warning coding style issues

Commit Message

Jerin Jacob Kollanukkaran June 6, 2019, 2:50 p.m. UTC
  From: Jerin Jacob <jerinj@marvell.com>

Some compilers reporting the following error, though the existing
code doesn't have any uninitialized variable case.
Just to make compiler happy, initialize the int32x4_t variable
one shot in C language.

../lib/librte_acl/acl_run_neon.h: In function 'search_neon_4'
../lib/librte_acl/acl_run_neon.h:230:12: error: 'input' may be
used uninitialized in this function [-Werror=maybe-uninitialized]
  int32x4_t input;

Fixes: 34fa6c27c156 ("acl: add NEON optimization for ARMv8")
Cc: stable@dpdk.org

Signed-off-by: Jerin Jacob <jerinj@marvell.com>
---
 lib/librte_acl/acl_run_neon.h | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)
  

Comments

Michael Santana June 6, 2019, 3:55 p.m. UTC | #1
On 6/6/19 10:50 AM, jerinj@marvell.com wrote:
> From: Jerin Jacob <jerinj@marvell.com>
>
> Some compilers reporting the following error, though the existing
> code doesn't have any uninitialized variable case.
> Just to make compiler happy, initialize the int32x4_t variable
> one shot in C language.
>
> ../lib/librte_acl/acl_run_neon.h: In function 'search_neon_4'
> ../lib/librte_acl/acl_run_neon.h:230:12: error: 'input' may be
> used uninitialized in this function [-Werror=maybe-uninitialized]
>    int32x4_t input;
>
> Fixes: 34fa6c27c156 ("acl: add NEON optimization for ARMv8")
> Cc: stable@dpdk.org
>
> Signed-off-by: Jerin Jacob <jerinj@marvell.com>
> ---
>   lib/librte_acl/acl_run_neon.h | 29 ++++++++++++-----------------
>   1 file changed, 12 insertions(+), 17 deletions(-)
>
> diff --git a/lib/librte_acl/acl_run_neon.h b/lib/librte_acl/acl_run_neon.h
> index 01b9766d8..dc9e9efe9 100644
> --- a/lib/librte_acl/acl_run_neon.h
> +++ b/lib/librte_acl/acl_run_neon.h
> @@ -165,7 +165,6 @@ search_neon_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
>   	uint64_t index_array[8];
>   	struct completion cmplt[8];
>   	struct parms parms[8];
> -	int32x4_t input0, input1;
>   
>   	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
>   		     total_packets, categories, ctx->trans_table);
> @@ -181,17 +180,14 @@ search_neon_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
>   
>   	while (flows.started > 0) {
>   		/* Gather 4 bytes of input data for each stream. */
> -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input0, 0);
> -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 4), input1, 0);
> -
> -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input0, 1);
> -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 5), input1, 1);
> -
> -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input0, 2);
> -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 6), input1, 2);
> -
> -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input0, 3);
> -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 7), input1, 3);
> +		int32x4_t input0 = {GET_NEXT_4BYTES(parms, 0),
> +				    GET_NEXT_4BYTES(parms, 1),
> +				    GET_NEXT_4BYTES(parms, 2),
> +				    GET_NEXT_4BYTES(parms, 3)};
> +		int32x4_t input1 = {GET_NEXT_4BYTES(parms, 4),
> +				    GET_NEXT_4BYTES(parms, 5),
> +				    GET_NEXT_4BYTES(parms, 6),
> +				    GET_NEXT_4BYTES(parms, 7)};
>   
>   		/* Process the 4 bytes of input on each stream. */
>   
> @@ -227,7 +223,6 @@ search_neon_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
>   	uint64_t index_array[4];
>   	struct completion cmplt[4];
>   	struct parms parms[4];
> -	int32x4_t input;
>   
>   	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
>   		     total_packets, categories, ctx->trans_table);
> @@ -242,10 +237,10 @@ search_neon_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
>   
>   	while (flows.started > 0) {
>   		/* Gather 4 bytes of input data for each stream. */
> -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input, 0);
> -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input, 1);
> -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input, 2);
> -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input, 3);
> +		int32x4_t input = {GET_NEXT_4BYTES(parms, 0),
> +				   GET_NEXT_4BYTES(parms, 1),
> +				   GET_NEXT_4BYTES(parms, 2),
> +				   GET_NEXT_4BYTES(parms, 3)};
>   
>   		/* Process the 4 bytes of input on each stream. */
>   		input = transition4(input, flows.trans, index_array);

Fixed on travis: https://travis-ci.com/Maickii/dpdk-2/builds/114612090

Acked-by: Michael Santana <msantana@redhat.com>
  
Honnappa Nagarahalli June 7, 2019, 5:35 a.m. UTC | #2
> Subject: [dpdk-dev] [PATCH] acl: fix build issue with some arm64 compiler
> 
> From: Jerin Jacob <jerinj@marvell.com>
> 
> Some compilers reporting the following error, though the existing code
> doesn't have any uninitialized variable case.
> Just to make compiler happy, initialize the int32x4_t variable one shot in C
> language.
> 
> ../lib/librte_acl/acl_run_neon.h: In function 'search_neon_4'
> ../lib/librte_acl/acl_run_neon.h:230:12: error: 'input' may be used
> uninitialized in this function [-Werror=maybe-uninitialized]
>   int32x4_t input;
> 
> Fixes: 34fa6c27c156 ("acl: add NEON optimization for ARMv8")
> Cc: stable@dpdk.org
> 
> Signed-off-by: Jerin Jacob <jerinj@marvell.com>
> ---
>  lib/librte_acl/acl_run_neon.h | 29 ++++++++++++-----------------
>  1 file changed, 12 insertions(+), 17 deletions(-)
> 
> diff --git a/lib/librte_acl/acl_run_neon.h b/lib/librte_acl/acl_run_neon.h
> index 01b9766d8..dc9e9efe9 100644
> --- a/lib/librte_acl/acl_run_neon.h
> +++ b/lib/librte_acl/acl_run_neon.h
> @@ -165,7 +165,6 @@ search_neon_8(const struct rte_acl_ctx *ctx, const
> uint8_t **data,
>  	uint64_t index_array[8];
>  	struct completion cmplt[8];
>  	struct parms parms[8];
> -	int32x4_t input0, input1;
> 
>  	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
>  		     total_packets, categories, ctx->trans_table); @@ -181,17
> +180,14 @@ search_neon_8(const struct rte_acl_ctx *ctx, const uint8_t
> **data,
> 
>  	while (flows.started > 0) {
>  		/* Gather 4 bytes of input data for each stream. */
> -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0),
> input0, 0);
> -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 4),
> input1, 0);
> -
> -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1),
> input0, 1);
> -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 5),
> input1, 1);
> -
> -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2),
> input0, 2);
> -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 6),
> input1, 2);
> -
> -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3),
> input0, 3);
> -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 7),
> input1, 3);
> +		int32x4_t input0 = {GET_NEXT_4BYTES(parms, 0),
> +				    GET_NEXT_4BYTES(parms, 1),
> +				    GET_NEXT_4BYTES(parms, 2),
> +				    GET_NEXT_4BYTES(parms, 3)};
> +		int32x4_t input1 = {GET_NEXT_4BYTES(parms, 4),
> +				    GET_NEXT_4BYTES(parms, 5),
> +				    GET_NEXT_4BYTES(parms, 6),
> +				    GET_NEXT_4BYTES(parms, 7)};
> 
This mixes the use of NEON intrinsics with GCC vector extensions. ACLE (Arm C Language Extensions) specifically recommends not to mix the two methods in section 12.2.6. IMO, Aaron's suggestion of using a temp vector should be good.

>  		/* Process the 4 bytes of input on each stream. */
> 
> @@ -227,7 +223,6 @@ search_neon_4(const struct rte_acl_ctx *ctx, const
> uint8_t **data,
>  	uint64_t index_array[4];
>  	struct completion cmplt[4];
>  	struct parms parms[4];
> -	int32x4_t input;
> 
>  	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
>  		     total_packets, categories, ctx->trans_table); @@ -242,10
> +237,10 @@ search_neon_4(const struct rte_acl_ctx *ctx, const uint8_t
> **data,
> 
>  	while (flows.started > 0) {
>  		/* Gather 4 bytes of input data for each stream. */
> -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input,
> 0);
> -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input,
> 1);
> -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input,
> 2);
> -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input,
> 3);
> +		int32x4_t input = {GET_NEXT_4BYTES(parms, 0),
> +				   GET_NEXT_4BYTES(parms, 1),
> +				   GET_NEXT_4BYTES(parms, 2),
> +				   GET_NEXT_4BYTES(parms, 3)};
> 
>  		/* Process the 4 bytes of input on each stream. */
>  		input = transition4(input, flows.trans, index_array);
> --
> 2.21.0
  
Honnappa Nagarahalli June 7, 2019, 5:42 a.m. UTC | #3
On 6/6/19 10:50 AM, mailto:jerinj@marvell.com wrote:
From: Jerin Jacob mailto:jerinj@marvell.com

Some compilers reporting the following error, though the existing
code doesn't have any uninitialized variable case.
Just to make compiler happy, initialize the int32x4_t variable
one shot in C language.

../lib/librte_acl/acl_run_neon.h: In function 'search_neon_4'
../lib/librte_acl/acl_run_neon.h:230:12: error: 'input' may be
used uninitialized in this function [-Werror=maybe-uninitialized]
  int32x4_t input;

Fixes: 34fa6c27c156 ("acl: add NEON optimization for ARMv8")
Cc: mailto:stable@dpdk.org

Signed-off-by: Jerin Jacob mailto:jerinj@marvell.com
---
 lib/librte_acl/acl_run_neon.h | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/lib/librte_acl/acl_run_neon.h b/lib/librte_acl/acl_run_neon.h
index 01b9766d8..dc9e9efe9 100644
--- a/lib/librte_acl/acl_run_neon.h
+++ b/lib/librte_acl/acl_run_neon.h
@@ -165,7 +165,6 @@ search_neon_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	uint64_t index_array[8];
 	struct completion cmplt[8];
 	struct parms parms[8];
-	int32x4_t input0, input1;
 
 	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
 		     total_packets, categories, ctx->trans_table);
@@ -181,17 +180,14 @@ search_neon_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
 	while (flows.started > 0) {
 		/* Gather 4 bytes of input data for each stream. */
-		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input0, 0);
-		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 4), input1, 0);
-
-		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input0, 1);
-		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 5), input1, 1);
-
-		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input0, 2);
-		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 6), input1, 2);
-
-		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input0, 3);
-		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 7), input1, 3);
+		int32x4_t input0 = {GET_NEXT_4BYTES(parms, 0),
+				    GET_NEXT_4BYTES(parms, 1),
+				    GET_NEXT_4BYTES(parms, 2),
+				    GET_NEXT_4BYTES(parms, 3)};
+		int32x4_t input1 = {GET_NEXT_4BYTES(parms, 4),
+				    GET_NEXT_4BYTES(parms, 5),
+				    GET_NEXT_4BYTES(parms, 6),
+				    GET_NEXT_4BYTES(parms, 7)};
 
 		/* Process the 4 bytes of input on each stream. */
 
@@ -227,7 +223,6 @@ search_neon_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	uint64_t index_array[4];
 	struct completion cmplt[4];
 	struct parms parms[4];
-	int32x4_t input;
 
 	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
 		     total_packets, categories, ctx->trans_table);
@@ -242,10 +237,10 @@ search_neon_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
 	while (flows.started > 0) {
 		/* Gather 4 bytes of input data for each stream. */
-		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input, 0);
-		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input, 1);
-		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input, 2);
-		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input, 3);
+		int32x4_t input = {GET_NEXT_4BYTES(parms, 0),
+				   GET_NEXT_4BYTES(parms, 1),
+				   GET_NEXT_4BYTES(parms, 2),
+				   GET_NEXT_4BYTES(parms, 3)};
 
 		/* Process the 4 bytes of input on each stream. */
 		input = transition4(input, flows.trans, index_array);
Fixed on travis: https://travis-ci.com/Maickii/dpdk-2/builds/114612090
Acked-by: Michael Santana mailto:msantana@redhat.com

[Honnappa] Prefer to go with Aaron's patch with a temp variable for setting the first lane. Mixing of NEON intrinsics and GCC vector extensions is not recommended as per Arm C Language Extensions guide 12.2.6
  
Jerin Jacob Kollanukkaran June 7, 2019, 6:21 a.m. UTC | #4
> -----Original Message-----
> From: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Sent: Friday, June 7, 2019 11:05 AM
> To: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>; msantana@redhat.com; aconole@redhat.com; Jerin
> Jacob Kollanukkaran <jerinj@marvell.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; stable@dpdk.org; nd <nd@arm.com>
> Subject: [EXT] RE: [dpdk-dev] [PATCH] acl: fix build issue with some arm64
> compiler
> 
> ----------------------------------------------------------------------
> > Subject: [dpdk-dev] [PATCH] acl: fix build issue with some arm64
> > compiler
> >
> > From: Jerin Jacob <jerinj@marvell.com>
> >
> > Some compilers reporting the following error, though the existing code
> > doesn't have any uninitialized variable case.
> > Just to make compiler happy, initialize the int32x4_t variable one
> > shot in C language.
> >
> > ../lib/librte_acl/acl_run_neon.h: In function 'search_neon_4'
> > ../lib/librte_acl/acl_run_neon.h:230:12: error: 'input' may be used
> > uninitialized in this function [-Werror=maybe-uninitialized]
> >   int32x4_t input;
> >
> > Fixes: 34fa6c27c156 ("acl: add NEON optimization for ARMv8")
> > Cc: stable@dpdk.org
> >
> > Signed-off-by: Jerin Jacob <jerinj@marvell.com>
> > ---
> >  lib/librte_acl/acl_run_neon.h | 29 ++++++++++++-----------------
> >  1 file changed, 12 insertions(+), 17 deletions(-)
> >
> > diff --git a/lib/librte_acl/acl_run_neon.h
> > b/lib/librte_acl/acl_run_neon.h index 01b9766d8..dc9e9efe9 100644
> > --- a/lib/librte_acl/acl_run_neon.h
> > +++ b/lib/librte_acl/acl_run_neon.h
> > @@ -165,7 +165,6 @@ search_neon_8(const struct rte_acl_ctx *ctx, const
> > uint8_t **data,
> >  	uint64_t index_array[8];
> >  	struct completion cmplt[8];
> >  	struct parms parms[8];
> > -	int32x4_t input0, input1;
> >
> >  	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
> >  		     total_packets, categories, ctx->trans_table); @@ -181,17
> > +180,14 @@ search_neon_8(const struct rte_acl_ctx *ctx, const uint8_t
> > **data,
> >
> >  	while (flows.started > 0) {
> >  		/* Gather 4 bytes of input data for each stream. */
> > -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0),
> > input0, 0);
> > -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 4),
> > input1, 0);
> > -
> > -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1),
> > input0, 1);
> > -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 5),
> > input1, 1);
> > -
> > -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2),
> > input0, 2);
> > -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 6),
> > input1, 2);
> > -
> > -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3),
> > input0, 3);
> > -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 7),
> > input1, 3);
> > +		int32x4_t input0 = {GET_NEXT_4BYTES(parms, 0),
> > +				    GET_NEXT_4BYTES(parms, 1),
> > +				    GET_NEXT_4BYTES(parms, 2),
> > +				    GET_NEXT_4BYTES(parms, 3)};
> > +		int32x4_t input1 = {GET_NEXT_4BYTES(parms, 4),
> > +				    GET_NEXT_4BYTES(parms, 5),
> > +				    GET_NEXT_4BYTES(parms, 6),
> > +				    GET_NEXT_4BYTES(parms, 7)};
> >
> This mixes the use of NEON intrinsics with GCC vector extensions. ACLE (Arm C
> Language Extensions) specifically recommends not to mix the two methods in
> section 12.2.6. IMO, Aaron's suggestion of using a temp vector should be good.

We are using this pattern across DPDK and SSE for x86 as well.
https://git.dpdk.org/dpdk/tree/drivers/net/i40e/i40e_rxtx_vec_neon.c#n91

Since it used in fastpath, a temp variable would be additional cost for no reason.
If GCC supports it then I think it is fine, I think, above usage matters with C++ portability.


> 
> >  		/* Process the 4 bytes of input on each stream. */
> >
> > @@ -227,7 +223,6 @@ search_neon_4(const struct rte_acl_ctx *ctx, const
> > uint8_t **data,
> >  	uint64_t index_array[4];
> >  	struct completion cmplt[4];
> >  	struct parms parms[4];
> > -	int32x4_t input;
> >
> >  	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
> >  		     total_packets, categories, ctx->trans_table); @@ -242,10
> > +237,10 @@ search_neon_4(const struct rte_acl_ctx *ctx, const uint8_t
> > **data,
> >
> >  	while (flows.started > 0) {
> >  		/* Gather 4 bytes of input data for each stream. */
> > -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input,
> > 0);
> > -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input,
> > 1);
> > -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input,
> > 2);
> > -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input,
> > 3);
> > +		int32x4_t input = {GET_NEXT_4BYTES(parms, 0),
> > +				   GET_NEXT_4BYTES(parms, 1),
> > +				   GET_NEXT_4BYTES(parms, 2),
> > +				   GET_NEXT_4BYTES(parms, 3)};
> >
> >  		/* Process the 4 bytes of input on each stream. */
> >  		input = transition4(input, flows.trans, index_array);
> > --
> > 2.21.0
  
Honnappa Nagarahalli June 10, 2019, 5:29 a.m. UTC | #5
> >
> > ----------------------------------------------------------------------
> > > Subject: [dpdk-dev] [PATCH] acl: fix build issue with some arm64
> > > compiler
> > >
> > > From: Jerin Jacob <jerinj@marvell.com>
> > >
> > > Some compilers reporting the following error, though the existing
> > > code doesn't have any uninitialized variable case.
> > > Just to make compiler happy, initialize the int32x4_t variable one
> > > shot in C language.
> > >
> > > ../lib/librte_acl/acl_run_neon.h: In function 'search_neon_4'
> > > ../lib/librte_acl/acl_run_neon.h:230:12: error: 'input' may be used
> > > uninitialized in this function [-Werror=maybe-uninitialized]
> > >   int32x4_t input;
> > >
> > > Fixes: 34fa6c27c156 ("acl: add NEON optimization for ARMv8")
> > > Cc: stable@dpdk.org
> > >
> > > Signed-off-by: Jerin Jacob <jerinj@marvell.com>
> > > ---
> > >  lib/librte_acl/acl_run_neon.h | 29 ++++++++++++-----------------
> > >  1 file changed, 12 insertions(+), 17 deletions(-)
> > >
> > > diff --git a/lib/librte_acl/acl_run_neon.h
> > > b/lib/librte_acl/acl_run_neon.h index 01b9766d8..dc9e9efe9 100644
> > > --- a/lib/librte_acl/acl_run_neon.h
> > > +++ b/lib/librte_acl/acl_run_neon.h
> > > @@ -165,7 +165,6 @@ search_neon_8(const struct rte_acl_ctx *ctx,
> > > const uint8_t **data,
> > >  	uint64_t index_array[8];
> > >  	struct completion cmplt[8];
> > >  	struct parms parms[8];
> > > -	int32x4_t input0, input1;
> > >
> > >  	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
> > >  		     total_packets, categories, ctx->trans_table); @@ -181,17
> > > +180,14 @@ search_neon_8(const struct rte_acl_ctx *ctx, const
> > > +uint8_t
> > > **data,
> > >
> > >  	while (flows.started > 0) {
> > >  		/* Gather 4 bytes of input data for each stream. */
> > > -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0),
> > > input0, 0);
> > > -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 4),
> > > input1, 0);
> > > -
> > > -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1),
> > > input0, 1);
> > > -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 5),
> > > input1, 1);
> > > -
> > > -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2),
> > > input0, 2);
> > > -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 6),
> > > input1, 2);
> > > -
> > > -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3),
> > > input0, 3);
> > > -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 7),
> > > input1, 3);
> > > +		int32x4_t input0 = {GET_NEXT_4BYTES(parms, 0),
> > > +				    GET_NEXT_4BYTES(parms, 1),
> > > +				    GET_NEXT_4BYTES(parms, 2),
> > > +				    GET_NEXT_4BYTES(parms, 3)};
> > > +		int32x4_t input1 = {GET_NEXT_4BYTES(parms, 4),
> > > +				    GET_NEXT_4BYTES(parms, 5),
> > > +				    GET_NEXT_4BYTES(parms, 6),
> > > +				    GET_NEXT_4BYTES(parms, 7)};
> > >
> > This mixes the use of NEON intrinsics with GCC vector extensions. ACLE
> > (Arm C Language Extensions) specifically recommends not to mix the two
> > methods in section 12.2.6. IMO, Aaron's suggestion of using a temp vector
> should be good.
> 
> We are using this pattern across DPDK and SSE for x86 as well.
> https://git.dpdk.org/dpdk/tree/drivers/net/i40e/i40e_rxtx_vec_neon.c#n91
I am not sure about x86, I have not looked at a document similar to ACLE for x86. IMO, it is not relevant here as this is Arm specific code.

> 
> Since it used in fastpath, a temp variable would be additional cost for no
> reason.
Then, I would suggest we can go with using 'vdupq_n_s32'.

> If GCC supports it then I think it is fine, I think, above usage matters with C++
> portability.
I did not understand the C++ portability part. Can you elaborate more?

> 
> 
> >
> > >  		/* Process the 4 bytes of input on each stream. */
> > >
> > > @@ -227,7 +223,6 @@ search_neon_4(const struct rte_acl_ctx *ctx,
> > > const uint8_t **data,
> > >  	uint64_t index_array[4];
> > >  	struct completion cmplt[4];
> > >  	struct parms parms[4];
> > > -	int32x4_t input;
> > >
> > >  	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
> > >  		     total_packets, categories, ctx->trans_table); @@ -242,10
> > > +237,10 @@ search_neon_4(const struct rte_acl_ctx *ctx, const
> > > +uint8_t
> > > **data,
> > >
> > >  	while (flows.started > 0) {
> > >  		/* Gather 4 bytes of input data for each stream. */
> > > -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input,
> > > 0);
> > > -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input,
> > > 1);
> > > -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input,
> > > 2);
> > > -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input,
> > > 3);
> > > +		int32x4_t input = {GET_NEXT_4BYTES(parms, 0),
> > > +				   GET_NEXT_4BYTES(parms, 1),
> > > +				   GET_NEXT_4BYTES(parms, 2),
> > > +				   GET_NEXT_4BYTES(parms, 3)};
> > >
> > >  		/* Process the 4 bytes of input on each stream. */
> > >  		input = transition4(input, flows.trans, index_array);
> > > --
> > > 2.21.0
  
Jerin Jacob Kollanukkaran June 10, 2019, 9:39 a.m. UTC | #6
> -----Original Message-----
> From: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Sent: Monday, June 10, 2019 11:00 AM
> To: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>; msantana@redhat.com; aconole@redhat.com;
> stable@dpdk.org; Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>;
> nd <nd@arm.com>; nd <nd@arm.com>
> Subject: [EXT] RE: [dpdk-dev] [PATCH] acl: fix build issue with some arm64
> compiler
> 
> > > --
> > > > Subject: [dpdk-dev] [PATCH] acl: fix build issue with some arm64
> > > > compiler
> > > >
> > > > From: Jerin Jacob <jerinj@marvell.com>
> > > >
> > > > Some compilers reporting the following error, though the existing
> > > > code doesn't have any uninitialized variable case.
> > > > Just to make compiler happy, initialize the int32x4_t variable one
> > > > shot in C language.
> > > >
> > > > ../lib/librte_acl/acl_run_neon.h: In function 'search_neon_4'
> > > > ../lib/librte_acl/acl_run_neon.h:230:12: error: 'input' may be
> > > > used uninitialized in this function [-Werror=maybe-uninitialized]
> > > >   int32x4_t input;
> > > >
> > > > Fixes: 34fa6c27c156 ("acl: add NEON optimization for ARMv8")
> > > > Cc: stable@dpdk.org
> > > >
> > > > Signed-off-by: Jerin Jacob <jerinj@marvell.com>
> > > > ---
> > > >  lib/librte_acl/acl_run_neon.h | 29 ++++++++++++-----------------
> > > >  1 file changed, 12 insertions(+), 17 deletions(-)
> > > >
> > > > diff --git a/lib/librte_acl/acl_run_neon.h
> > > > b/lib/librte_acl/acl_run_neon.h index 01b9766d8..dc9e9efe9 100644
> > > > --- a/lib/librte_acl/acl_run_neon.h
> > > > +++ b/lib/librte_acl/acl_run_neon.h
> > > > @@ -165,7 +165,6 @@ search_neon_8(const struct rte_acl_ctx *ctx,
> > > > const uint8_t **data,
> > > >  	uint64_t index_array[8];
> > > >  	struct completion cmplt[8];
> > > >  	struct parms parms[8];
> > > > -	int32x4_t input0, input1;
> > > >
> > > >  	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
> > > >  		     total_packets, categories, ctx->trans_table); @@ -181,17
> > > > +180,14 @@ search_neon_8(const struct rte_acl_ctx *ctx, const
> > > > +uint8_t
> > > > **data,
> > > >
> > > >  	while (flows.started > 0) {
> > > >  		/* Gather 4 bytes of input data for each stream. */
> > > > -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0),
> > > > input0, 0);
> > > > -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 4),
> > > > input1, 0);
> > > > -
> > > > -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1),
> > > > input0, 1);
> > > > -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 5),
> > > > input1, 1);
> > > > -
> > > > -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2),
> > > > input0, 2);
> > > > -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 6),
> > > > input1, 2);
> > > > -
> > > > -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3),
> > > > input0, 3);
> > > > -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 7),
> > > > input1, 3);
> > > > +		int32x4_t input0 = {GET_NEXT_4BYTES(parms, 0),
> > > > +				    GET_NEXT_4BYTES(parms, 1),
> > > > +				    GET_NEXT_4BYTES(parms, 2),
> > > > +				    GET_NEXT_4BYTES(parms, 3)};
> > > > +		int32x4_t input1 = {GET_NEXT_4BYTES(parms, 4),
> > > > +				    GET_NEXT_4BYTES(parms, 5),
> > > > +				    GET_NEXT_4BYTES(parms, 6),
> > > > +				    GET_NEXT_4BYTES(parms, 7)};
> > > >
> > > This mixes the use of NEON intrinsics with GCC vector extensions.
> > > ACLE (Arm C Language Extensions) specifically recommends not to mix
> > > the two methods in section 12.2.6. IMO, Aaron's suggestion of using
> > > a temp vector
> > should be good.
> >
> > We are using this pattern across DPDK and SSE for x86 as well.
> > https://git.dpdk.org/dpdk/tree/drivers/net/i40e/i40e_rxtx_vec_neon.c#n
> > 91
> I am not sure about x86, I have not looked at a document similar to ACLE for
> x86. IMO, it is not relevant here as this is Arm specific code.

What I meant was its been already used in DPDK for arm64.
https://git.dpdk.org/dpdk/tree/drivers/net/i40e/i40e_rxtx_vec_neon.c#n91

Please see offial page vector gcc gcc documentation. The examples are using this scheme.
https://gcc.gnu.org/onlinedocs/gcc/Vector-Extensions.html

This is to just create 'input' variable. I am fine to use any other scheme with out additional cost
of instructions.

> 
> >
> > Since it used in fastpath, a temp variable would be additional cost
> > for no reason.
> Then, I would suggest we can go with using 'vdupq_n_s32'.

We have to form uint64x2_t with 4 x uint32_t variable, How does 'vdupq_n_s32' help here?
Can you share code snippet without any temp variable?

> 
> > If GCC supports it then I think it is fine, I think, above usage
> > matters with C++ portability.
> I did not understand the C++ portability part. Can you elaborate more?
> 
> >
> >
> > >
> > > >  		/* Process the 4 bytes of input on each stream. */
> > > >
> > > > @@ -227,7 +223,6 @@ search_neon_4(const struct rte_acl_ctx *ctx,
> > > > const uint8_t **data,
> > > >  	uint64_t index_array[4];
> > > >  	struct completion cmplt[4];
> > > >  	struct parms parms[4];
> > > > -	int32x4_t input;
> > > >
> > > >  	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
> > > >  		     total_packets, categories, ctx->trans_table); @@ -242,10
> > > > +237,10 @@ search_neon_4(const struct rte_acl_ctx *ctx, const
> > > > +uint8_t
> > > > **data,
> > > >
> > > >  	while (flows.started > 0) {
> > > >  		/* Gather 4 bytes of input data for each stream. */
> > > > -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input,
> > > > 0);
> > > > -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input,
> > > > 1);
> > > > -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input,
> > > > 2);
> > > > -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input,
> > > > 3);
> > > > +		int32x4_t input = {GET_NEXT_4BYTES(parms, 0),
> > > > +				   GET_NEXT_4BYTES(parms, 1),
> > > > +				   GET_NEXT_4BYTES(parms, 2),
> > > > +				   GET_NEXT_4BYTES(parms, 3)};
> > > >
> > > >  		/* Process the 4 bytes of input on each stream. */
> > > >  		input = transition4(input, flows.trans, index_array);
> > > > --
> > > > 2.21.0
  
Aaron Conole June 10, 2019, 12:10 p.m. UTC | #7
<jerinj@marvell.com> writes:

> From: Jerin Jacob <jerinj@marvell.com>
>
> Some compilers reporting the following error, though the existing
> code doesn't have any uninitialized variable case.
> Just to make compiler happy, initialize the int32x4_t variable
> one shot in C language.
>
> ../lib/librte_acl/acl_run_neon.h: In function 'search_neon_4'
> ../lib/librte_acl/acl_run_neon.h:230:12: error: 'input' may be
> used uninitialized in this function [-Werror=maybe-uninitialized]
>   int32x4_t input;
>
> Fixes: 34fa6c27c156 ("acl: add NEON optimization for ARMv8")
> Cc: stable@dpdk.org
>
> Signed-off-by: Jerin Jacob <jerinj@marvell.com>
> ---

This pattern is easy to understand, congruent with other usages in the
code base, has good patch statistics, and solves the issue.

Acked-by: Aaron Conole <aconole@redhat.com>

I prefer this solution to the others posted.  Thanks for looking into
it, Jerin!
  
Honnappa Nagarahalli June 11, 2019, 1:27 a.m. UTC | #8
> > > > --
> > > > > Subject: [dpdk-dev] [PATCH] acl: fix build issue with some arm64
> > > > > compiler
> > > > >
> > > > > From: Jerin Jacob <jerinj@marvell.com>
> > > > >
> > > > > Some compilers reporting the following error, though the
> > > > > existing code doesn't have any uninitialized variable case.
> > > > > Just to make compiler happy, initialize the int32x4_t variable
> > > > > one shot in C language.
> > > > >
> > > > > ../lib/librte_acl/acl_run_neon.h: In function 'search_neon_4'
> > > > > ../lib/librte_acl/acl_run_neon.h:230:12: error: 'input' may be
> > > > > used uninitialized in this function [-Werror=maybe-uninitialized]
> > > > >   int32x4_t input;
> > > > >
> > > > > Fixes: 34fa6c27c156 ("acl: add NEON optimization for ARMv8")
> > > > > Cc: stable@dpdk.org
> > > > >
> > > > > Signed-off-by: Jerin Jacob <jerinj@marvell.com>
> > > > > ---
> > > > >  lib/librte_acl/acl_run_neon.h | 29
> > > > > ++++++++++++-----------------
> > > > >  1 file changed, 12 insertions(+), 17 deletions(-)
> > > > >
> > > > > diff --git a/lib/librte_acl/acl_run_neon.h
> > > > > b/lib/librte_acl/acl_run_neon.h index 01b9766d8..dc9e9efe9
> > > > > 100644
> > > > > --- a/lib/librte_acl/acl_run_neon.h
> > > > > +++ b/lib/librte_acl/acl_run_neon.h
> > > > > @@ -165,7 +165,6 @@ search_neon_8(const struct rte_acl_ctx *ctx,
> > > > > const uint8_t **data,
> > > > >  	uint64_t index_array[8];
> > > > >  	struct completion cmplt[8];
> > > > >  	struct parms parms[8];
> > > > > -	int32x4_t input0, input1;
> > > > >
> > > > >  	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
> > > > >  		     total_packets, categories, ctx->trans_table); @@ -181,17
> > > > > +180,14 @@ search_neon_8(const struct rte_acl_ctx *ctx, const
> > > > > +uint8_t
> > > > > **data,
> > > > >
> > > > >  	while (flows.started > 0) {
> > > > >  		/* Gather 4 bytes of input data for each stream. */
> > > > > -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms,
> 0),
> > > > > input0, 0);
> > > > > -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms,
> 4),
> > > > > input1, 0);
> > > > > -
> > > > > -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms,
> 1),
> > > > > input0, 1);
> > > > > -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms,
> 5),
> > > > > input1, 1);
> > > > > -
> > > > > -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms,
> 2),
> > > > > input0, 2);
> > > > > -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms,
> 6),
> > > > > input1, 2);
> > > > > -
> > > > > -		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms,
> 3),
> > > > > input0, 3);
> > > > > -		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms,
> 7),
> > > > > input1, 3);
> > > > > +		int32x4_t input0 = {GET_NEXT_4BYTES(parms, 0),
> > > > > +				    GET_NEXT_4BYTES(parms, 1),
> > > > > +				    GET_NEXT_4BYTES(parms, 2),
> > > > > +				    GET_NEXT_4BYTES(parms, 3)};
> > > > > +		int32x4_t input1 = {GET_NEXT_4BYTES(parms, 4),
> > > > > +				    GET_NEXT_4BYTES(parms, 5),
> > > > > +				    GET_NEXT_4BYTES(parms, 6),
> > > > > +				    GET_NEXT_4BYTES(parms, 7)};
> > > > >
> > > > This mixes the use of NEON intrinsics with GCC vector extensions.
> > > > ACLE (Arm C Language Extensions) specifically recommends not to
> > > > mix the two methods in section 12.2.6. IMO, Aaron's suggestion of
> > > > using a temp vector
> > > should be good.
> > >
> > > We are using this pattern across DPDK and SSE for x86 as well.
> > > https://git.dpdk.org/dpdk/tree/drivers/net/i40e/i40e_rxtx_vec_neon.c
> > > #n
> > > 91
> > I am not sure about x86, I have not looked at a document similar to
> > ACLE for x86. IMO, it is not relevant here as this is Arm specific code.
> 
> What I meant was its been already used in DPDK for arm64.
> https://git.dpdk.org/dpdk/tree/drivers/net/i40e/i40e_rxtx_vec_neon.c#n91
Ok, got it. I have had discussion with compiler folks at Arm with mixing vector programming models and the recommendation has been to use NEON exclusively. I have had this discussion with Marvel compiler folks too some time back.

> 
> Please see offial page vector gcc gcc documentation. The examples are using
> this scheme.
> https://gcc.gnu.org/onlinedocs/gcc/Vector-Extensions.html
> 
> This is to just create 'input' variable. I am fine to use any other scheme with
> out additional cost of instructions.
> 
> >
> > >
> > > Since it used in fastpath, a temp variable would be additional cost
> > > for no reason.
> > Then, I would suggest we can go with using 'vdupq_n_s32'.
> 
> We have to form uint64x2_t with 4 x uint32_t variable, How does
> 'vdupq_n_s32' help here?
We would use 'vdupq_n_s32' only for the first initialization, the rest of the code remains the same (see the diff below)

> Can you share code snippet without any temp variable?
diff --git a/lib/librte_acl/acl_run_neon.h b/lib/librte_acl/acl_run_neon.h
index 01b9766d8..b3196cd12 100644
--- a/lib/librte_acl/acl_run_neon.h
+++ b/lib/librte_acl/acl_run_neon.h
@@ -181,8 +181,8 @@ search_neon_8(const struct rte_acl_ctx *ctx, const uint8_t **data,

        while (flows.started > 0) {
                /* Gather 4 bytes of input data for each stream. */
-               input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input0, 0);
-               input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 4), input1, 0);
+               input0 = vdupq_n_s32(GET_NEXT_4BYTES(parms, 0));
+               input1 = vdupq_n_s32(GET_NEXT_4BYTES(parms, 4));

                input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input0, 1);
                input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 5), input1, 1);
@@ -242,7 +242,7 @@ search_neon_4(const struct rte_acl_ctx *ctx, const uint8_t **data,

        while (flows.started > 0) {
                /* Gather 4 bytes of input data for each stream. */
-               input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input, 0);
+               input = vdupq_n_s32(GET_NEXT_4BYTES(parms, 0));
                input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input, 1);
                input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input, 2);
                input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input, 3);

My understanding is that the generated code for both your patch and my changes above is the same. Above suggested changes will conform to ACLE recommendation.

> 
> >
> > > If GCC supports it then I think it is fine, I think, above usage
> > > matters with C++ portability.
> > I did not understand the C++ portability part. Can you elaborate more?
> >
> > >
> > >
> > > >
> > > > >  		/* Process the 4 bytes of input on each stream. */
> > > > >
> > > > > @@ -227,7 +223,6 @@ search_neon_4(const struct rte_acl_ctx *ctx,
> > > > > const uint8_t **data,
> > > > >  	uint64_t index_array[4];
> > > > >  	struct completion cmplt[4];
> > > > >  	struct parms parms[4];
> > > > > -	int32x4_t input;
> > > > >
> > > > >  	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
> > > > >  		     total_packets, categories, ctx->trans_table); @@ -242,10
> > > > > +237,10 @@ search_neon_4(const struct rte_acl_ctx *ctx, const
> > > > > +uint8_t
> > > > > **data,
> > > > >
> > > > >  	while (flows.started > 0) {
> > > > >  		/* Gather 4 bytes of input data for each stream. */
> > > > > -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0),
> input,
> > > > > 0);
> > > > > -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1),
> input,
> > > > > 1);
> > > > > -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2),
> input,
> > > > > 2);
> > > > > -		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3),
> input,
> > > > > 3);
> > > > > +		int32x4_t input = {GET_NEXT_4BYTES(parms, 0),
> > > > > +				   GET_NEXT_4BYTES(parms, 1),
> > > > > +				   GET_NEXT_4BYTES(parms, 2),
> > > > > +				   GET_NEXT_4BYTES(parms, 3)};
> > > > >
> > > > >  		/* Process the 4 bytes of input on each stream. */
> > > > >  		input = transition4(input, flows.trans, index_array);
> > > > > --
> > > > > 2.21.0
  
Jerin Jacob Kollanukkaran June 11, 2019, 2:24 p.m. UTC | #9
> -----Original Message-----
> From: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Sent: Tuesday, June 11, 2019 6:58 AM
> To: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>; msantana@redhat.com; aconole@redhat.com;
> stable@dpdk.org; Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>;
> nd <nd@arm.com>; nd <nd@arm.com>
> Subject: [EXT] RE: [dpdk-dev] [PATCH] acl: fix build issue with some arm64
> compiler
> 
> > >
> > > >
> > > > Since it used in fastpath, a temp variable would be additional
> > > > cost for no reason.
> > > Then, I would suggest we can go with using 'vdupq_n_s32'.
> >
> > We have to form uint64x2_t with 4 x uint32_t variable, How does
> > 'vdupq_n_s32' help here?
> We would use 'vdupq_n_s32' only for the first initialization, the rest of the code
> remains the same (see the diff below)
> 
> > Can you share code snippet without any temp variable?
> diff --git a/lib/librte_acl/acl_run_neon.h b/lib/librte_acl/acl_run_neon.h index
> 01b9766d8..b3196cd12 100644
> --- a/lib/librte_acl/acl_run_neon.h
> +++ b/lib/librte_acl/acl_run_neon.h
> @@ -181,8 +181,8 @@ search_neon_8(const struct rte_acl_ctx *ctx, const
> uint8_t **data,
> 
>         while (flows.started > 0) {
>                 /* Gather 4 bytes of input data for each stream. */
> -               input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input0, 0);
> -               input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 4), input1, 0);
> +               input0 = vdupq_n_s32(GET_NEXT_4BYTES(parms, 0));
> +               input1 = vdupq_n_s32(GET_NEXT_4BYTES(parms, 4));
> 
>                 input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input0, 1);
>                 input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 5), input1, 1); @@ -
> 242,7 +242,7 @@ search_neon_4(const struct rte_acl_ctx *ctx, const uint8_t
> **data,
> 
>         while (flows.started > 0) {
>                 /* Gather 4 bytes of input data for each stream. */
> -               input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input, 0);
> +               input = vdupq_n_s32(GET_NEXT_4BYTES(parms, 0));
>                 input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input, 1);
>                 input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input, 2);
>                 input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input, 3);
> 
> My understanding is that the generated code for both your patch and my
> changes above is the same. Above suggested changes will conform to ACLE
> recommendation.

Though instructions are different. Effective cycles are same even though
First dup updates the four positions.
To make forward progress send the v2 based on the updated logic
 just to make ACLE  Spec happy, I don’t see any real reason to do it though 😊

http://patches.dpdk.org/patch/54656/
  
Honnappa Nagarahalli June 11, 2019, 7:48 p.m. UTC | #10
Reduced the CC list (changing the topic slightly)

> >
> > My understanding is that the generated code for both your patch and my
> > changes above is the same. Above suggested changes will conform to
> > ACLE recommendation.
> 
> Though instructions are different. Effective cycles are same even though First
> dup updates the four positions.
Can you elaborate on how the instructions are different?
I wrote the following code with both the methods:

uint32x4_t u32x4_gather_gcc (uint32_t *p0, uint32_t *p1, uint32_t *p2, uint32_t *p3)
{
     uint32x4_t r = {*p0, *p1, *p2, *p3};

     return r;
}

uint32x4_t u32x4_gather_acle (uint32_t *p0, uint32_t *p1, uint32_t *p2, uint32_t *p3)
{
     uint32x4_t r;

     r = vdupq_n_u32 (* p0);
     r = vsetq_lane_u32 (*p1, r, 1);
     r = vsetq_lane_u32 (*p2, r, 2);
     r = vsetq_lane_u32 (*p3, r, 3);

     return r;
}

The generated code has the same instructions for both (omitted the unwanted parts):

u32x4_gather_gcc:
        ld1r    {v0.4s}, [x0]
        ld1     {v0.s}[1], [x1]
        ld1     {v0.s}[2], [x2]
        ld1     {v0.s}[3], [x3]
        ret

u32x4_gather_acle:
        ld1r    {v0.4s}, [x0]
        ld1     {v0.s}[1], [x1]
        ld1     {v0.s}[2], [x2]
        ld1     {v0.s}[3], [x3]
        ret

The first 'ld1r' updates all the lanes in both the cases.

> To make forward progress send the v2 based on the updated logic  just to
> make ACLE  Spec happy, I don’t see any real reason to do it though 😊
Thanks for the patch, it was important to make forward progress.
But, I think we should carry forward the discussion as I plan to change other parts of DPDK on similar lines. I want to understand why you think there is no real reason. The ACLE recommendation mentions the reasoning.

> 
> http://patches.dpdk.org/patch/54656/
>
  
Jerin Jacob Kollanukkaran June 12, 2019, 2:41 a.m. UTC | #11
> -----Original Message-----
> From: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Sent: Wednesday, June 12, 2019 1:18 AM
> To: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>; nd <nd@arm.com>; nd <nd@arm.com>
> Subject: [EXT] RE: [dpdk-dev] [PATCH] acl: fix build issue with some arm64
> compiler
> 
> Reduced the CC list (changing the topic slightly)
> 
> > >
> > > My understanding is that the generated code for both your patch and
> > > my changes above is the same. Above suggested changes will conform
> > > to ACLE recommendation.
> >
> > Though instructions are different. Effective cycles are same even
> > though First dup updates the four positions.
> Can you elaborate on how the instructions are different?
> I wrote the following code with both the methods:
> 
> uint32x4_t u32x4_gather_gcc (uint32_t *p0, uint32_t *p1, uint32_t *p2,
> uint32_t *p3) {
>      uint32x4_t r = {*p0, *p1, *p2, *p3};
> 
>      return r;
> }
> 
> uint32x4_t u32x4_gather_acle (uint32_t *p0, uint32_t *p1, uint32_t *p2,
> uint32_t *p3) {
>      uint32x4_t r;
> 
>      r = vdupq_n_u32 (* p0);
>      r = vsetq_lane_u32 (*p1, r, 1);
>      r = vsetq_lane_u32 (*p2, r, 2);
>      r = vsetq_lane_u32 (*p3, r, 3);
> 
>      return r;
> }
> 
> The generated code has the same instructions for both (omitted the unwanted
> parts):
> 
> u32x4_gather_gcc:
>         ld1r    {v0.4s}, [x0]
>         ld1     {v0.s}[1], [x1]
>         ld1     {v0.s}[2], [x2]
>         ld1     {v0.s}[3], [x3]
>         ret
> 
> u32x4_gather_acle:
>         ld1r    {v0.4s}, [x0]
>         ld1     {v0.s}[1], [x1]
>         ld1     {v0.s}[2], [x2]
>         ld1     {v0.s}[3], [x3]
>         ret
> 
> The first 'ld1r' updates all the lanes in both the cases.


Please check actual generated code for ACL case. We can see difference
 0x00000000005cc1dc <+1884>:  80 6a 65 bc     ldr     s0, [x20, x5]
vs
  0x00000000005cc1dc <+1884>:  9e 6a 65 b8     ldr     w30, [x20, x5]

With patch:

244                     /* Gather 4 bytes of input data for each stream. */
245                     input = vdupq_n_s32(GET_NEXT_4BYTES(parms, 0));
   0x00000000005cc1c8 <+1864>:  b4 4f 46 a9     ldp     x20, x19, [x29, #96]
   0x00000000005cc1d8 <+1880>:  65 02 40 b9     ldr     w5, [x19]
   0x00000000005cc1dc <+1884>:  80 6a 65 bc     ldr     s0, [x20, x5]
   0x00000000005cc26c <+2028>:  73 12 00 91     add     x19, x19, #0x4
   0x00000000005cc2ac <+2092>:  b3 37 00 f9     str     x19, [x29, #104]

246                     input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input, 1);
   0x00000000005cc1d0 <+1872>:  a6 9f 47 a9     ldp     x6, x7, [x29, #120]
   0x00000000005cc1ec <+1900>:  e5 00 40 b9     ldr     w5, [x7]
   0x00000000005cc1f0 <+1904>:  d6 68 65 b8     ldr     w22, [x6, x5]
   0x00000000005cc21c <+1948>:  e7 10 00 91     add     x7, x7, #0x4
   0x00000000005cc260 <+2016>:  a7 43 00 f9     str     x7, [x29, #128]

247                     input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input, 2);
   0x00000000005cc1d4 <+1876>:  b5 4b 40 f9     ldr     x21, [x29, #144]
   0x00000000005cc1f4 <+1908>:  a6 4f 40 f9     ldr     x6, [x29, #152]
   0x00000000005cc1f8 <+1912>:  d4 00 40 b9     ldr     w20, [x6]
   0x00000000005cc1fc <+1916>:  b5 6a 74 b8     ldr     w21, [x21, x20]
   0x00000000005cc224 <+1956>:  c6 10 00 91     add     x6, x6, #0x4
   0x00000000005cc264 <+2020>:  a6 4f 00 f9     str     x6, [x29, #152]

248                     input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input, 3);
   0x00000000005cc200 <+1920>:  a5 5b 40 f9     ldr     x5, [x29, #176]
   0x00000000005cc204 <+1924>:  b4 00 40 b9     ldr     w20, [x5]
   0x00000000005cc208 <+1928>:  a5 10 00 91     add     x5, x5, #0x4
   0x00000000005cc218 <+1944>:  b7 57 40 f9     ldr     x23, [x29, #168]
   0x00000000005cc220 <+1952>:  f4 6a 74 b8     ldr     w20, [x23, x20]
   0x00000000005cc228 <+1960>:  a5 5b 00 f9     str     x5, [x29, #176]
   
With out patch:
   
   245                     input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input, 0);
   0x00000000005cc1c8 <+1864>:  b4 4f 46 a9     ldp     x20, x19, [x29, #96]
   0x00000000005cc1d8 <+1880>:  65 02 40 b9     ldr     w5, [x19]
   0x00000000005cc1dc <+1884>:  9e 6a 65 b8     ldr     w30, [x20, x5]
   0x00000000005cc248 <+1992>:  73 12 00 91     add     x19, x19, #0x4
   0x00000000005cc24c <+1996>:  b3 37 00 f9     str     x19, [x29, #104]

246                     input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input, 1);
   0x00000000005cc1d0 <+1872>:  a6 9f 47 a9     ldp     x6, x7, [x29, #120]
   0x00000000005cc1ec <+1900>:  e5 00 40 b9     ldr     w5, [x7]
   0x00000000005cc1f0 <+1904>:  d6 68 65 b8     ldr     w22, [x6, x5]
   0x00000000005cc228 <+1960>:  e7 10 00 91     add     x7, x7, #0x4
   0x00000000005cc240 <+1984>:  a7 43 00 f9     str     x7, [x29, #128]

247                     input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input, 2);
   0x00000000005cc1d4 <+1876>:  b5 4b 40 f9     ldr     x21, [x29, #144]
   0x00000000005cc1f4 <+1908>:  a6 4f 40 f9     ldr     x6, [x29, #152]
   0x00000000005cc1f8 <+1912>:  d4 00 40 b9     ldr     w20, [x6]
   0x00000000005cc1fc <+1916>:  b5 6a 74 b8     ldr     w21, [x21, x20]
   0x00000000005cc22c <+1964>:  c6 10 00 91     add     x6, x6, #0x4
   0x00000000005cc244 <+1988>:  a6 4f 00 f9     str     x6, [x29, #152]

248                     input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input, 3);
   0x00000000005cc200 <+1920>:  a5 5b 40 f9     ldr     x5, [x29, #176]
   0x00000000005cc204 <+1924>:  b4 00 40 b9     ldr     w20, [x5]
   0x00000000005cc208 <+1928>:  a5 10 00 91     add     x5, x5, #0x4
   0x00000000005cc21c <+1948>:  b7 57 40 f9     ldr     x23, [x29, #168]
   0x00000000005cc224 <+1956>:  f4 6a 74 b8     ldr     w20, [x23, x20]
   0x00000000005cc230 <+1968>:  a5 5b 00 f9     str     x5, [x29, #176]




> 
> > To make forward progress send the v2 based on the updated logic  just
> > to make ACLE  Spec happy, I don’t see any real reason to do it though
> > 😊
> Thanks for the patch, it was important to make forward progress.
> But, I think we should carry forward the discussion as I plan to change other
> parts of DPDK on similar lines. I want to understand why you think there is no
> real reason. The ACLE recommendation mentions the reasoning.

# I see following in the ACLE spec. What is the actual reasoning? 
"
ACLE does not define static construction of vector types. E.g.
 int32x4_t x = { 1, 2, 3, 4 };
Is not portable. Use the vcreate or vdup intrinsics to construct values from scalars.
"

# Why does compiler(gcc) allows if it not indented to use? 

# I think, it may be time to introduce UndefinedBehaviorSanitizer (UBSan)
Gcc feature to DPDK to detect undefined behavior checks to detect such case

>

> >
> > http://patches.dpdk.org/patch/54656/
> >
  
Honnappa Nagarahalli June 17, 2019, 12:48 a.m. UTC | #12
> >
> > Reduced the CC list (changing the topic slightly)
> >
> > > >
> > > > My understanding is that the generated code for both your patch
> > > > and my changes above is the same. Above suggested changes will
> > > > conform to ACLE recommendation.
> > >
> > > Though instructions are different. Effective cycles are same even
> > > though First dup updates the four positions.
> > Can you elaborate on how the instructions are different?
> > I wrote the following code with both the methods:
> >
> > uint32x4_t u32x4_gather_gcc (uint32_t *p0, uint32_t *p1, uint32_t *p2,
> > uint32_t *p3) {
> >      uint32x4_t r = {*p0, *p1, *p2, *p3};
> >
> >      return r;
> > }
> >
> > uint32x4_t u32x4_gather_acle (uint32_t *p0, uint32_t *p1, uint32_t
> > *p2, uint32_t *p3) {
> >      uint32x4_t r;
> >
> >      r = vdupq_n_u32 (* p0);
> >      r = vsetq_lane_u32 (*p1, r, 1);
> >      r = vsetq_lane_u32 (*p2, r, 2);
> >      r = vsetq_lane_u32 (*p3, r, 3);
> >
> >      return r;
> > }
> >
> > The generated code has the same instructions for both (omitted the
> > unwanted
> > parts):
> >
> > u32x4_gather_gcc:
> >         ld1r    {v0.4s}, [x0]
> >         ld1     {v0.s}[1], [x1]
> >         ld1     {v0.s}[2], [x2]
> >         ld1     {v0.s}[3], [x3]
> >         ret
> >
> > u32x4_gather_acle:
> >         ld1r    {v0.4s}, [x0]
> >         ld1     {v0.s}[1], [x1]
> >         ld1     {v0.s}[2], [x2]
> >         ld1     {v0.s}[3], [x3]
> >         ret
> >
> > The first 'ld1r' updates all the lanes in both the cases.
> 
> 
> Please check actual generated code for ACL case. We can see difference
I think there is something wrong with the way you are looking at the generated code. Please see comments below.

>  0x00000000005cc1dc <+1884>:  80 6a 65 bc     ldr     s0, [x20, x5]
> vs
>   0x00000000005cc1dc <+1884>:  9e 6a 65 b8     ldr     w30, [x20, x5]
The register W30 is a scalar register.

> 
> With patch:
> 
> 244                     /* Gather 4 bytes of input data for each stream. */
> 245                     input = vdupq_n_s32(GET_NEXT_4BYTES(parms, 0));
>    0x00000000005cc1c8 <+1864>:  b4 4f 46 a9     ldp     x20, x19, [x29, #96]
>    0x00000000005cc1d8 <+1880>:  65 02 40 b9     ldr     w5, [x19]
>    0x00000000005cc1dc <+1884>:  80 6a 65 bc     ldr     s0, [x20, x5]
>    0x00000000005cc26c <+2028>:  73 12 00 91     add     x19, x19, #0x4
>    0x00000000005cc2ac <+2092>:  b3 37 00 f9     str     x19, [x29, #104]
> 
> 246                     input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input,
This one and below ones are not containing any vector instructions.

> 1);
>    0x00000000005cc1d0 <+1872>:  a6 9f 47 a9     ldp     x6, x7, [x29, #120]
>    0x00000000005cc1ec <+1900>:  e5 00 40 b9     ldr     w5, [x7]
>    0x00000000005cc1f0 <+1904>:  d6 68 65 b8     ldr     w22, [x6, x5]
>    0x00000000005cc21c <+1948>:  e7 10 00 91     add     x7, x7, #0x4
>    0x00000000005cc260 <+2016>:  a7 43 00 f9     str     x7, [x29, #128]
> 
> 247                     input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input,
> 2);
>    0x00000000005cc1d4 <+1876>:  b5 4b 40 f9     ldr     x21, [x29, #144]
>    0x00000000005cc1f4 <+1908>:  a6 4f 40 f9     ldr     x6, [x29, #152]
>    0x00000000005cc1f8 <+1912>:  d4 00 40 b9     ldr     w20, [x6]
>    0x00000000005cc1fc <+1916>:  b5 6a 74 b8     ldr     w21, [x21, x20]
>    0x00000000005cc224 <+1956>:  c6 10 00 91     add     x6, x6, #0x4
>    0x00000000005cc264 <+2020>:  a6 4f 00 f9     str     x6, [x29, #152]
> 
> 248                     input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input,
> 3);
>    0x00000000005cc200 <+1920>:  a5 5b 40 f9     ldr     x5, [x29, #176]
>    0x00000000005cc204 <+1924>:  b4 00 40 b9     ldr     w20, [x5]
>    0x00000000005cc208 <+1928>:  a5 10 00 91     add     x5, x5, #0x4
>    0x00000000005cc218 <+1944>:  b7 57 40 f9     ldr     x23, [x29, #168]
>    0x00000000005cc220 <+1952>:  f4 6a 74 b8     ldr     w20, [x23, x20]
>    0x00000000005cc228 <+1960>:  a5 5b 00 f9     str     x5, [x29, #176]
> 
> With out patch:
This generated code does not contain any vector instructions. Can you please check?
I changed the code to be similar to ACL code, please look at [1], the generated code is the same.

[1] https://gcc.godbolt.org/z/p1sQNA

> 
>    245                     input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input,
> 0);
>    0x00000000005cc1c8 <+1864>:  b4 4f 46 a9     ldp     x20, x19, [x29, #96]
>    0x00000000005cc1d8 <+1880>:  65 02 40 b9     ldr     w5, [x19]
>    0x00000000005cc1dc <+1884>:  9e 6a 65 b8     ldr     w30, [x20, x5]
>    0x00000000005cc248 <+1992>:  73 12 00 91     add     x19, x19, #0x4
>    0x00000000005cc24c <+1996>:  b3 37 00 f9     str     x19, [x29, #104]
> 
> 246                     input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input,
> 1);
>    0x00000000005cc1d0 <+1872>:  a6 9f 47 a9     ldp     x6, x7, [x29, #120]
>    0x00000000005cc1ec <+1900>:  e5 00 40 b9     ldr     w5, [x7]
>    0x00000000005cc1f0 <+1904>:  d6 68 65 b8     ldr     w22, [x6, x5]
>    0x00000000005cc228 <+1960>:  e7 10 00 91     add     x7, x7, #0x4
>    0x00000000005cc240 <+1984>:  a7 43 00 f9     str     x7, [x29, #128]
> 
> 247                     input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input,
> 2);
>    0x00000000005cc1d4 <+1876>:  b5 4b 40 f9     ldr     x21, [x29, #144]
>    0x00000000005cc1f4 <+1908>:  a6 4f 40 f9     ldr     x6, [x29, #152]
>    0x00000000005cc1f8 <+1912>:  d4 00 40 b9     ldr     w20, [x6]
>    0x00000000005cc1fc <+1916>:  b5 6a 74 b8     ldr     w21, [x21, x20]
>    0x00000000005cc22c <+1964>:  c6 10 00 91     add     x6, x6, #0x4
>    0x00000000005cc244 <+1988>:  a6 4f 00 f9     str     x6, [x29, #152]
> 
> 248                     input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input,
> 3);
>    0x00000000005cc200 <+1920>:  a5 5b 40 f9     ldr     x5, [x29, #176]
>    0x00000000005cc204 <+1924>:  b4 00 40 b9     ldr     w20, [x5]
>    0x00000000005cc208 <+1928>:  a5 10 00 91     add     x5, x5, #0x4
>    0x00000000005cc21c <+1948>:  b7 57 40 f9     ldr     x23, [x29, #168]
>    0x00000000005cc224 <+1956>:  f4 6a 74 b8     ldr     w20, [x23, x20]
>    0x00000000005cc230 <+1968>:  a5 5b 00 f9     str     x5, [x29, #176]
> 
> 
> >
> > > To make forward progress send the v2 based on the updated logic
> > > just to make ACLE  Spec happy, I don’t see any real reason to do it
> > > though
> > > 😊
> > Thanks for the patch, it was important to make forward progress.
> > But, I think we should carry forward the discussion as I plan to
> > change other parts of DPDK on similar lines. I want to understand why
> > you think there is no real reason. The ACLE recommendation mentions the
> reasoning.
> 
> # I see following in the ACLE spec. What is the actual reasoning?
> "
> ACLE does not define static construction of vector types. E.g.
>  int32x4_t x = { 1, 2, 3, 4 };
> Is not portable. Use the vcreate or vdup intrinsics to construct values from
> scalars.
> "
Here is the complete text from ACLE 2.1

12.2.6 Compatibility with other vector programming models
Programmers should take particular care when combining the Neon Intrinsics API with alternative vector programming models; ACLE does not specify how the NEON Intrinsics API interoperates with them.
For instance, the GCC vector extension permits
include “arm_neon.h”
...
uint32x2_t x = {0, 1}; // GCC extension.
uint32_t y = vget_lane_s32 (x, 0); // ACLE NEON Intrinsic.
But with this code the value stored in ‘y’ will depend on both the target architecture (AArch32 or AArch64) and whether the program is running in big- or little-endian mode.
It is recommended that NEON Intrinsics be used consistently:
include “arm_neon.h”
...
const int temp[2] = {0, 1};
uint32x2_t x = vld1_s32 (temp);
uint32_t y = vget_lane_s32 (x, 0);

> 
> # Why does compiler(gcc) allows if it not indented to use?
I do not have an answer. This is a recommendation and all that I am trying to say is, following the recommendation does not cost us anything in performance.

> 
> # I think, it may be time to introduce UndefinedBehaviorSanitizer (UBSan)
> Gcc feature to DPDK to detect undefined behavior checks to detect such case
I am not sure if it helps here.

> 
> >
> 
> > >
> > > http://patches.dpdk.org/patch/54656/
> > >
  
Jerin Jacob Kollanukkaran June 17, 2019, 6:52 a.m. UTC | #13
> -----Original Message-----
> From: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Sent: Monday, June 17, 2019 6:19 AM
> To: Jerin Jacob Kollanukkaran <jerinj@marvell.com>; dev@dpdk.org
> Cc: thomas@monjalon.net; Gavin Hu (Arm Technology China)
> <Gavin.Hu@arm.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>; nd <nd@arm.com>; nd <nd@arm.com>
> Subject: [EXT] RE: [dpdk-dev] [PATCH] acl: fix build issue with some arm64
> compiler
> 
> External Email
> 
> ----------------------------------------------------------------------
> > >
> > > Reduced the CC list (changing the topic slightly)
> > >
> > > > >
> > > > > My understanding is that the generated code for both your patch
> > > > > and my changes above is the same. Above suggested changes will
> > > > > conform to ACLE recommendation.
> > > >
> > > > Though instructions are different. Effective cycles are same even
> > > > though First dup updates the four positions.
> > > Can you elaborate on how the instructions are different?
> > > I wrote the following code with both the methods:
> > >
> > > uint32x4_t u32x4_gather_gcc (uint32_t *p0, uint32_t *p1, uint32_t
> > > *p2, uint32_t *p3) {
> > >      uint32x4_t r = {*p0, *p1, *p2, *p3};
> > >
> > >      return r;
> > > }
> > >
> > > uint32x4_t u32x4_gather_acle (uint32_t *p0, uint32_t *p1, uint32_t
> > > *p2, uint32_t *p3) {
> > >      uint32x4_t r;
> > >
> > >      r = vdupq_n_u32 (* p0);
> > >      r = vsetq_lane_u32 (*p1, r, 1);
> > >      r = vsetq_lane_u32 (*p2, r, 2);
> > >      r = vsetq_lane_u32 (*p3, r, 3);
> > >
> > >      return r;
> > > }
> > >
> > > The generated code has the same instructions for both (omitted the
> > > unwanted
> > > parts):
> > >
> > > u32x4_gather_gcc:
> > >         ld1r    {v0.4s}, [x0]
> > >         ld1     {v0.s}[1], [x1]
> > >         ld1     {v0.s}[2], [x2]
> > >         ld1     {v0.s}[3], [x3]
> > >         ret
> > >
> > > u32x4_gather_acle:
> > >         ld1r    {v0.4s}, [x0]
> > >         ld1     {v0.s}[1], [x1]
> > >         ld1     {v0.s}[2], [x2]
> > >         ld1     {v0.s}[3], [x3]
> > >         ret
> > >
> > > The first 'ld1r' updates all the lanes in both the cases.
> >
> >
> > Please check actual generated code for ACL case. We can see difference
> I think there is something wrong with the way you are looking at the
> generated code. Please see comments below.

I am generating the dis assembly like below.
gdb -batch -ex 'file build/app/test ' -ex 'disassemble /rm search_neon_4'

You can try it out.

> 
> > > > To make forward progress send the v2 based on the updated logic
> > > > just to make ACLE  Spec happy, I don’t see any real reason to do
> > > > it though
> > > > 😊
> > > Thanks for the patch, it was important to make forward progress.
> > > But, I think we should carry forward the discussion as I plan to
> > > change other parts of DPDK on similar lines. I want to understand
> > > why you think there is no real reason. The ACLE recommendation
> > > mentions the
> > reasoning.
> >
> > # I see following in the ACLE spec. What is the actual reasoning?
> > "
> > ACLE does not define static construction of vector types. E.g.
> >  int32x4_t x = { 1, 2, 3, 4 };
> > Is not portable. Use the vcreate or vdup intrinsics to construct
> > values from scalars.
> > "
> Here is the complete text from ACLE 2.1
> 
> 12.2.6 Compatibility with other vector programming models Programmers
> should take particular care when combining the Neon Intrinsics API with
> alternative vector programming models; ACLE does not specify how the
> NEON Intrinsics API interoperates with them.
> For instance, the GCC vector extension permits include “arm_neon.h”
> ...
> uint32x2_t x = {0, 1}; // GCC extension.
> uint32_t y = vget_lane_s32 (x, 0); // ACLE NEON Intrinsic.
> But with this code the value stored in ‘y’ will depend on both the target
> architecture (AArch32 or AArch64) and whether the program is running in
> big- or little-endian mode.

I don’t have a big endian machine to test. I would be interesting to see 
The output in bigendian. 

> It is recommended that NEON Intrinsics be used consistently:
> include “arm_neon.h”
> ...
> const int temp[2] = {0, 1};
> uint32x2_t x = vld1_s32 (temp);
> uint32_t y = vget_lane_s32 (x, 0);
> 
> >
> > # Why does compiler(gcc) allows if it not indented to use?
> I do not have an answer. This is a recommendation and all that I am trying to
> say is, following the recommendation does not cost us anything in
> performance.

If there is no performance regression then no issue in changing to this format.

> 
> >
> > # I think, it may be time to introduce UndefinedBehaviorSanitizer
> > (UBSan) Gcc feature to DPDK to detect undefined behavior checks to
> > detect such case
> I am not sure if it helps here.
> 
> >
> > >
> >
> > > >
> > > > http://patches.dpdk.org/patch/54656/
> > > >
  

Patch

diff --git a/lib/librte_acl/acl_run_neon.h b/lib/librte_acl/acl_run_neon.h
index 01b9766d8..dc9e9efe9 100644
--- a/lib/librte_acl/acl_run_neon.h
+++ b/lib/librte_acl/acl_run_neon.h
@@ -165,7 +165,6 @@  search_neon_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	uint64_t index_array[8];
 	struct completion cmplt[8];
 	struct parms parms[8];
-	int32x4_t input0, input1;
 
 	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
 		     total_packets, categories, ctx->trans_table);
@@ -181,17 +180,14 @@  search_neon_8(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
 	while (flows.started > 0) {
 		/* Gather 4 bytes of input data for each stream. */
-		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input0, 0);
-		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 4), input1, 0);
-
-		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input0, 1);
-		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 5), input1, 1);
-
-		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input0, 2);
-		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 6), input1, 2);
-
-		input0 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input0, 3);
-		input1 = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 7), input1, 3);
+		int32x4_t input0 = {GET_NEXT_4BYTES(parms, 0),
+				    GET_NEXT_4BYTES(parms, 1),
+				    GET_NEXT_4BYTES(parms, 2),
+				    GET_NEXT_4BYTES(parms, 3)};
+		int32x4_t input1 = {GET_NEXT_4BYTES(parms, 4),
+				    GET_NEXT_4BYTES(parms, 5),
+				    GET_NEXT_4BYTES(parms, 6),
+				    GET_NEXT_4BYTES(parms, 7)};
 
 		/* Process the 4 bytes of input on each stream. */
 
@@ -227,7 +223,6 @@  search_neon_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
 	uint64_t index_array[4];
 	struct completion cmplt[4];
 	struct parms parms[4];
-	int32x4_t input;
 
 	acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
 		     total_packets, categories, ctx->trans_table);
@@ -242,10 +237,10 @@  search_neon_4(const struct rte_acl_ctx *ctx, const uint8_t **data,
 
 	while (flows.started > 0) {
 		/* Gather 4 bytes of input data for each stream. */
-		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 0), input, 0);
-		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 1), input, 1);
-		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 2), input, 2);
-		input = vsetq_lane_s32(GET_NEXT_4BYTES(parms, 3), input, 3);
+		int32x4_t input = {GET_NEXT_4BYTES(parms, 0),
+				   GET_NEXT_4BYTES(parms, 1),
+				   GET_NEXT_4BYTES(parms, 2),
+				   GET_NEXT_4BYTES(parms, 3)};
 
 		/* Process the 4 bytes of input on each stream. */
 		input = transition4(input, flows.trans, index_array);